diff --git a/packages/kokkos/.gitignore b/packages/kokkos/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..de4b66da3c35d938ad9e787feb80555821781f55
--- /dev/null
+++ b/packages/kokkos/.gitignore
@@ -0,0 +1,10 @@
+# Standard ignores
+*~
+*.pyc
+\#*#
+.#*
+.*.swp
+.cproject
+.project
+testing/
+.settings/
diff --git a/packages/kokkos/.gitrepo b/packages/kokkos/.gitrepo
new file mode 100644
index 0000000000000000000000000000000000000000..d5768a6febde5b5da70670403c08ca1fb24be98e
--- /dev/null
+++ b/packages/kokkos/.gitrepo
@@ -0,0 +1,11 @@
+; DO NOT EDIT (unless you know what you are doing)
+;
+; This subdirectory is a git "subrepo", and this file is maintained by the
+; git-subrepo command. See https://github.com/git-commands/git-subrepo#readme
+;
+[subrepo]
+	remote = git@github.com:kokkos/kokkos.git
+	branch = master
+	commit = e01945d0947f47e579468f325e4d97446453ad40
+	parent = 9e56a186c3c467c8807f077e8bb3beba0fffd6c3
+	cmdver = 0.3.1
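The [subrepo] block above is machine-maintained: git-subrepo rewrites the commit and parent fields whenever the snapshot is refreshed, so the file should not be edited by hand. As a rough sketch only (assuming git-subrepo ~0.3 is installed and the command is run from the top of the Trilinos clone; the documented Trilinos workflow in HOW_TO_SNAPSHOT below uses scripts/snapshot.py instead), pulling newer upstream Kokkos commits into this subdirectory would look like:

	# hypothetical refresh of the snapshot via git-subrepo
	cd ${TRILINOS}                      # top level of the Trilinos clone
	git subrepo pull packages/kokkos    # fetch kokkos/kokkos master, update .gitrepo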
diff --git a/packages/kokkos/.travis.yml b/packages/kokkos/.travis.yml
new file mode 100644
index 0000000000000000000000000000000000000000..734698c65cad58e1efa361dc244e912851a48a9b
--- /dev/null
+++ b/packages/kokkos/.travis.yml
@@ -0,0 +1,46 @@
+sudo: false
+
+language: cpp
+
+os:
+  - linux
+  - osx
+
+addons:
+  apt:
+    sources:
+      - ubuntu-toolchain-r-test
+    packages:
+      - cmake
+      - clang
+
+compiler:
+  - gcc
+  - clang
+
+env:
+  - THREADING="serial"
+  - THREADING="openmp"
+  - THREADING="pthread"
+
+# Apple GCC does not support OpenMP.  GCC with OpenMP requires Homebrew.
+# Apple Clang does not support OpenMP.  Clang with OpenMP requires Homebrew.
+# Clang OpenMP support is not always available.
+matrix:
+  exclude:
+    - compiler: clang
+      env: THREADING="openmp"
+    - os: osx
+      env: THREADING="openmp"
+    - os: osx
+      compiler: gcc
+
+script:
+  - export OMP_NUM_THREADS=2
+  - export OMP_PLACES=threads
+  - export OMP_PROC_BIND=threads
+  - mkdir build
+  - cd build
+  - ../generate_makefile.bash --compiler=$CXX --with-$THREADING --with-options=compiler_warnings
+  - make
+  - make test
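The THREADING entries combine with the os/compiler matrix to give one out-of-source build per Travis job. A sketch of what a single Linux gcc/OpenMP job amounts to when reproduced locally, with the $CXX and $THREADING variables substituted by hand (paths assume the Kokkos source root as the working directory):

	export OMP_NUM_THREADS=2 OMP_PLACES=threads OMP_PROC_BIND=threads
	mkdir build && cd build
	../generate_makefile.bash --compiler=g++ --with-openmp --with-options=compiler_warnings
	make && make test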
diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md
new file mode 100644
index 0000000000000000000000000000000000000000..feb2bd547f034a5ef233667a3a9b5c3d0e02d3f8
--- /dev/null
+++ b/packages/kokkos/CHANGELOG.md
@@ -0,0 +1,568 @@
+# Change Log
+
+## [2.6.00](https://github.com/kokkos/kokkos/tree/2.6.00) (2018-03-07)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.5.00...2.6.00)
+
+**Part of the Kokkos C++ Performance Portability Programming EcoSystem 2.6**      
+
+**Implemented enhancements:**
+
+- Support NVIDIA Volta microarchitecture [\#1466](https://github.com/kokkos/kokkos/issues/1466)
+- Kokkos - Define empty functions when profiling disabled [\#1424](https://github.com/kokkos/kokkos/issues/1424)
+- Don't use \_\_constant\_\_ cache for lock arrays, enable once per run update instead of once per call [\#1385](https://github.com/kokkos/kokkos/issues/1385)
+- task dag enhancement. [\#1354](https://github.com/kokkos/kokkos/issues/1354)
+- Cuda task team collectives and stack size [\#1353](https://github.com/kokkos/kokkos/issues/1353)
+- Replace View operator acceptance of more than rank integers with 'access' function [\#1333](https://github.com/kokkos/kokkos/issues/1333)
+- Interoperability: Do not shut down backend execution space runtimes upon calling finalize. [\#1305](https://github.com/kokkos/kokkos/issues/1305)
+- shmem\_size for LayoutStride [\#1291](https://github.com/kokkos/kokkos/issues/1291)
+- Kokkos::resize performs poorly on 1D Views [\#1270](https://github.com/kokkos/kokkos/issues/1270)
+- stride\(\) is inconsistent with dimension\(\), extent\(\), etc. [\#1214](https://github.com/kokkos/kokkos/issues/1214)
+- Kokkos::sort defaults to std::sort on host [\#1208](https://github.com/kokkos/kokkos/issues/1208)
+- DynamicView with host size grow [\#1206](https://github.com/kokkos/kokkos/issues/1206)
+- Unmanaged View with Anonymous Memory Space [\#1175](https://github.com/kokkos/kokkos/issues/1175)
+- Sort subset of Kokkos::DynamicView [\#1160](https://github.com/kokkos/kokkos/issues/1160)
+- MDRange policy doesn't support lambda reductions [\#1054](https://github.com/kokkos/kokkos/issues/1054)
+- Add ability to set hook on Kokkos::finalize [\#714](https://github.com/kokkos/kokkos/issues/714)
+- Atomics with Serial Backend - Default should be Disable? [\#549](https://github.com/kokkos/kokkos/issues/549)
+- KOKKOS\_ENABLE\_DEPRECATED\_CODE [\#1359](https://github.com/kokkos/kokkos/issues/1359)
+
+**Fixed bugs:**
+
+- cuda\_internal\_maximum\_warp\_count returns 8, but I believe it should return 16 for P100  [\#1269](https://github.com/kokkos/kokkos/issues/1269)
+- Cuda: level 1 scratch memory bug \(reported by Stan Moore\) [\#1434](https://github.com/kokkos/kokkos/issues/1434)
+- MDRangePolicy Reduction requires value\_type typedef in Functor [\#1379](https://github.com/kokkos/kokkos/issues/1379)
+- Kokkos DeepCopy between empty views fails [\#1369](https://github.com/kokkos/kokkos/issues/1369)
+- Several issues with new CMake build infrastructure \(reported by Eric Phipps\) [\#1365](https://github.com/kokkos/kokkos/issues/1365)
+- deep\_copy between rank-1 host/device views of differing layouts without UVM no longer works \(reported by Eric Phipps\) [\#1363](https://github.com/kokkos/kokkos/issues/1363)
+- Profiling can't be disabled in CMake, and a parallel\_for is missing for tasks \(reported by Kyungjoo Kim\) [\#1349](https://github.com/kokkos/kokkos/issues/1349)
+- get\_work\_partition int overflow \(reported by berryj5\) [\#1327](https://github.com/kokkos/kokkos/issues/1327)
+- Kokkos::deep\_copy must fence even if the two views are the same [\#1303](https://github.com/kokkos/kokkos/issues/1303)
+- CudaUVMSpace::allocate/deallocate must fence [\#1302](https://github.com/kokkos/kokkos/issues/1302)
+- ViewResize on CUDA fails in Debug because of too many resources requested [\#1299](https://github.com/kokkos/kokkos/issues/1299)
+- Cuda 9 and intrepid2 calls from Panzer. [\#1183](https://github.com/kokkos/kokkos/issues/1183)
+- Slowdown due to tracking\_enabled\(\) in 2.04.00 \(found by Albany app\) [\#1016](https://github.com/kokkos/kokkos/issues/1016)
+- Bounds checking fails with zero-span Views \(reported by Stan Moore\) [\#1411](https://github.com/kokkos/kokkos/issues/1411)
+
+
+## [2.5.00](https://github.com/kokkos/kokkos/tree/2.5.00) (2017-12-15)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.11...2.5.00)
+
+**Part of the Kokkos C++ Performance Portability Programming EcoSystem 2.5**      
+
+**Implemented enhancements:**
+
+- Provide Makefile.kokkos logic for CMake and TriBITS [\#878](https://github.com/kokkos/kokkos/issues/878)
+- Add Scatter View [\#825](https://github.com/kokkos/kokkos/issues/825)
+- Drop gcc 4.7 and intel 14 from supported compiler list [\#603](https://github.com/kokkos/kokkos/issues/603)
+- Enable construction of unmanaged view using common\_view\_alloc\_prop [\#1170](https://github.com/kokkos/kokkos/issues/1170)
+- Unused Function Warning with XL [\#1267](https://github.com/kokkos/kokkos/issues/1267)
+- Add memory pool parameter check [\#1218](https://github.com/kokkos/kokkos/issues/1218)
+- CUDA9: Fix warning for unsupported long double [\#1189](https://github.com/kokkos/kokkos/issues/1189)
+- CUDA9: fix warning on defaulted function marking [\#1188](https://github.com/kokkos/kokkos/issues/1188)
+- CUDA9: fix warnings for deprecated warp level functions [\#1187](https://github.com/kokkos/kokkos/issues/1187)
+- Add CUDA 9.0 nightly testing [\#1174](https://github.com/kokkos/kokkos/issues/1174)
+- {OMPI,MPICH}\_CXX hack breaks nvcc\_wrapper use case [\#1166](https://github.com/kokkos/kokkos/issues/1166)
+- KOKKOS\_HAVE\_CUDA\_LAMBDA became KOKKOS\_CUDA\_USE\_LAMBDA [\#1274](https://github.com/kokkos/kokkos/issues/1274)
+
+**Fixed bugs:**
+
+- MinMax Reducer with tagged operator doesn't compile [\#1251](https://github.com/kokkos/kokkos/issues/1251)
+- Reducers for Tagged operators give wrong answer [\#1250](https://github.com/kokkos/kokkos/issues/1250)
+- Kokkos not Compatible with Big Endian Machines? [\#1235](https://github.com/kokkos/kokkos/issues/1235)
+- Parallel Scan hangs forever on BG/Q [\#1234](https://github.com/kokkos/kokkos/issues/1234)
+- Threads backend doesn't compile with Clang on OS X [\#1232](https://github.com/kokkos/kokkos/issues/1232)
+- $\(shell date\) needs quote [\#1264](https://github.com/kokkos/kokkos/issues/1264)
+- Unqualified parallel\_for call conflicts with user-defined parallel\_for [\#1219](https://github.com/kokkos/kokkos/issues/1219)
+- KokkosAlgorithms: CMake issue in unit tests [\#1212](https://github.com/kokkos/kokkos/issues/1212)
+- Intel 18 Error: "simd pragma has been deprecated" [\#1210](https://github.com/kokkos/kokkos/issues/1210)
+- Memory leak in Kokkos::initialize [\#1194](https://github.com/kokkos/kokkos/issues/1194)
+- CUDA9: compiler error with static assert template arguments [\#1190](https://github.com/kokkos/kokkos/issues/1190)
+- Kokkos::Serial::is\_initialized returns always true [\#1184](https://github.com/kokkos/kokkos/issues/1184)
+- Triple nested parallelism still fails on bowman [\#1093](https://github.com/kokkos/kokkos/issues/1093)
+- OpenMP openmp.range on Develop Runs Forever on POWER7+ with RHEL7 and GCC4.8.5 [\#995](https://github.com/kokkos/kokkos/issues/995)
+- Rendezvous performance at global scope [\#985](https://github.com/kokkos/kokkos/issues/985)
+
+
+## [2.04.11](https://github.com/kokkos/kokkos/tree/2.04.11) (2017-10-28)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.04...2.04.11)
+
+**Implemented enhancements:**
+
+- Add Subview pattern. [\#648](https://github.com/kokkos/kokkos/issues/648)
+- Add Kokkos "global" is\_initialized [\#1060](https://github.com/kokkos/kokkos/issues/1060)
+- Add create\_mirror\_view\_and\_copy [\#1161](https://github.com/kokkos/kokkos/issues/1161)
+- Add KokkosConcepts SpaceAccessibility function [\#1092](https://github.com/kokkos/kokkos/issues/1092)
+- Option to Disable Initialize Warnings [\#1142](https://github.com/kokkos/kokkos/issues/1142)
+- Mature task-DAG capability [\#320](https://github.com/kokkos/kokkos/issues/320)
+- Promote Work DAG from experimental [\#1126](https://github.com/kokkos/kokkos/issues/1126)
+- Implement new WorkGraph push/pop [\#1108](https://github.com/kokkos/kokkos/issues/1108)
+- Kokkos\_ENABLE\_Cuda\_Lambda should default ON [\#1101](https://github.com/kokkos/kokkos/issues/1101)
+- Add multidimensional parallel for example and improve unit test [\#1064](https://github.com/kokkos/kokkos/issues/1064)
+- Fix ROCm:  Performance tests not building [\#1038](https://github.com/kokkos/kokkos/issues/1038)
+- Make KOKKOS\_ALIGN\_SIZE a configure-time option [\#1004](https://github.com/kokkos/kokkos/issues/1004)
+- Make alignment consistent [\#809](https://github.com/kokkos/kokkos/issues/809)
+- Improve subview construction on Cuda backend [\#615](https://github.com/kokkos/kokkos/issues/615)
+
+**Fixed bugs:**
+
+- Kokkos::vector fixes for application [\#1134](https://github.com/kokkos/kokkos/issues/1134)
+- DynamicView non-power of two value\_type [\#1177](https://github.com/kokkos/kokkos/issues/1177)
+- Memory pool bug [\#1154](https://github.com/kokkos/kokkos/issues/1154)
+- Cuda launch bounds performance regression bug [\#1140](https://github.com/kokkos/kokkos/issues/1140)
+- Significant performance regression in LAMMPS after updating Kokkos [\#1139](https://github.com/kokkos/kokkos/issues/1139)
+- CUDA compile error [\#1128](https://github.com/kokkos/kokkos/issues/1128)
+- MDRangePolicy neg idx test failure in debug mode [\#1113](https://github.com/kokkos/kokkos/issues/1113)
+- subview construction on Cuda backend [\#615](https://github.com/kokkos/kokkos/issues/615)
+
+## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04)
+
+**Implemented enhancements:**
+
+- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082)
+- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071)
+- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052)
+- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019)
+- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952)
+- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857)
+
+**Fixed bugs:**
+
+- Fix reduction\_identity\<T\>::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048)
+- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041)
+- (Experimental) HBWSpace  Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094)
+- (Experimental) ROCm:  algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070)
+
+## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00)
+
+**Implemented enhancements:**
+
+- Added ROCm backend to support AMD GPUs
+- Kokkos::complex\<T\> behaves slightly differently from std::complex\<T\> [\#1011](https://github.com/kokkos/kokkos/issues/1011)
+- Kokkos::Experimental::Crs constructor arguments were in the wrong order [\#992](https://github.com/kokkos/kokkos/issues/992)
+- Work graph construction ease-of-use (one lambda for count and fill) [\#991](https://github.com/kokkos/kokkos/issues/991)
+- when\_all returns pointer of futures (improved interface) [\#990](https://github.com/kokkos/kokkos/issues/990)
+- Allow assignment of LayoutLeft to LayoutRight or vice versa for rank-0 Views [\#594](https://github.com/kokkos/kokkos/issues/594)
+- Changed the meaning of Kokkos\_ENABLE\_CXX11\_DISPATCH\_LAMBDA [\#1035](https://github.com/kokkos/kokkos/issues/1035)
+
+**Fixed bugs:**
+
+- memory pool default constructor does not properly set member variables. [\#1007](https://github.com/kokkos/kokkos/issues/1007)
+
+## [2.03.13](https://github.com/kokkos/kokkos/tree/2.03.13) (2017-07-27)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.05...2.03.13)
+
+**Implemented enhancements:**
+
+- Disallow enabling both OpenMP and Threads in the same executable [\#406](https://github.com/kokkos/kokkos/issues/406)
+- Make Kokkos::OpenMP respect OMP environment even if hwloc is available [\#630](https://github.com/kokkos/kokkos/issues/630)
+- Improve Atomics Performance on KNL/Broadwell where PREFETCHW/RFO is Available [\#898](https://github.com/kokkos/kokkos/issues/898)
+- Kokkos::resize should test whether dimensions have changed before resizing [\#904](https://github.com/kokkos/kokkos/issues/904)
+- Develop performance-regression/acceptance tests [\#737](https://github.com/kokkos/kokkos/issues/737)
+- Make the deep\_copy Profiling hook a start/end system [\#890](https://github.com/kokkos/kokkos/issues/890)
+- Add deep\_copy Profiling hook [\#843](https://github.com/kokkos/kokkos/issues/843)
+- Append tag name to parallel construct name for Profiling [\#842](https://github.com/kokkos/kokkos/issues/842)
+- Add view label to `View bounds error` message for CUDA backend [\#870](https://github.com/kokkos/kokkos/issues/870)
+- Disable printing the loaded profiling library [\#824](https://github.com/kokkos/kokkos/issues/824)
+- "Declared but never referenced" warnings [\#853](https://github.com/kokkos/kokkos/issues/853)
+- Warnings about lock\_address\_cuda\_space [\#852](https://github.com/kokkos/kokkos/issues/852)
+- WorkGraph execution policy [\#771](https://github.com/kokkos/kokkos/issues/771)
+- Simplify makefiles by guarding compilation with appropriate KOKKOS\_ENABLE\_\#\#\# macros [\#716](https://github.com/kokkos/kokkos/issues/716)
+- Cmake build: wrong include install directory [\#668](https://github.com/kokkos/kokkos/issues/668)
+- Derived View type and allocation [\#566](https://github.com/kokkos/kokkos/issues/566)
+- Fix Compiler warnings when compiling core unit tests for Cuda [\#214](https://github.com/kokkos/kokkos/issues/214)
+
+**Fixed bugs:**
+
+- Out-of-bounds read in Kokkos\_Layout.hpp [\#975](https://github.com/kokkos/kokkos/issues/975)
+- CudaClang: Fix failing test with Clang 4.0 [\#941](https://github.com/kokkos/kokkos/issues/941)
+- Respawn when memory pool allocation fails \(not available memory\) [\#940](https://github.com/kokkos/kokkos/issues/940)
+- Memory pool aborts on zero allocation request, returns NULL for \< minimum [\#939](https://github.com/kokkos/kokkos/issues/939)
+- Error with TaskScheduler query of underlying memory pool [\#917](https://github.com/kokkos/kokkos/issues/917)
+- Profiling::\*Callee static variables declared in header [\#863](https://github.com/kokkos/kokkos/issues/863)
+- calling \*Space::name\(\) causes compile error [\#862](https://github.com/kokkos/kokkos/issues/862)
+- bug in Profiling::deallocateData [\#860](https://github.com/kokkos/kokkos/issues/860)
+- task\_depend test failing, CUDA 8.0 + Pascal + RDC [\#829](https://github.com/kokkos/kokkos/issues/829)
+- \[develop branch\] Standalone cmake issues [\#826](https://github.com/kokkos/kokkos/issues/826)
+- Kokkos CUDA fails to compile with OMPI\_CXX and MPICH\_CXX wrappers [\#776](https://github.com/kokkos/kokkos/issues/776)
+- Task Team reduction on Pascal [\#767](https://github.com/kokkos/kokkos/issues/767)
+- CUDA stack overflow with TaskDAG test [\#758](https://github.com/kokkos/kokkos/issues/758)
+- TeamVector test on Cuda [\#670](https://github.com/kokkos/kokkos/issues/670)
+- Clang 4.0 Cuda Build broken again [\#560](https://github.com/kokkos/kokkos/issues/560)
+
+
+## [2.03.05](https://github.com/kokkos/kokkos/tree/2.03.05) (2017-05-27)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.00...2.03.05)
+
+**Implemented enhancements:**
+
+- Harmonize Custom Reductions over nesting levels [\#802](https://github.com/kokkos/kokkos/issues/802)
+- Prevent users directly including KokkosCore\_config.h [\#815](https://github.com/kokkos/kokkos/issues/815)
+- DualView aborts on concurrent host/device modify \(in debug mode\) [\#814](https://github.com/kokkos/kokkos/issues/814)
+- Abort when running on a NVIDIA CC5.0 or higher architecture with code compiled for CC \< 5.0 [\#813](https://github.com/kokkos/kokkos/issues/813)
+- Add "name" function to ExecSpaces [\#806](https://github.com/kokkos/kokkos/issues/806)
+- Allow null Future in task spawn dependences [\#795](https://github.com/kokkos/kokkos/issues/795)
+- Add Unit Tests for Kokkos::complex [\#785](https://github.com/kokkos/kokkos/issues/785)
+- Add pow function for Kokkos::complex [\#784](https://github.com/kokkos/kokkos/issues/784)
+- Square root of a complex [\#729](https://github.com/kokkos/kokkos/issues/729)
+- Command line processing of --threads argument prevents users from having any commandline arguments starting with --threads [\#760](https://github.com/kokkos/kokkos/issues/760)
+- Protected deprecated API with appropriate macro [\#756](https://github.com/kokkos/kokkos/issues/756)
+- Allow task scheduler memory pool to be used by tasks [\#747](https://github.com/kokkos/kokkos/issues/747)
+- View bounds checking on host-side performance: constructing a std::string [\#723](https://github.com/kokkos/kokkos/issues/723)
+- Add check for AppleClang as compiler distinct from check for Clang. [\#705](https://github.com/kokkos/kokkos/issues/705)
+- Uninclude source files for specific configurations to prevent link warning. [\#701](https://github.com/kokkos/kokkos/issues/701)
+- Add --small option to snapshot script [\#697](https://github.com/kokkos/kokkos/issues/697)
+- CMake Standalone Support [\#674](https://github.com/kokkos/kokkos/issues/674)
+- CMake build unit test and install [\#808](https://github.com/kokkos/kokkos/issues/808)
+- CMake: Fix having kokkos as a subdirectory in a pure cmake project [\#629](https://github.com/kokkos/kokkos/issues/629)
+- Tribits macro assumes build directory is in top level source directory [\#654](https://github.com/kokkos/kokkos/issues/654)
+- Use bin/nvcc\_wrapper, not config/nvcc\_wrapper [\#562](https://github.com/kokkos/kokkos/issues/562)
+- Allow MemoryPool::allocate\(\) to be called from multiple threads per warp. [\#487](https://github.com/kokkos/kokkos/issues/487)
+- Move OpenMP 4.5 OpenMPTarget backend into Develop [\#456](https://github.com/kokkos/kokkos/issues/456)
+- Testing on ARM testbed [\#288](https://github.com/kokkos/kokkos/issues/288)
+
+**Fixed bugs:**
+
+- Fix label in OpenMP parallel\_reduce verify\_initialized [\#834](https://github.com/kokkos/kokkos/issues/834)
+- TeamScratch Level 1 on Cuda hangs [\#820](https://github.com/kokkos/kokkos/issues/820)
+- \[bug\] memory pool. [\#786](https://github.com/kokkos/kokkos/issues/786)
+- Some Reduction Tests fail on Intel 18 with aggressive vectorization on [\#774](https://github.com/kokkos/kokkos/issues/774)
+- Error copying dynamic view on copy of memory pool [\#773](https://github.com/kokkos/kokkos/issues/773)
+- CUDA stack overflow with TaskDAG test [\#758](https://github.com/kokkos/kokkos/issues/758)
+- ThreadVectorRange Customized Reduction Bug [\#739](https://github.com/kokkos/kokkos/issues/739)
+- set\_scratch\_size overflows  [\#726](https://github.com/kokkos/kokkos/issues/726)
+- Get wrong results for compiler checks in Makefile on OS X. [\#706](https://github.com/kokkos/kokkos/issues/706)
+- Fix check if multiple host architectures enabled. [\#702](https://github.com/kokkos/kokkos/issues/702)
+- Threads Backend Does not Pass on Cray Compilers [\#609](https://github.com/kokkos/kokkos/issues/609)
+- Rare bug in memory pool where allocation can finish on superblock in empty state [\#452](https://github.com/kokkos/kokkos/issues/452)
+- LDFLAGS in core/unit\_test/Makefile: potential "undefined reference" to pthread lib [\#148](https://github.com/kokkos/kokkos/issues/148)
+
+## [2.03.00](https://github.com/kokkos/kokkos/tree/2.03.00) (2017-04-25)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.15...2.03.00)
+
+**Implemented enhancements:**
+
+- UnorderedMap: make it accept Devices or MemorySpaces [\#711](https://github.com/kokkos/kokkos/issues/711)
+- sort to accept DynamicView and \[begin,end\) indices [\#691](https://github.com/kokkos/kokkos/issues/691)
+- ENABLE Macros should only be used via \#ifdef or \#if defined [\#675](https://github.com/kokkos/kokkos/issues/675)
+- Remove impl/Kokkos\_Synchronic\_\* [\#666](https://github.com/kokkos/kokkos/issues/666)
+- Turning off IVDEP for Intel 14.  [\#638](https://github.com/kokkos/kokkos/issues/638)
+- Using an installed Kokkos in a target application using CMake [\#633](https://github.com/kokkos/kokkos/issues/633)
+- Create Kokkos Bill of Materials [\#632](https://github.com/kokkos/kokkos/issues/632)
+- MDRangePolicy and tagged evaluators [\#547](https://github.com/kokkos/kokkos/issues/547)
+- Add PGI support [\#289](https://github.com/kokkos/kokkos/issues/289)
+
+**Fixed bugs:**
+
+- Output from PerTeam fails [\#733](https://github.com/kokkos/kokkos/issues/733)
+- Cuda: architecture flag not added to link line [\#688](https://github.com/kokkos/kokkos/issues/688)
+- Getting large chunks of memory for a thread team in a universal way [\#664](https://github.com/kokkos/kokkos/issues/664)
+- Kokkos RNG normal\(\) function hangs for small seed value [\#655](https://github.com/kokkos/kokkos/issues/655)
+- Kokkos Tests Errors on Shepard/HSW Builds [\#644](https://github.com/kokkos/kokkos/issues/644)
+
+## [2.02.15](https://github.com/kokkos/kokkos/tree/2.02.15) (2017-02-10)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.07...2.02.15)
+
+**Implemented enhancements:**
+
+- Containers: Adding block partitioning to StaticCrsGraph [\#625](https://github.com/kokkos/kokkos/issues/625)
+- Kokkos Make System can induce Errors on Cray Volta System [\#610](https://github.com/kokkos/kokkos/issues/610)
+- OpenMP: error out if KOKKOS\_HAVE\_OPENMP is defined but not \_OPENMP [\#605](https://github.com/kokkos/kokkos/issues/605)
+- CMake: fix standalone build with tests [\#604](https://github.com/kokkos/kokkos/issues/604)
+- Change README \(that GitHub shows when opening Kokkos project page\) to tell users how to submit PRs [\#597](https://github.com/kokkos/kokkos/issues/597)
+- Add correctness testing for all operators of Atomic View [\#420](https://github.com/kokkos/kokkos/issues/420)
+- Allow assignment of Views with compatible memory spaces [\#290](https://github.com/kokkos/kokkos/issues/290)
+- Build only one version of Kokkos library for tests [\#213](https://github.com/kokkos/kokkos/issues/213)
+- Clean out old KOKKOS\_HAVE\_CXX11 macros clauses [\#156](https://github.com/kokkos/kokkos/issues/156)
+- Harmonize Macro names [\#150](https://github.com/kokkos/kokkos/issues/150)
+
+**Fixed bugs:**
+
+- Cray and PGI: Kokkos\_Parallel\_Reduce [\#634](https://github.com/kokkos/kokkos/issues/634)
+- Kokkos Make System can induce Errors on Cray Volta System [\#610](https://github.com/kokkos/kokkos/issues/610)
+- Normal\(\) function random number generator doesn't give the expected distribution [\#592](https://github.com/kokkos/kokkos/issues/592)
+
+## [2.02.07](https://github.com/kokkos/kokkos/tree/2.02.07) (2016-12-16)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.01...2.02.07)
+
+**Implemented enhancements:**
+
+- Add CMake option to enable Cuda Lambda support [\#589](https://github.com/kokkos/kokkos/issues/589)
+- Add CMake option to enable Cuda RDC support [\#588](https://github.com/kokkos/kokkos/issues/588)
+- Add Initial Intel Sky Lake Xeon-HPC Compiler Support to Kokkos Make System [\#584](https://github.com/kokkos/kokkos/issues/584)
+- Building Tutorial Examples  [\#582](https://github.com/kokkos/kokkos/issues/582)
+- Internal way for using ThreadVectorRange without TeamHandle  [\#574](https://github.com/kokkos/kokkos/issues/574)
+- Testing: Add testing for uvm and rdc [\#571](https://github.com/kokkos/kokkos/issues/571)
+- Profiling: Add Memory Tracing and Region Markers [\#557](https://github.com/kokkos/kokkos/issues/557)
+- nvcc\_wrapper not installed with Kokkos built with CUDA through CMake [\#543](https://github.com/kokkos/kokkos/issues/543)
+- Improve DynRankView debug check [\#541](https://github.com/kokkos/kokkos/issues/541)
+- Benchmarks: Add Gather benchmark [\#536](https://github.com/kokkos/kokkos/issues/536)
+- Testing: add spot\_check option to test\_all\_sandia [\#535](https://github.com/kokkos/kokkos/issues/535)
+- Deprecate Kokkos::Impl::VerifyExecutionCanAccessMemorySpace [\#527](https://github.com/kokkos/kokkos/issues/527)
+- Add AtomicAdd support for 64bit float for Pascal [\#522](https://github.com/kokkos/kokkos/issues/522)
+- Add Restrict and Aligned memory trait [\#517](https://github.com/kokkos/kokkos/issues/517)
+- Kokkos Tests are Not Run using Compiler Optimization [\#501](https://github.com/kokkos/kokkos/issues/501)
+- Add support for clang 3.7 w/ openmp backend [\#393](https://github.com/kokkos/kokkos/issues/393)
+- Provide an error throw class [\#79](https://github.com/kokkos/kokkos/issues/79)
+
+**Fixed bugs:**
+
+- Cuda UVM Allocation test broken with UVM as default space [\#586](https://github.com/kokkos/kokkos/issues/586)
+- Bug \(develop branch only\): multiple tests are now failing when forcing uvm usage. [\#570](https://github.com/kokkos/kokkos/issues/570)
+- Error in generate\_makefile.sh for Kokkos when Compiler is Empty String/Fails [\#568](https://github.com/kokkos/kokkos/issues/568)
+- XL 13.1.4 incorrect C++11 flag [\#553](https://github.com/kokkos/kokkos/issues/553)
+- Improve DynRankView debug check [\#541](https://github.com/kokkos/kokkos/issues/541)
+- Installing Library on MAC broken due to cp -u [\#539](https://github.com/kokkos/kokkos/issues/539)
+- Intel Nightly Testing with Debug enabled fails [\#534](https://github.com/kokkos/kokkos/issues/534)
+
+## [2.02.01](https://github.com/kokkos/kokkos/tree/2.02.01) (2016-11-01)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.00...2.02.01)
+
+**Implemented enhancements:**
+
+- Add Changelog generation to our process. [\#506](https://github.com/kokkos/kokkos/issues/506)
+
+**Fixed bugs:**
+
+- Test scratch\_request fails in Serial with Debug enabled [\#520](https://github.com/kokkos/kokkos/issues/520)
+- Bug In BoundsCheck for DynRankView [\#516](https://github.com/kokkos/kokkos/issues/516)
+
+## [2.02.00](https://github.com/kokkos/kokkos/tree/2.02.00) (2016-10-30)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.10...2.02.00)
+
+**Implemented enhancements:**
+
+- Add PowerPC assembly for grabbing clock register in memory pool [\#511](https://github.com/kokkos/kokkos/issues/511)
+- Add GCC 6.x support [\#508](https://github.com/kokkos/kokkos/issues/508)
+- Test install and build against installed library [\#498](https://github.com/kokkos/kokkos/issues/498)
+- Makefile.kokkos adds expt-extended-lambda to cuda build with clang [\#490](https://github.com/kokkos/kokkos/issues/490)
+- Add top-level makefile option to just test kokkos-core unit-test [\#485](https://github.com/kokkos/kokkos/issues/485)
+- Split and harmonize Object Files of Core UnitTests to increase build parallelism [\#484](https://github.com/kokkos/kokkos/issues/484)
+- LayoutLeft to LayoutLeft subview for 3D and 4D views [\#473](https://github.com/kokkos/kokkos/issues/473)
+- Add official Cuda 8.0 support [\#468](https://github.com/kokkos/kokkos/issues/468)
+- Allow C++1Z Flag for Class Lambda capture [\#465](https://github.com/kokkos/kokkos/issues/465)
+- Add Clang 4.0+ compilation of Cuda code [\#455](https://github.com/kokkos/kokkos/issues/455)
+- Possible Issue with Intel 17.0.098 and GCC 6.1.0 in Develop Branch [\#445](https://github.com/kokkos/kokkos/issues/445)
+- Add name of view to "View bounds error" [\#432](https://github.com/kokkos/kokkos/issues/432)
+- Move Sort Binning Operators into Kokkos namespace [\#421](https://github.com/kokkos/kokkos/issues/421)
+- TaskPolicy - generate error when attempt to use uninitialized  [\#396](https://github.com/kokkos/kokkos/issues/396)
+- Import WithoutInitializing and AllowPadding into Kokkos namespace [\#325](https://github.com/kokkos/kokkos/issues/325)
+- TeamThreadRange requires begin, end to be the same type [\#305](https://github.com/kokkos/kokkos/issues/305)
+- CudaUVMSpace should track \# allocations, due to CUDA limit on \# UVM allocations [\#300](https://github.com/kokkos/kokkos/issues/300)
+- Remove old View and its infrastructure [\#259](https://github.com/kokkos/kokkos/issues/259)
+
+**Fixed bugs:**
+
+- Bug in TestCuda\_Other.cpp: most likely assembly inserted into Device code [\#515](https://github.com/kokkos/kokkos/issues/515)
+- Cuda Compute Capability check of GPU is outdated [\#509](https://github.com/kokkos/kokkos/issues/509)
+- multi\_scratch test with hwloc and pthreads seg-faults.  [\#504](https://github.com/kokkos/kokkos/issues/504)
+- generate\_makefile.bash: "make install" is broken [\#503](https://github.com/kokkos/kokkos/issues/503)
+- make clean in Out of Source Build/Tests Does Not Work Correctly [\#502](https://github.com/kokkos/kokkos/issues/502)
+- Makefiles for test and examples have issues in Cuda when CXX is not explicitly specified [\#497](https://github.com/kokkos/kokkos/issues/497)
+- Dispatch lambda test directly inside GTEST macro doesn't work with nvcc [\#491](https://github.com/kokkos/kokkos/issues/491)
+- UnitTests with HWLOC enabled fail if run with mpirun bound to a single core [\#489](https://github.com/kokkos/kokkos/issues/489)
+- Failing Reducer Test on Mac with Pthreads [\#479](https://github.com/kokkos/kokkos/issues/479)
+- make test Dumps Error with Clang Not Found [\#471](https://github.com/kokkos/kokkos/issues/471)
+- OpenMP TeamPolicy member broadcast not using correct volatile shared variable [\#424](https://github.com/kokkos/kokkos/issues/424)
+- TaskPolicy - generate error when attempt to use uninitialized  [\#396](https://github.com/kokkos/kokkos/issues/396)
+- New task policy implementation is pulling in old experimental code. [\#372](https://github.com/kokkos/kokkos/issues/372)
+- MemoryPool unit test hangs on Power8 with GCC 6.1.0 [\#298](https://github.com/kokkos/kokkos/issues/298)
+
+## [2.01.10](https://github.com/kokkos/kokkos/tree/2.01.10) (2016-09-27)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.06...2.01.10)
+
+**Implemented enhancements:**
+
+- Enable Profiling by default in Tribits build [\#438](https://github.com/kokkos/kokkos/issues/438)
+- parallel\_reduce\(0\), parallel\_scan\(0\) unit tests [\#436](https://github.com/kokkos/kokkos/issues/436)
+- data\(\)==NULL after realloc with LayoutStride [\#351](https://github.com/kokkos/kokkos/issues/351)
+- Fix tutorials to track new Kokkos::View [\#323](https://github.com/kokkos/kokkos/issues/323)
+- Rename team policy set\_scratch\_size. [\#195](https://github.com/kokkos/kokkos/issues/195)
+
+**Fixed bugs:**
+
+- Possible Issue with Intel 17.0.098 and GCC 6.1.0 in Develop Branch [\#445](https://github.com/kokkos/kokkos/issues/445)
+- Makefile spits syntax error [\#435](https://github.com/kokkos/kokkos/issues/435)
+- Kokkos::sort fails for view with all the same values [\#422](https://github.com/kokkos/kokkos/issues/422)
+- Generic Reducers: can't accept inline constructed reducer [\#404](https://github.com/kokkos/kokkos/issues/404)
+- data\(\)==NULL after realloc with LayoutStride [\#351](https://github.com/kokkos/kokkos/issues/351)
+- const subview of const view with compile time dimensions on Cuda backend [\#310](https://github.com/kokkos/kokkos/issues/310)
+- Kokkos \(in Trilinos\) Causes Internal Compiler Error on CUDA 8.0.21-EA on POWER8 [\#307](https://github.com/kokkos/kokkos/issues/307)
+- Core Oversubscription Detection Broken? [\#159](https://github.com/kokkos/kokkos/issues/159)
+
+
+## [2.01.06](https://github.com/kokkos/kokkos/tree/2.01.06) (2016-09-02)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.00...2.01.06)
+
+**Implemented enhancements:**
+
+- Add "standard" reducers for lambda-supportable customized reduce [\#411](https://github.com/kokkos/kokkos/issues/411)
+- TaskPolicy - single thread back-end execution [\#390](https://github.com/kokkos/kokkos/issues/390)
+- Kokkos master clone tag [\#387](https://github.com/kokkos/kokkos/issues/387)
+- Query memory requirements from task policy [\#378](https://github.com/kokkos/kokkos/issues/378)
+- Output order of test\_atomic.cpp is confusing [\#373](https://github.com/kokkos/kokkos/issues/373)
+- Missing testing for atomics [\#341](https://github.com/kokkos/kokkos/issues/341)
+- Feature request for Kokkos to provide Kokkos::atomic\_fetch\_max and atomic\_fetch\_min [\#336](https://github.com/kokkos/kokkos/issues/336)
+- TaskPolicy\<Cuda\> performance requires teams mapped to warps [\#218](https://github.com/kokkos/kokkos/issues/218)
+
+**Fixed bugs:**
+
+- Reduce with Teams broken for custom initialize [\#407](https://github.com/kokkos/kokkos/issues/407)
+- Failing Kokkos build on Debian [\#402](https://github.com/kokkos/kokkos/issues/402)
+- Failing Tests on NVIDIA Pascal GPUs [\#398](https://github.com/kokkos/kokkos/issues/398)
+- Algorithms: fill\_random assumes dimensions fit in unsigned int [\#389](https://github.com/kokkos/kokkos/issues/389)
+- Kokkos::subview with RandomAccess Memory Trait [\#385](https://github.com/kokkos/kokkos/issues/385)
+- Build warning \(signed / unsigned comparison\) in Cuda implementation [\#365](https://github.com/kokkos/kokkos/issues/365)
+- wrong results for a parallel\_reduce with CUDA8 / Maxwell50 [\#352](https://github.com/kokkos/kokkos/issues/352)
+- Hierarchical parallelism - 3 level unit test [\#344](https://github.com/kokkos/kokkos/issues/344)
+- Can I allocate a View w/ both WithoutInitializing & AllowPadding? [\#324](https://github.com/kokkos/kokkos/issues/324)
+- subview View layout determination [\#309](https://github.com/kokkos/kokkos/issues/309)
+- Unit tests with Cuda - Maxwell [\#196](https://github.com/kokkos/kokkos/issues/196)
+
+## [2.01.00](https://github.com/kokkos/kokkos/tree/2.01.00) (2016-07-21)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/End_C++98...2.01.00)
+
+**Implemented enhancements:**
+
+- Edit ViewMapping so assigning Views with the same custom layout compiles when const casting [\#327](https://github.com/kokkos/kokkos/issues/327)
+- DynRankView: Performance improvement for operator\(\) [\#321](https://github.com/kokkos/kokkos/issues/321)
+- Interoperability between static and dynamic rank views [\#295](https://github.com/kokkos/kokkos/issues/295)
+- subview member function ? [\#280](https://github.com/kokkos/kokkos/issues/280)
+- Inter-operatibility between View and DynRankView. [\#245](https://github.com/kokkos/kokkos/issues/245)
+- \(Trilinos\) build warning in atomic\_assign, with Kokkos::complex [\#177](https://github.com/kokkos/kokkos/issues/177)
+- View\<\>::shmem\_size should runtime check for number of arguments equal to rank [\#176](https://github.com/kokkos/kokkos/issues/176)
+- Custom reduction join via lambda argument [\#99](https://github.com/kokkos/kokkos/issues/99)
+- DynRankView with 0 dimensions passed in at construction [\#293](https://github.com/kokkos/kokkos/issues/293)
+- Inject view\_alloc and friends into Kokkos namespace [\#292](https://github.com/kokkos/kokkos/issues/292)
+- Less restrictive TeamPolicy reduction on Cuda [\#286](https://github.com/kokkos/kokkos/issues/286)
+- deep\_copy using remap with source execution space [\#267](https://github.com/kokkos/kokkos/issues/267)
+- Suggestion:  Enable opt-in L1 caching via nvcc-wrapper [\#261](https://github.com/kokkos/kokkos/issues/261)
+- More flexible create\_mirror functions [\#260](https://github.com/kokkos/kokkos/issues/260)
+- Rename View::memory\_span to View::required\_allocation\_size [\#256](https://github.com/kokkos/kokkos/issues/256)
+- Use of subviews and views with compile-time dimensions [\#237](https://github.com/kokkos/kokkos/issues/237)
+- Kokkos::Timer [\#234](https://github.com/kokkos/kokkos/issues/234)
+- Fence CudaUVMSpace allocations [\#230](https://github.com/kokkos/kokkos/issues/230)
+- View::operator\(\) accept std::is\_integral and std::is\_enum [\#227](https://github.com/kokkos/kokkos/issues/227)
+- Allocating zero size View [\#216](https://github.com/kokkos/kokkos/issues/216)
+- Thread scalable memory pool [\#212](https://github.com/kokkos/kokkos/issues/212)
+- Add a way to disable memory leak output [\#194](https://github.com/kokkos/kokkos/issues/194)
+- Kokkos exec space init should init Kokkos profiling [\#192](https://github.com/kokkos/kokkos/issues/192)
+- Runtime rank wrapper for View [\#189](https://github.com/kokkos/kokkos/issues/189)
+- Profiling Interface [\#158](https://github.com/kokkos/kokkos/issues/158)
+- Fix View assignment \(of managed to unmanaged\) [\#153](https://github.com/kokkos/kokkos/issues/153)
+- Add unit test for assignment of managed View to unmanaged View [\#152](https://github.com/kokkos/kokkos/issues/152)
+- Check for oversubscription of threads with MPI in Kokkos::initialize [\#149](https://github.com/kokkos/kokkos/issues/149)
+- Dynamic resizeable 1dimensional view [\#143](https://github.com/kokkos/kokkos/issues/143)
+- Develop TaskPolicy for CUDA [\#142](https://github.com/kokkos/kokkos/issues/142)
+- New View : Test Compilation Downstream [\#138](https://github.com/kokkos/kokkos/issues/138)
+- New View Implementation [\#135](https://github.com/kokkos/kokkos/issues/135)
+- Add variant of subview that lets users add traits [\#134](https://github.com/kokkos/kokkos/issues/134)
+- NVCC-WRAPPER: Add --host-only flag [\#121](https://github.com/kokkos/kokkos/issues/121)
+- Address gtest issue with TriBITS Kokkos build outside of Trilinos [\#117](https://github.com/kokkos/kokkos/issues/117)
+- Make tests pass with -expt-extended-lambda on CUDA [\#108](https://github.com/kokkos/kokkos/issues/108)
+- Dynamic scheduling for parallel\_for and parallel\_reduce [\#106](https://github.com/kokkos/kokkos/issues/106)
+- Runtime or compile time error when reduce functor's join is not properly specified as const member function or with volatile arguments [\#105](https://github.com/kokkos/kokkos/issues/105)
+- Error out when the number of threads is modified after kokkos is initialized [\#104](https://github.com/kokkos/kokkos/issues/104)
+- Porting to POWER and remove assumption of X86 default [\#103](https://github.com/kokkos/kokkos/issues/103)
+- Dynamic scheduling option for RangePolicy [\#100](https://github.com/kokkos/kokkos/issues/100)
+- SharedMemory Support for Lambdas [\#81](https://github.com/kokkos/kokkos/issues/81)
+- Recommended TeamSize for Lambdas [\#80](https://github.com/kokkos/kokkos/issues/80)
+- Add Aggressive Vectorization Compilation mode [\#72](https://github.com/kokkos/kokkos/issues/72)
+- Dynamic scheduling team execution policy [\#53](https://github.com/kokkos/kokkos/issues/53)
+- UVM allocations in multi-GPU systems [\#50](https://github.com/kokkos/kokkos/issues/50)
+- Synchronic in Kokkos::Impl [\#44](https://github.com/kokkos/kokkos/issues/44)
+- index and dimension types in for loops [\#28](https://github.com/kokkos/kokkos/issues/28)
+- Subview assign of 1D Strided with stride 1 to LayoutLeft/Right [\#1](https://github.com/kokkos/kokkos/issues/1)
+
+**Fixed bugs:**
+
+- misspelled variable name in Kokkos\_Atomic\_Fetch + missing unit tests [\#340](https://github.com/kokkos/kokkos/issues/340)
+- seg fault Kokkos::Impl::CudaInternal::print\_configuration [\#338](https://github.com/kokkos/kokkos/issues/338)
+- Clang compiler error with named parallel\_reduce, tags, and TeamPolicy. [\#335](https://github.com/kokkos/kokkos/issues/335)
+- Shared Memory Allocation Error at parallel\_reduce [\#311](https://github.com/kokkos/kokkos/issues/311)
+- DynRankView: Fix resize and realloc [\#303](https://github.com/kokkos/kokkos/issues/303)
+- Scratch memory and dynamic scheduling [\#279](https://github.com/kokkos/kokkos/issues/279)
+- MemoryPool infinite loop when out of memory [\#312](https://github.com/kokkos/kokkos/issues/312)
+- Kokkos DynRankView changes break Sacado and Panzer [\#299](https://github.com/kokkos/kokkos/issues/299)
+- MemoryPool fails to compile on non-cuda non-x86 [\#297](https://github.com/kokkos/kokkos/issues/297)
+- Random Number Generator Fix [\#296](https://github.com/kokkos/kokkos/issues/296)
+- View template parameter ordering Bug [\#282](https://github.com/kokkos/kokkos/issues/282)
+- Serial task policy broken. [\#281](https://github.com/kokkos/kokkos/issues/281)
+- deep\_copy with LayoutStride should not memcpy [\#262](https://github.com/kokkos/kokkos/issues/262)
+- DualView::need\_sync should be a const method [\#248](https://github.com/kokkos/kokkos/issues/248)
+- Arbitrary-sized atomics on GPUs broken; loop forever [\#238](https://github.com/kokkos/kokkos/issues/238)
+- boolean reduction value\_type changes answer [\#225](https://github.com/kokkos/kokkos/issues/225)
+- Custom init\(\) function for parallel\_reduce with array value\_type [\#210](https://github.com/kokkos/kokkos/issues/210)
+- unit\_test Makefile is Broken - Recursively Calls itself until Machine Apocalypse. [\#202](https://github.com/kokkos/kokkos/issues/202)
+- nvcc\_wrapper Does Not Support  -Xcompiler \<compiler option\> [\#198](https://github.com/kokkos/kokkos/issues/198)
+- Kokkos exec space init should init Kokkos profiling [\#192](https://github.com/kokkos/kokkos/issues/192)
+- Kokkos Threads Backend impl\_shared\_alloc Broken on Intel 16.1 \(Shepard Haswell\) [\#186](https://github.com/kokkos/kokkos/issues/186)
+- pthread back end hangs if used uninitialized [\#182](https://github.com/kokkos/kokkos/issues/182)
+- parallel\_reduce of size 0, not calling init/join [\#175](https://github.com/kokkos/kokkos/issues/175)
+- Bug in Threads with OpenMP enabled [\#173](https://github.com/kokkos/kokkos/issues/173)
+- KokkosExp\_SharedAlloc, m\_team\_work\_index inaccessible [\#166](https://github.com/kokkos/kokkos/issues/166)
+- 128-bit CAS without Assembly Broken? [\#161](https://github.com/kokkos/kokkos/issues/161)
+- fatal error: Cuda/Kokkos\_Cuda\_abort.hpp: No such file or directory [\#157](https://github.com/kokkos/kokkos/issues/157)
+- Power8: Fix OpenMP backend [\#139](https://github.com/kokkos/kokkos/issues/139)
+- Data race in Kokkos OpenMP initialization [\#131](https://github.com/kokkos/kokkos/issues/131)
+- parallel\_launch\_local\_memory and cuda 7.5 [\#125](https://github.com/kokkos/kokkos/issues/125)
+- Resize can fail with Cuda due to asynchronous dispatch [\#119](https://github.com/kokkos/kokkos/issues/119)
+- Qthread taskpolicy initialization bug. [\#92](https://github.com/kokkos/kokkos/issues/92)
+- Windows: sys/mman.h [\#89](https://github.com/kokkos/kokkos/issues/89)
+- Windows: atomic\_fetch\_sub\(\) [\#88](https://github.com/kokkos/kokkos/issues/88)
+- Windows: snprintf [\#87](https://github.com/kokkos/kokkos/issues/87)
+- Parallel\_Reduce with TeamPolicy and league size of 0 returns garbage [\#85](https://github.com/kokkos/kokkos/issues/85)
+- Throw with Cuda when using \(2D\) team\_policy parallel\_reduce with less than a warp size [\#76](https://github.com/kokkos/kokkos/issues/76)
+- Scalar views don't work with Kokkos::Atomic memory trait [\#69](https://github.com/kokkos/kokkos/issues/69)
+- Reduce the number of threads per team for Cuda [\#63](https://github.com/kokkos/kokkos/issues/63)
+- Named Kernels fail for reductions with CUDA [\#60](https://github.com/kokkos/kokkos/issues/60)
+- Kokkos View dimension\_\(\) for long returning unsigned int [\#20](https://github.com/kokkos/kokkos/issues/20)
+- atomic test hangs with LLVM [\#6](https://github.com/kokkos/kokkos/issues/6)
+- OpenMP Test should set omp\_set\_num\_threads to 1 [\#4](https://github.com/kokkos/kokkos/issues/4)
+
+**Closed issues:**
+
+- develop branch broken with CUDA 8 and --expt-extended-lambda  [\#354](https://github.com/kokkos/kokkos/issues/354)
+- --arch=KNL with Intel 2016 build failure [\#349](https://github.com/kokkos/kokkos/issues/349)
+- Error building with Cuda when passing -DKOKKOS\_CUDA\_USE\_LAMBDA to generate\_makefile.bash [\#343](https://github.com/kokkos/kokkos/issues/343)
+- Can I safely use int indices in a 2-D View with capacity \> 2B? [\#318](https://github.com/kokkos/kokkos/issues/318)
+- Kokkos::ViewAllocateWithoutInitializing is not working [\#317](https://github.com/kokkos/kokkos/issues/317)
+- Intel build on Mac OS X [\#277](https://github.com/kokkos/kokkos/issues/277)
+- deleted [\#271](https://github.com/kokkos/kokkos/issues/271)
+- Broken Mira build [\#268](https://github.com/kokkos/kokkos/issues/268)
+- 32-bit build [\#246](https://github.com/kokkos/kokkos/issues/246)
+- parallel\_reduce with RDC crashes linker [\#232](https://github.com/kokkos/kokkos/issues/232)
+- build of Kokkos\_Sparse\_MV\_impl\_spmv\_Serial.cpp.o fails if you use nvcc and have cuda disabled [\#209](https://github.com/kokkos/kokkos/issues/209)
+- Kokkos Serial execution space is not tested with TeamPolicy. [\#207](https://github.com/kokkos/kokkos/issues/207)
+- Unit test failure on Hansen  KokkosCore\_UnitTest\_Cuda\_MPI\_1 [\#200](https://github.com/kokkos/kokkos/issues/200)
+- nvcc compiler warning: calling a \_\_host\_\_ function from a \_\_host\_\_ \_\_device\_\_ function is not allowed [\#180](https://github.com/kokkos/kokkos/issues/180)
+- Intel 15 build error with defaulted "move" operators [\#171](https://github.com/kokkos/kokkos/issues/171)
+- missing libkokkos.a during Trilinos 12.4.2 build, yet other libkokkos\*.a libs are there [\#165](https://github.com/kokkos/kokkos/issues/165)
+- Tie atomic updates to execution space or even to thread team? \(speculation\) [\#144](https://github.com/kokkos/kokkos/issues/144)
+- New View: Compiletime/size Test [\#137](https://github.com/kokkos/kokkos/issues/137)
+- New View : Performance Test [\#136](https://github.com/kokkos/kokkos/issues/136)
+- Signed/unsigned  comparison warning in CUDA parallel [\#130](https://github.com/kokkos/kokkos/issues/130)
+- Kokkos::complex: Need op\* w/ std::complex & real [\#126](https://github.com/kokkos/kokkos/issues/126)
+- Use uintptr\_t for casting pointers [\#110](https://github.com/kokkos/kokkos/issues/110)
+- Default thread mapping behavior between P and Q threads. [\#91](https://github.com/kokkos/kokkos/issues/91)
+- Windows: Atomic\_Fetch\_Exchange\(\) return type [\#90](https://github.com/kokkos/kokkos/issues/90)
+- Synchronic unit test is way too long [\#84](https://github.com/kokkos/kokkos/issues/84)
+- nvcc\_wrapper -\> $\(NVCC\_WRAPPER\) [\#42](https://github.com/kokkos/kokkos/issues/42)
+- Check compiler version and print helpful message [\#39](https://github.com/kokkos/kokkos/issues/39)
+- Kokkos shared memory on Cuda uses a lot of registers [\#31](https://github.com/kokkos/kokkos/issues/31)
+- Can not pass unit test `cuda.space` without a GT 720 [\#25](https://github.com/kokkos/kokkos/issues/25)
+- Makefile.kokkos lacks bounds checking option that CMake has [\#24](https://github.com/kokkos/kokkos/issues/24)
+- Kokkos can not complete unit tests with CUDA UVM enabled [\#23](https://github.com/kokkos/kokkos/issues/23)
+- Simplify teams + shared memory histogram example to remove vectorization [\#21](https://github.com/kokkos/kokkos/issues/21)
+- Kokkos needs to rever to ${PROJECT\_NAME}\_ENABLE\_CXX11 not Trilinos\_ENABLE\_CXX11 [\#17](https://github.com/kokkos/kokkos/issues/17)
+- Kokkos Base Makefile adds AVX to KNC Build [\#16](https://github.com/kokkos/kokkos/issues/16)
+- MS Visual Studio 2013 Build Errors [\#9](https://github.com/kokkos/kokkos/issues/9)
+- subview\(X, ALL\(\), j\) for 2-D LayoutRight View X: should it view a column? [\#5](https://github.com/kokkos/kokkos/issues/5)
+
+## [End_C++98](https://github.com/kokkos/kokkos/tree/End_C++98) (2015-04-15)
+
+
+\* *This Change Log was automatically generated by [github_changelog_generator](https://github.com/skywinder/Github-Changelog-Generator)*
diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cd1f4ea981339a5ff01095be9bca144cdb5d9064
--- /dev/null
+++ b/packages/kokkos/CMakeLists.txt
@@ -0,0 +1,137 @@
+# Is this a build as part of Trilinos?
+
+IF(COMMAND TRIBITS_PACKAGE_DECL)
+  SET(KOKKOS_HAS_TRILINOS ON CACHE BOOL "")
+ELSE()
+  SET(KOKKOS_HAS_TRILINOS OFF CACHE BOOL "")
+ENDIF()
+
+IF(NOT KOKKOS_HAS_TRILINOS)
+  cmake_minimum_required(VERSION 3.3 FATAL_ERROR)
+
+  # Define Project Name if this is a standalone build
+  IF(NOT DEFINED ${PROJECT_NAME})
+    project(Kokkos CXX) 
+  ENDIF()
+
+  # Basic initialization (Used in KOKKOS_SETTINGS)
+  set(KOKKOS_SRC_PATH ${Kokkos_SOURCE_DIR})
+  set(KOKKOS_PATH ${KOKKOS_SRC_PATH})
+
+  #------------ COMPILER AND FEATURE CHECKS ------------------------------------
+  include(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake)
+  set_kokkos_cxx_compiler()
+  set_kokkos_cxx_standard()
+  
+  #------------ GET OPTIONS AND KOKKOS_SETTINGS --------------------------------
+  # Add Kokkos' modules to CMake's module path.
+  set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/")
+
+  set(KOKKOS_CMAKE_VERBOSE True)
+  include(${KOKKOS_SRC_PATH}/cmake/kokkos_options.cmake)
+
+  include(${KOKKOS_SRC_PATH}/cmake/kokkos_settings.cmake)
+
+  #------------ GENERATE HEADER AND SOURCE FILES -------------------------------
+  execute_process(
+    COMMAND ${KOKKOS_SETTINGS} make -f ${KOKKOS_SRC_PATH}/cmake/Makefile.generate_cmake_settings CXX=${CMAKE_CXX_COMPILER} generate_build_settings
+    WORKING_DIRECTORY "${Kokkos_BINARY_DIR}"
+    OUTPUT_FILE ${Kokkos_BINARY_DIR}/core_src_make.out
+    RESULT_VARIABLE GEN_SETTINGS_RESULT
+  )
+  if (GEN_SETTINGS_RESULT)
+    message(FATAL_ERROR "Kokkos settings generation failed:\n"
+        "${KOKKOS_SETTINGS} make -f ${KOKKOS_SRC_PATH}/cmake/Makefile.generate_cmake_settings CXX=${CMAKE_CXX_COMPILER} generate_build_settings")
+  endif()
+  include(${Kokkos_BINARY_DIR}/kokkos_generated_settings.cmake)
+  string(REPLACE " " ";" KOKKOS_TPL_INCLUDE_DIRS "${KOKKOS_GMAKE_TPL_INCLUDE_DIRS}")
+  string(REPLACE " " ";" KOKKOS_TPL_LIBRARY_DIRS "${KOKKOS_GMAKE_TPL_LIBRARY_DIRS}")
+  string(REPLACE " " ";" KOKKOS_TPL_LIBRARY_NAMES "${KOKKOS_GMAKE_TPL_LIBRARY_NAMES}")
+  list(REMOVE_ITEM KOKKOS_TPL_INCLUDE_DIRS "")
+  list(REMOVE_ITEM KOKKOS_TPL_LIBRARY_DIRS "")
+  list(REMOVE_ITEM KOKKOS_TPL_LIBRARY_NAMES "")
+  set_kokkos_srcs(KOKKOS_SRC ${KOKKOS_SRC})
+
+  #------------ NOW BUILD ------------------------------------------------------
+  include(${KOKKOS_SRC_PATH}/cmake/kokkos_build.cmake)
+
+  #------------ Add in fake TriBITS handling to allow unit test builds ---------
+
+  include(${KOKKOS_SRC_PATH}/cmake/tribits.cmake)
+
+  TRIBITS_PACKAGE_DECL(Kokkos)
+
+  ADD_SUBDIRECTORY(core)
+  ADD_SUBDIRECTORY(containers)
+  ADD_SUBDIRECTORY(algorithms)
+
+ELSE()
+#------------------------------------------------------------------------------
+#
+# A) Forward declare the package so that certain options are also defined for
+# subpackages
+#
+
+TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS)
+
+
+#------------------------------------------------------------------------------
+#
+# B) Install Kokkos' build files
+#
+# If using the Makefile-generated files, then we need to set things up here.
+# Assume that TriBITS has been run from ProjectCompilerPostConfig.cmake and
+# has already generated KokkosCore_config.h and kokkos_generated_settings.cmake
+# in the previously defined Kokkos_GEN_DIR.
+# We need to copy them over to the correct place and source the cmake file
+
+if(NOT KOKKOS_LEGACY_TRIBITS)
+  set(Kokkos_GEN_DIR ${CMAKE_BINARY_DIR})
+  file(COPY "${Kokkos_GEN_DIR}/KokkosCore_config.h"
+    DESTINATION "${CMAKE_CURRENT_BINARY_DIR}" USE_SOURCE_PERMISSIONS)
+  install(FILES "${Kokkos_GEN_DIR}/KokkosCore_config.h"
+    DESTINATION include)
+  file(COPY "${Kokkos_GEN_DIR}/kokkos_generated_settings.cmake"
+    DESTINATION "${CMAKE_CURRENT_BINARY_DIR}" USE_SOURCE_PERMISSIONS)
+
+  include(${CMAKE_CURRENT_BINARY_DIR}/kokkos_generated_settings.cmake)
+  # Sources come from makefile-generated kokkos_generated_settings.cmake file
+  # Enable using the individual sources if needed
+  set_kokkos_srcs(KOKKOS_SRC ${KOKKOS_SRC})
+endif ()
+
+
+#------------------------------------------------------------------------------
+#
+# C) Install Kokkos' executable scripts
+#
+
+# nvcc_wrapper is Kokkos' wrapper for NVIDIA's NVCC CUDA compiler.
+# Kokkos needs nvcc_wrapper in order to build.  Other libraries and
+# executables also need nvcc_wrapper.  Thus, we need to install it.
+# If the argument of DESTINATION is a relative path, CMake computes it
+# as relative to ${CMAKE_INSTALL_PREFIX}.
+
+INSTALL(PROGRAMS ${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper DESTINATION bin)
+
+
+#------------------------------------------------------------------------------
+#
+# D) Process the subpackages for Kokkos
+#
+
+TRIBITS_PROCESS_SUBPACKAGES()
+
+
+#------------------------------------------------------------------------------
+#
+# E) If Kokkos itself is enabled, process the Kokkos package
+#
+
+TRIBITS_PACKAGE_DEF()
+
+TRIBITS_EXCLUDE_AUTOTOOLS_FILES()
+
+TRIBITS_PACKAGE_POSTPROCESS()
+
+ENDIF()
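In the standalone (non-Trilinos) branch above, the Makefile-based settings generation runs at configure time via execute_process, so a plain CMake invocation is enough. A minimal sketch, assuming an out-of-source build directory; Kokkos-specific options are defined in cmake/kokkos_options.cmake and are omitted here:

	mkdir build && cd build
	cmake -DCMAKE_CXX_COMPILER=g++ /path/to/kokkos   # KOKKOS_HAS_TRILINOS is detected OFF
	make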
diff --git a/packages/kokkos/Copyright.txt b/packages/kokkos/Copyright.txt
new file mode 100644
index 0000000000000000000000000000000000000000..50b76995af47381395ba4b9d0ad72ac7f57e4655
--- /dev/null
+++ b/packages/kokkos/Copyright.txt
@@ -0,0 +1,40 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
diff --git a/packages/kokkos/HOW_TO_SNAPSHOT b/packages/kokkos/HOW_TO_SNAPSHOT
new file mode 100644
index 0000000000000000000000000000000000000000..ad3f78efb4f8dd8399e3fb2889def7e841b531f9
--- /dev/null
+++ b/packages/kokkos/HOW_TO_SNAPSHOT
@@ -0,0 +1,73 @@
+
+Developers of Kokkos (those who commit modifications to Kokkos)
+must maintain the snapshot of Kokkos in the Trilinos repository.
+
+This file contains instructions for how to
+snapshot Kokkos from github.com/kokkos to Trilinos.
+
+------------------------------------------------------------------------
+*** EVERYTHING GOES RIGHT WORKFLOW ***
+
+1) Given a 'git clone' of both the Kokkos and Trilinos repositories.
+1.1) Let ${KOKKOS} be the absolute path to the Kokkos clone.
+     This path *must* terminate with the directory name 'kokkos';
+     e.g., ${HOME}/kokkos .
+1.2) Let ${TRILINOS} be the absolute path to the Trilinos directory.
+
+2) Given that the Kokkos build & test is clean and
+   changes are committed to the Kokkos clone.
+
+3) Snapshot the current commit in the Kokkos clone into the Trilinos clone.
+   This overwrites ${TRILINOS}/packages/kokkos with the content of ${KOKKOS}:
+	${KOKKOS}/scripts/snapshot.py --verbose ${KOKKOS} ${TRILINOS}/packages
+
+4) Verify the snapshot commit happened as expected
+	cd ${TRILINOS}/packages/kokkos
+	git log -1 --name-only
+
+5) Modify, build, and test Trilinos with the Kokkos snapshot.
+
+6) Given that the Trilinos build & test is clean and
+   changes are committed to the Trilinos clone.
+
+7) Attempt to push to the Kokkos repository.
+   If push fails then you must 'remove the Kokkos snapshot'
+   from your Trilinos clone.
+   See below.
+
+8) Attempt to push to the Trilinos repository.
+   If updating for a failed push requires you to change Kokkos you must
+   'remove the Kokkos snapshot' from your Trilinos clone.
+   See below.
+
+------------------------------------------------------------------------
+*** WHEN SOMETHING GOES WRONG AND YOU MUST              ***
+*** REMOVE THE KOKKOS SNAPSHOT FROM YOUR TRILINOS CLONE ***
+
+1) Query the Trilinos clone commit log.
+	git log --oneline
+
+2) Note the <SHA1> of the commit to the Trilinos clone
+   immediately BEFORE the Kokkos snapshot commit.
+   Copy this <SHA1> for use in the next command.
+
+3) IF there is more than one outstanding commit, then you can remove just the
+   Kokkos snapshot commit with 'git rebase -i'.  Edit the rebase file.
+   Remove or comment out the Kokkos snapshot commit entry.
+	git rebase -i <SHA1>
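+   For example (hashes and commit messages below are illustrative only), the
+   rebase todo file might look like:
+	pick 1a2b3c4 Fix some other Trilinos issue        <- keep
+	pick 9f8e7d6 Snapshot of kokkos.git from ...      <- remove or comment out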
+
+4) IF the Kokkos snapshot commit is the one and only
+   outstanding commit, then remove just that commit.
+	git reset --hard HEAD~1
+
+------------------------------------------------------------------------
+*** REGARDING 'snapshot.py' TOOL ***
+
+The 'snapshot.py' tool is developed and maintained by the
+Center for Computing Research (CCR)
+Software Engineering, Maintenance, and Support (SEMS) team.
+
+Contact Brent Perschbacher <bmpersc@sandia.gov> for questions.
+
+------------------------------------------------------------------------
+
diff --git a/packages/kokkos/LICENSE b/packages/kokkos/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..c68a8a2a9f98983e6bb7e02aaae0ce705a241ffc
--- /dev/null
+++ b/packages/kokkos/LICENSE
@@ -0,0 +1,42 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Kokkos is licensed under 3-clause BSD terms of use:
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos
new file mode 100644
index 0000000000000000000000000000000000000000..9eff81784a21b8be14ee8b5e14e13cae0a6cb406
--- /dev/null
+++ b/packages/kokkos/Makefile.kokkos
@@ -0,0 +1,1045 @@
+# Default settings: common options.
+
+# Options: Cuda,ROCm,OpenMP,Pthread,Qthreads,Serial
+#KOKKOS_DEVICES ?= "OpenMP"
+KOKKOS_DEVICES ?= "Pthread"
+# Options: 
+# Intel:    KNC,KNL,SNB,HSW,BDW,SKX
+# NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72
+# ARM:      ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2
+# IBM:      BGQ,Power7,Power8,Power9
+# AMD-GPUS: Kaveri,Carrizo,Fiji,Vega
+# AMD-CPUS: AMDAVX,Ryzen,Epyc
+KOKKOS_ARCH ?= ""
+# Options: yes,no
+KOKKOS_DEBUG ?= "no"
+# Options: hwloc,librt,experimental_memkind
+KOKKOS_USE_TPLS ?= ""
+# Options: c++11,c++1z
+KOKKOS_CXX_STANDARD ?= "c++11"
+# Options: aggressive_vectorization,disable_profiling,disable_deprecated_code
+KOKKOS_OPTIONS ?= ""
+
+# Default settings: device-specific options.
+# Options: force_uvm,use_ldg,rdc,enable_lambda
+KOKKOS_CUDA_OPTIONS ?= ""
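+#
+# These variables are typically overridden on the make command line of a
+# project that includes this file; an illustrative invocation (values are
+# examples only) could be:
+#   make KOKKOS_DEVICES=OpenMP KOKKOS_ARCH=HSW KOKKOS_DEBUG=no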
+
+# Return a 1 if a string contains a substring and 0 if not
+# Note: the search string should be given without quotes ('"').
+# Example: $(call kokkos_has_string,"hwloc,librt",hwloc)
+#   Will return a 1
+kokkos_has_string=$(if $(findstring $2,$1),1,0)
+
+# Check for general settings.
+KOKKOS_INTERNAL_ENABLE_DEBUG := $(call kokkos_has_string,$(KOKKOS_DEBUG),yes)
+KOKKOS_INTERNAL_ENABLE_CXX11 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++11)
+KOKKOS_INTERNAL_ENABLE_CXX1Z := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++1z)
+
+# Check for external libraries.
+KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc)
+KOKKOS_INTERNAL_USE_LIBRT := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),librt)
+KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),experimental_memkind)
+
+# Check for advanced settings.
+KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings)
+KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization)
+KOKKOS_INTERNAL_DISABLE_PROFILING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_profiling)
+KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code)
+KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check)
+KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_profile_load_print)
+KOKKOS_INTERNAL_CUDA_USE_LDG := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),use_ldg)
+KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),force_uvm)
+KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc)
+KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda)
+
+
+# Check for Kokkos host execution spaces, at least one of which must be enabled.
+KOKKOS_INTERNAL_USE_OPENMP := $(call kokkos_has_string,$(subst OpenMPTarget,,$(KOKKOS_DEVICES)),OpenMP)
+KOKKOS_INTERNAL_USE_PTHREADS := $(call kokkos_has_string,$(KOKKOS_DEVICES),Pthread)
+KOKKOS_INTERNAL_USE_QTHREADS := $(call kokkos_has_string,$(KOKKOS_DEVICES),Qthreads)
+KOKKOS_INTERNAL_USE_SERIAL := $(call kokkos_has_string,$(KOKKOS_DEVICES),Serial)
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0)
+  ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
+    ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 0)
+      KOKKOS_INTERNAL_USE_SERIAL := 1
+    endif
+  endif
+endif
+
+# Check for other Execution Spaces.
+KOKKOS_INTERNAL_USE_CUDA := $(call kokkos_has_string,$(KOKKOS_DEVICES),Cuda)
+KOKKOS_INTERNAL_USE_ROCM := $(call kokkos_has_string,$(KOKKOS_DEVICES),ROCm)
+KOKKOS_INTERNAL_USE_OPENMPTARGET := $(call kokkos_has_string,$(KOKKOS_DEVICES),OpenMPTarget)
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+  KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc)
+  CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
+  KOKKOS_INTERNAL_COMPILER_NVCC_VERSION := $(shell nvcc --version 2>&1 | grep release | cut -d' ' -f5 | cut -d',' -f1 | tr -d .)
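+  # E.g. an nvcc banner line such as "Cuda compilation tools, release 9.2, V9.2.148"
+  # makes the version variable above 92 (illustrative value only).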
+endif
+
+# Check OS.
+KOKKOS_OS                      := $(strip $(shell uname -s))
+KOKKOS_INTERNAL_OS_CYGWIN      := $(call kokkos_has_string,$(KOKKOS_OS),CYGWIN)
+KOKKOS_INTERNAL_OS_LINUX       := $(call kokkos_has_string,$(KOKKOS_OS),Linux)
+KOKKOS_INTERNAL_OS_DARWIN      := $(call kokkos_has_string,$(KOKKOS_OS),Darwin)
+
+# Check compiler.
+KOKKOS_CXX_VERSION                   := $(strip $(shell $(CXX) --version       2>&1))
+KOKKOS_INTERNAL_COMPILER_INTEL       := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Intel Corporation)
+KOKKOS_INTERNAL_COMPILER_PGI         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),PGI)
+KOKKOS_INTERNAL_COMPILER_XL          := $(strip $(shell $(CXX) -qversion       2>&1 | grep XL                  | wc -l))
+KOKKOS_INTERNAL_COMPILER_CRAY        := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-"               | wc -l))
+KOKKOS_INTERNAL_COMPILER_NVCC        := $(strip $(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep nvcc | wc -l))
+KOKKOS_INTERNAL_COMPILER_CLANG       := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang)
+KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),apple-darwin)
+KOKKOS_INTERNAL_COMPILER_HCC         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
+
+ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
+  KOKKOS_INTERNAL_COMPILER_CLANG = 1
+endif
+ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 2)
+  KOKKOS_INTERNAL_COMPILER_XL = 1
+endif
+
+# Apple Clang passes both clang and apple clang tests, so turn off clang.
+ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
+  KOKKOS_INTERNAL_COMPILER_CLANG = 0
+endif
+# AMD HCC passes both the clang and hcc tests, so turn off clang.
+ifeq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1)
+  KOKKOS_INTERNAL_COMPILER_CLANG = 0
+endif
+
+ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+  KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell clang --version | grep version | cut -d ' ' -f3 | tr -d '.')
+
+  ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+    ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_CLANG_VERSION) -lt 400; echo $$?),0)
+      $(error Compiling Cuda code directly with Clang requires version 4.0.0 or higher)
+    endif
+
+    KOKKOS_INTERNAL_CUDA_USE_LAMBDA := 1
+  endif
+endif
+
+# Set compiler warnings flags.
+ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+    # TODO: check if PGI accepts GNU-style warnings.
+    KOKKOS_INTERNAL_COMPILER_WARNINGS =
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+      KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
+        KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
+      else
+        ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
+          KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
+        else
+          ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+            # TODO: check if Cray accepts GNU-style warnings.
+            KOKKOS_INTERNAL_COMPILER_WARNINGS =
+          else
+            # GCC
+            KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
+          endif
+        endif
+      endif
+    endif
+  endif
+else
+  KOKKOS_INTERNAL_COMPILER_WARNINGS =
+endif
+
+# Set OpenMP flags.
+ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+  KOKKOS_INTERNAL_OPENMP_FLAG := -mp
+else
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+    KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
+      KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
+        KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp
+      else
+        ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+          # OpenMP is turned on by default in Cray compiler environment.
+          KOKKOS_INTERNAL_OPENMP_FLAG :=
+        else
+          KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+        endif
+      endif
+    endif
+  endif
+endif
+ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
+  KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_IBM_XL_OMP45_WORKAROUND -qsmp=omp -qoffload -qnoeh
+else
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+    KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_BUG_WORKAROUND_IBM_CLANG_OMP45_VIEW_INIT -fopenmp-implicit-declare-target -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp -fopenmp=libomp
+  endif
+endif
+
+# Set C++11 flags.
+ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+  KOKKOS_INTERNAL_CXX11_FLAG := --c++11
+else
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
+     KOKKOS_INTERNAL_CXX11_FLAG := -std=c++11
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+      KOKKOS_INTERNAL_CXX11_FLAG := -hstd=c++11
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1)
+        KOKKOS_INTERNAL_CXX11_FLAG := 
+      else
+        KOKKOS_INTERNAL_CXX11_FLAG := --std=c++11
+        KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z
+      endif
+    endif
+  endif
+endif
+
+# Check for Kokkos Architecture settings.
+
+# Intel based.
+KOKKOS_INTERNAL_USE_ARCH_KNC := $(call kokkos_has_string,$(KOKKOS_ARCH),KNC)
+KOKKOS_INTERNAL_USE_ARCH_WSM := $(call kokkos_has_string,$(KOKKOS_ARCH),WSM)
+KOKKOS_INTERNAL_USE_ARCH_SNB := $(call kokkos_has_string,$(KOKKOS_ARCH),SNB)
+KOKKOS_INTERNAL_USE_ARCH_HSW := $(call kokkos_has_string,$(KOKKOS_ARCH),HSW)
+KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW)
+KOKKOS_INTERNAL_USE_ARCH_SKX := $(call kokkos_has_string,$(KOKKOS_ARCH),SKX)
+KOKKOS_INTERNAL_USE_ARCH_KNL := $(call kokkos_has_string,$(KOKKOS_ARCH),KNL)
+
+# NVIDIA based.
+NVCC_WRAPPER := $(KOKKOS_PATH)/bin/nvcc_wrapper
+KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler30)
+KOKKOS_INTERNAL_USE_ARCH_KEPLER32 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler32)
+KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler35)
+KOKKOS_INTERNAL_USE_ARCH_KEPLER37 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler37)
+KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell50)
+KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell52)
+KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell53)
+KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(call kokkos_has_string,$(KOKKOS_ARCH),Pascal61)
+KOKKOS_INTERNAL_USE_ARCH_PASCAL60 := $(call kokkos_has_string,$(KOKKOS_ARCH),Pascal60)
+KOKKOS_INTERNAL_USE_ARCH_VOLTA70 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta70)
+KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta72)
+KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61)  \
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60)  \
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70)   \
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72)   \
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
+
+#SEK: This seems like a bug to me
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
+  KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell)
+  KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler)
+  KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
+                                                + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
+                                                + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
+                                                + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
+                                                + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61)  \
+                                                + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60)  \
+                                                + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70)   \
+                                                + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72)   \
+                                                + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+                                                + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+                                                + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1)
+  ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+      KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc)
+      CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
+      KOKKOS_INTERNAL_OPENMPTARGET_FLAG := $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG) --cuda-path=$(CUDA_PATH)
+    endif
+  endif
+endif
+# ARM based.
+KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv80)
+KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv81)
+KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-ThunderX)
+KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-TX2)
+KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2) | bc))
+
+# IBM based.
+KOKKOS_INTERNAL_USE_ARCH_BGQ := $(call kokkos_has_string,$(KOKKOS_ARCH),BGQ)
+KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power7)
+KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power8)
+KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power9)
+KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc))
+
+# AMD based.
+KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX)
+KOKKOS_INTERNAL_USE_ARCH_RYZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Ryzen)
+KOKKOS_INTERNAL_USE_ARCH_EPYC := $(call kokkos_has_string,$(KOKKOS_ARCH),Epyc)
+KOKKOS_INTERNAL_USE_ARCH_KAVERI := $(call kokkos_has_string,$(KOKKOS_ARCH),Kaveri)
+KOKKOS_INTERNAL_USE_ARCH_CARRIZO := $(call kokkos_has_string,$(KOKKOS_ARCH),Carrizo)
+KOKKOS_INTERNAL_USE_ARCH_FIJI := $(call kokkos_has_string,$(KOKKOS_ARCH),Fiji)
+KOKKOS_INTERNAL_USE_ARCH_VEGA := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega)
+KOKKOS_INTERNAL_USE_ARCH_GFX901 := $(call kokkos_has_string,$(KOKKOS_ARCH),gfx901)
+
+# Any AVX?
+KOKKOS_INTERNAL_USE_ARCH_SSE42      := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM))
+KOKKOS_INTERNAL_USE_ARCH_AVX        := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX))
+KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW))
+KOKKOS_INTERNAL_USE_ARCH_AVX512MIC  := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL))
+KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX))
+
+# Decide what ISA level we are able to support.
+KOKKOS_INTERNAL_USE_ISA_X86_64    := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX))
+KOKKOS_INTERNAL_USE_ISA_KNC       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC))
+KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9))
+KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7))
+
+# Decide whether we can support transactional memory
+KOKKOS_INTERNAL_USE_TM            := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_SKX))
+
+# Incompatible flags?
+KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
+KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
+  $(error Defined Multiple Host architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIGPU), 1)
+  $(error Defined Multiple GPU architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
+endif
+
+# Generating the list of Flags.
+
+KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
+KOKKOS_TPL_INCLUDE_DIRS =
+KOKKOS_TPL_LIBRARY_DIRS =
+KOKKOS_TPL_LIBRARY_NAMES =
+
+KOKKOS_CXXFLAGS =
+ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_WARNINGS)
+endif
+
+KOKKOS_LIBS = -ldl
+KOKKOS_TPL_LIBRARY_NAMES += dl
+KOKKOS_LDFLAGS = -L$(shell pwd)
+KOKKOS_LINK_FLAGS = 
+KOKKOS_SRC =
+KOKKOS_HEADERS =
+
+# Generating the KokkosCore_config.h file.
+
+KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp
+KOKKOS_CONFIG_HEADER=KokkosCore_config.h
+# Functions for generating config header file
+kokkos_append_header = $(shell echo $1 >> $(KOKKOS_INTERNAL_CONFIG_TMP))
+
+# Do not append the first line (use '>' so the file is created/truncated).
+tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp)
+tmp := $(call kokkos_append_header,"Makefile constructed configuration:")
+tmp := $(call kokkos_append_header,"$(shell date)")
+tmp := $(call kokkos_append_header,"----------------------------------------------*/")
+
+tmp := $(call kokkos_append_header,'\#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)')
+tmp := $(call kokkos_append_header,'\#error "Do not include $(KOKKOS_CONFIG_HEADER) directly; include Kokkos_Macros.hpp instead."')
+tmp := $(call kokkos_append_header,'\#else')
+tmp := $(call kokkos_append_header,'\#define KOKKOS_CORE_CONFIG_H')
+tmp := $(call kokkos_append_header,'\#endif')
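+# For reference, the escaped lines above expand in the generated
+# $(KOKKOS_CONFIG_HEADER) to roughly the following (illustrative sketch):
+#   #if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#   #error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#   #else
+#   #define KOKKOS_CORE_CONFIG_H
+#   #endif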
+
+tmp := $(call kokkos_append_header,"/* Execution Spaces */")
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_CUDA")
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+  tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_ROCM')
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+  tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_OPENMPTARGET')
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+  tmp := $(call kokkos_append_header,'\#define KOKKOS_HAVE_OPENMP')
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_PTHREAD")
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_QTHREADS")
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_SERIAL")
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_TM), 1)
+  tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_TM")
+  tmp := $(call kokkos_append_header,"\#endif")
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1)
+  tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_X86_64")
+  tmp := $(call kokkos_append_header,"\#endif")
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ISA_KNC), 1)
+  tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_KNC")
+  tmp := $(call kokkos_append_header,"\#endif")
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1)
+  tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_POWERPCLE")
+  tmp := $(call kokkos_append_header,"\#endif")
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCBE), 1)
+  tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_POWERPCBE")
+  tmp := $(call kokkos_append_header,"\#endif")
+endif
+
+tmp := $(call kokkos_append_header,"/* General Settings */")
+ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_CXX11")
+endif
+
+ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Z), 1)
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_CXX11")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_CXX1Z")
+endif
+
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+    KOKKOS_CXXFLAGS += -lineinfo
+  endif
+
+  KOKKOS_CXXFLAGS += -g
+  KOKKOS_LDFLAGS += -g -ldl
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_DEBUG")
+  ifeq ($(KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK), 0)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK")
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_PROFILING_LOAD_PRINT")
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
+  ifneq ($(HWLOC_PATH),)
+    KOKKOS_CPPFLAGS += -I$(HWLOC_PATH)/include
+    KOKKOS_LDFLAGS += -L$(HWLOC_PATH)/lib
+    KOKKOS_TPL_INCLUDE_DIRS += $(HWLOC_PATH)/include
+    KOKKOS_TPL_LIBRARY_DIRS += $(HWLOC_PATH)/lib
+  endif
+  KOKKOS_LIBS += -lhwloc
+  KOKKOS_TPL_LIBRARY_NAMES += hwloc
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_HWLOC")
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_LIBRT")
+  KOKKOS_LIBS += -lrt
+  KOKKOS_TPL_LIBRARY_NAMES += rt
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
+  ifneq ($(MEMKIND_PATH),)
+    KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
+    KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
+    KOKKOS_TPL_INCLUDE_DIRS += $(MEMKIND_PATH)/include
+    KOKKOS_TPL_LIBRARY_DIRS += $(MEMKIND_PATH)/lib
+  endif
+  KOKKOS_LIBS += -lmemkind -lnuma
+  KOKKOS_TPL_LIBRARY_NAMES += memkind numa
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_HBWSPACE")
+endif
+
+ifeq ($(KOKKOS_INTERNAL_DISABLE_PROFILING), 0)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_PROFILING")
+endif
+
+ifeq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 0)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEPRECATED_CODE")
+endif
+
+tmp := $(call kokkos_append_header,"/* Optimization Settings */")
+
+ifeq ($(KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION")
+endif
+
+tmp := $(call kokkos_append_header,"/* Cuda Settings */")
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_LDG_INTRINSIC")
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+      tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_LDG_INTRINSIC")
+    endif
+  endif
+
+  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_UVM")
+  endif
+
+  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE")
+    KOKKOS_CXXFLAGS += --relocatable-device-code=true
+    KOKKOS_LDFLAGS += --relocatable-device-code=true
+  endif
+
+  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1)
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+      ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -gt 70; echo $$?),0)
+        tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_LAMBDA")
+        KOKKOS_CXXFLAGS += -expt-extended-lambda
+      else
+        $(warning Warning: Cuda Lambda support was requested but NVCC version is too low. This requires NVCC for Cuda version 7.5 or higher. Disabling Lambda support now.)
+      endif
+    endif
+
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+      tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_LAMBDA")
+    endif
+  endif
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_CLANG_WORKAROUND")
+  endif
+endif
+
+# Add Architecture flags.
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV80")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+    KOKKOS_CXXFLAGS +=
+    KOKKOS_LDFLAGS +=
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+      KOKKOS_CXXFLAGS +=
+      KOKKOS_LDFLAGS +=
+    else
+      KOKKOS_CXXFLAGS += -march=armv8-a
+      KOKKOS_LDFLAGS += -march=armv8-a
+    endif
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV81")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+    KOKKOS_CXXFLAGS +=
+    KOKKOS_LDFLAGS +=
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+      KOKKOS_CXXFLAGS +=
+      KOKKOS_LDFLAGS +=
+    else
+      KOKKOS_CXXFLAGS += -march=armv8.1-a
+      KOKKOS_LDFLAGS += -march=armv8.1-a
+    endif
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV80")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV8_THUNDERX")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+    KOKKOS_CXXFLAGS +=
+    KOKKOS_LDFLAGS +=
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+      KOKKOS_CXXFLAGS +=
+      KOKKOS_LDFLAGS +=
+    else
+      KOKKOS_CXXFLAGS += -march=armv8-a -mtune=thunderx
+      KOKKOS_LDFLAGS += -march=armv8-a -mtune=thunderx
+    endif
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV81")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV8_THUNDERX2")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+    KOKKOS_CXXFLAGS +=
+    KOKKOS_LDFLAGS +=
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+      KOKKOS_CXXFLAGS +=
+      KOKKOS_LDFLAGS +=
+    else
+      KOKKOS_CXXFLAGS += -mtune=thunderx2t99 -mcpu=thunderx2t99
+      KOKKOS_LDFLAGS += -mtune=thunderx2t99 -mcpu=thunderx2t99
+    endif
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_SSE42")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -xSSE4.2
+    KOKKOS_LDFLAGS  += -xSSE4.2
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+        KOKKOS_CXXFLAGS += -tp=nehalem
+        KOKKOS_LDFLAGS  += -tp=nehalem
+      else
+        # Assume that this is really a GNU compiler.
+        KOKKOS_CXXFLAGS += -msse4.2
+        KOKKOS_LDFLAGS  += -msse4.2
+      endif
+    endif
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -mavx
+    KOKKOS_LDFLAGS  += -mavx
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+        KOKKOS_CXXFLAGS += -tp=sandybridge
+        KOKKOS_LDFLAGS  += -tp=sandybridge
+      else
+        # Assume that this is really a GNU compiler.
+        KOKKOS_CXXFLAGS += -mavx
+        KOKKOS_LDFLAGS  += -mavx
+      endif
+    endif
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER7), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_POWER7")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+
+  else
+    # Assume that this is really a GNU compiler, or it could be XL on P8.
+    KOKKOS_CXXFLAGS += -mcpu=power7 -mtune=power7
+    KOKKOS_LDFLAGS  += -mcpu=power7 -mtune=power7
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_POWER8")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) 
+        KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
+        KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+
+      else 
+        # Assume that this is really a GNU compiler on P8.
+        KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
+        KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
+      endif
+    endif
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_POWER9")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) 
+        KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
+        KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+
+      else 
+        # Assume that this is really a GNU compiler on P9.
+        KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
+        KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
+      endif
+    endif
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HSW), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX2")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -xCORE-AVX2
+    KOKKOS_LDFLAGS  += -xCORE-AVX2
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+        KOKKOS_CXXFLAGS += -tp=haswell
+        KOKKOS_LDFLAGS  += -tp=haswell
+      else
+        # Assume that this is really a GNU compiler.
+        KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
+        KOKKOS_LDFLAGS  += -march=core-avx2 -mtune=core-avx2
+      endif
+    endif
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_BDW), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX2")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -xCORE-AVX2
+    KOKKOS_LDFLAGS  += -xCORE-AVX2
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+        KOKKOS_CXXFLAGS += -tp=haswell
+        KOKKOS_LDFLAGS  += -tp=haswell
+      else
+        # Assume that this is really a GNU compiler.
+        KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2 -mrtm
+        KOKKOS_LDFLAGS  += -march=core-avx2 -mtune=core-avx2 -mrtm
+      endif
+    endif
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX512MIC")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -xMIC-AVX512
+    KOKKOS_LDFLAGS  += -xMIC-AVX512
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+
+      else
+        # Assume that this is really a GNU compiler.
+        KOKKOS_CXXFLAGS += -march=knl -mtune=knl
+        KOKKOS_LDFLAGS  += -march=knl -mtune=knl
+      endif
+    endif
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX512XEON")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -xCORE-AVX512
+    KOKKOS_LDFLAGS  += -xCORE-AVX512
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+
+      else
+        # Assume that this is really a GNU compiler.
+        KOKKOS_CXXFLAGS += -march=skylake-avx512 -mtune=skylake-avx512 -mrtm
+        KOKKOS_LDFLAGS  += -march=skylake-avx512 -mtune=skylake-avx512 -mrtm
+      endif
+    endif
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KNC")
+  KOKKOS_CXXFLAGS += -mmic
+  KOKKOS_LDFLAGS += -mmic
+endif
+
+# Figure out the architecture flag for Cuda.
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG=-arch
+  else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+		KOKKOS_INTERNAL_CUDA_ARCH_FLAG=--cuda-gpu-arch
+		KOKKOS_CXXFLAGS += -x cuda
+  else
+    $(error Makefile.kokkos: CUDA is enabled but the compiler is neither NVCC nor Clang)
+  endif
+
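+  # The arch flag assembled below ends up looking like, e.g., "-arch=sm_70"
+  # for NVCC or "--cuda-gpu-arch=sm_70" for Clang (illustrative values).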
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER30")
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER32")
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER35")
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER37")
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL50")
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL52")
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL53")
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL60")
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL61")
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VOLTA")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VOLTA70")
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_70
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VOLTA")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VOLTA72")
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72
+  endif
+
+  ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
+    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
+
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+      KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
+    endif
+  endif
+endif
+
+# Figure out the architecture flag for ROCm.
+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+  # Let's start by adding architecture defines.
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KAVERI), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ROCM 701")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KAVERI")
+    KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx701 
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_CARRIZO), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ROCM 801")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_CARRIZO")
+    KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx801 
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_FIJI), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ROCM 803")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_FIJI")
+    KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx803
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ROCM 900")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VEGA")
+    KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx900 
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_GFX901), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ROCM 901")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_GFX901")
+    KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx901 
+  endif
+ 
+  
+  KOKKOS_INTERNAL_HCC_PATH := $(shell which $(CXX))
+  ROCM_HCC_PATH ?= $(KOKKOS_INTERNAL_HCC_PATH:/bin/clang++=)
+
+  KOKKOS_CXXFLAGS += $(shell $(ROCM_HCC_PATH)/bin/hcc-config --cxxflags) 
+  KOKKOS_LDFLAGS += $(shell $(ROCM_HCC_PATH)/bin/hcc-config --ldflags) -lhc_am -lm 
+  KOKKOS_TPL_LIBRARY_NAMES += hc_am m
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_ROCM_ARCH_FLAG)
+
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/ROCm/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/ROCm/*.hpp)
+endif
+
+KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)
+
+ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
+  KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
+else
+  KOKKOS_INTERNAL_NEW_CONFIG := 1
+endif
+
+ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
+  tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h)
+endif
+
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp)
+
+KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.cpp)
+KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
+  ifneq ($(CUDA_PATH),)
+    KOKKOS_CPPFLAGS += -I$(CUDA_PATH)/include
+    KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64
+    KOKKOS_TPL_INCLUDE_DIRS += $(CUDA_PATH)/include
+    KOKKOS_TPL_LIBRARY_DIRS += $(CUDA_PATH)/lib64
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+      KOKKOS_CXXFLAGS += --cuda-path=$(CUDA_PATH)
+    endif
+  endif
+  KOKKOS_LIBS += -lcudart -lcuda
+  KOKKOS_TPL_LIBRARY_NAMES += cudart cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+  KOKKOS_SRC += $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMPTarget/*.hpp)
+  ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+    KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG)
+  else
+    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG)
+  endif
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG)
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+    KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG)
+  else
+    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
+  endif
+
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
+  KOKKOS_LINK_FLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
+  KOKKOS_LIBS += -lpthread
+  KOKKOS_TPL_LIBRARY_NAMES += pthread
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
+  ifneq ($(QTHREADS_PATH),)
+    KOKKOS_CPPFLAGS += -I$(QTHREADS_PATH)/include
+    KOKKOS_LDFLAGS += -L$(QTHREADS_PATH)/lib
+    KOKKOS_TPL_INCLUDE_DIRS += $(QTHREADS_PATH)/include
+    KOKKOS_TPL_LIBRARY_DIRS += $(QTHREADS_PATH)/lib64
+  endif
+  KOKKOS_LIBS += -lqthread
+  KOKKOS_TPL_LIBRARY_NAMES += qthread
+endif
+
+# Explicitly set the GCC Toolchain for Clang.
+ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+  KOKKOS_INTERNAL_GCC_PATH = $(shell which g++)
+  KOKKOS_INTERNAL_GCC_TOOLCHAIN = $(KOKKOS_INTERNAL_GCC_PATH:/bin/g++=)
+  KOKKOS_CXXFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN)
+  KOKKOS_LDFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN)
+endif
+
+# Don't include Kokkos_HBWSpace.cpp if not using MEMKIND to avoid a link warning.
+ifneq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
+  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp,$(KOKKOS_SRC))
+endif
+
+# Don't include Kokkos_Serial.cpp or Kokkos_Serial_Task.cpp if not using Serial
+# device to avoid a link warning.
+ifneq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp,$(KOKKOS_SRC))
+  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp,$(KOKKOS_SRC))
+endif
+
+# With Cygwin, functions such as fdopen and fileno are not defined
+# when strict ANSI is enabled, and strict ANSI gets enabled with --std=c++11.
+# So we hard-undefine it here; not sure if that has any bad side effects.
+# This is needed for gtest, actually, not for Kokkos itself!
+ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1)
+  KOKKOS_CXXFLAGS += -U__STRICT_ANSI__
+endif
+
+# Set KOKKOS_EXTRA_LIBS and add -lkokkos to the link line.
+KOKKOS_EXTRA_LIBS := ${KOKKOS_LIBS}
+KOKKOS_LIBS := -lkokkos ${KOKKOS_LIBS}
+
+# Setting up dependencies.
+
+KokkosCore_config.h:
+
+KOKKOS_CPP_DEPENDS := KokkosCore_config.h $(KOKKOS_HEADERS)
+
+KOKKOS_OBJ = $(KOKKOS_SRC:.cpp=.o)
+KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ))
+
+include $(KOKKOS_PATH)/Makefile.targets
+
+kokkos-clean:
+	rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a
+
+libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS)
+	ar cr libkokkos.a $(KOKKOS_OBJ_LINK)
+	ranlib libkokkos.a
+
+KOKKOS_LINK_DEPENDS=libkokkos.a
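+
+# A minimal sketch of how an application Makefile might consume this file.
+# The application name and source file below are hypothetical; the variables
+# are the ones defined above:
+#
+#   KOKKOS_PATH = /path/to/kokkos
+#   include $(KOKKOS_PATH)/Makefile.kokkos
+#
+#   myapp.exe: myapp.cpp $(KOKKOS_LINK_DEPENDS)
+#       $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) \
+#         $(KOKKOS_LDFLAGS) $(KOKKOS_LINK_FLAGS) myapp.cpp $(KOKKOS_LIBS) -o myapp.exe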
diff --git a/packages/kokkos/Makefile.targets b/packages/kokkos/Makefile.targets
new file mode 100644
index 0000000000000000000000000000000000000000..a63598577c720a7b522a5f10192aa413046564b9
--- /dev/null
+++ b/packages/kokkos/Makefile.targets
@@ -0,0 +1,87 @@
+Kokkos_UnorderedMap_impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
+Kokkos_Core.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
+Kokkos_CPUDiscovery.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_CPUDiscovery.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_CPUDiscovery.cpp
+Kokkos_Error.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Error.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Error.cpp
+Kokkos_ExecPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_ExecPolicy.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_ExecPolicy.cpp
+Kokkos_HostSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp
+Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp
+Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
+Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
+Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
+Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
+Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
+Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp
+Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
+Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
+Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+Kokkos_Cuda_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
+Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
+Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+Kokkos_ROCm_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Exec.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Exec.cpp
+Kokkos_ROCm_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Space.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Space.cpp
+Kokkos_ROCm_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Task.cpp
+Kokkos_ROCm_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Impl.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Impl.cpp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
+Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+Kokkos_QthreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_QthreadsExec.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_QthreadsExec.cpp
+Kokkos_Qthreads_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+Kokkos_OpenMP_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
+Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+Kokkos_OpenMPTarget_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
+Kokkos_OpenMPTargetSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
+#Kokkos_OpenMPTarget_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
+#       $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
+endif
+
+Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
diff --git a/packages/kokkos/README b/packages/kokkos/README
new file mode 100644
index 0000000000000000000000000000000000000000..31d134bf0add74cf9b36571a3649cf5724f48087
--- /dev/null
+++ b/packages/kokkos/README
@@ -0,0 +1,188 @@
+Kokkos Core implements a programming model in C++ for writing performance portable
+applications targeting all major HPC platforms. For that purpose it provides
+abstractions for both parallel execution of code and data management.
+Kokkos is designed to target complex node architectures with N-level memory
+hierarchies and multiple types of execution resources. It currently can use
+OpenMP, Pthreads and CUDA as backend programming models.
+
+Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem,
+which also provides math kernels (https://github.com/kokkos/kokkos-kernels), as well as 
+profiling and debugging tools (https://github.com/kokkos/kokkos-tools).  
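+
+As a minimal illustration of these two abstractions (a sketch only; the view
+name, size, and kernel body below are arbitrary), data is managed through
+Kokkos::View and parallel execution is expressed through patterns such as
+Kokkos::parallel_for:
+
+  #include <Kokkos_Core.hpp>
+
+  int main(int argc, char* argv[]) {
+    Kokkos::initialize(argc, argv);
+    {
+      // Data management: a 1D view of 100 doubles in the default memory space.
+      Kokkos::View<double*> x("x", 100);
+      // Parallel execution: fill the view using the default execution space.
+      Kokkos::parallel_for(100, KOKKOS_LAMBDA(const int i) { x(i) = i; });
+    }
+    Kokkos::finalize();
+  }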
+
+# Learning about Kokkos
+
+A programming guide can be found on the Wiki; the API reference is under development.
+
+For questions, find us on Slack (https://kokkosteam.slack.com) or open a GitHub issue.
+
+For non-public questions send an email to
+crtrott(at)sandia.gov
+
+A separate repository with extensive tutorial material can be found under 
+https://github.com/kokkos/kokkos-tutorials.
+
+Furthermore, the 'example/tutorial' directory provides step-by-step tutorial
+examples which explain many of the features of Kokkos. They work with
+simple Makefiles. To build with g++ and OpenMP, simply type 'make'
+in the 'example/tutorial' directory; this will build all examples in the
+subfolders. To change the build options, refer to the compilation section
+of the Programming Guide.
+
+To learn more about Kokkos consider watching one of our presentations:
+* GTC 2015:
+  - http://on-demand.gputechconf.com/gtc/2015/video/S5166.html
+  - http://on-demand.gputechconf.com/gtc/2015/presentation/S5166-H-Carter-Edwards.pdf
+
+
+# Contributing to Kokkos
+
+We welcome and encourage contributions from external developers.
+To contribute, please first open an issue describing the contribution and then issue
+a pull request against the develop branch. For larger features it is a good idea
+to get guidance from the core development team first through the GitHub issue.
+
+Note that Kokkos Core is licensed under standard 3-clause BSD terms of use,
+which means that contributing to Kokkos allows anyone else to use your contributions
+not just for public purposes but also for closed-source commercial projects.
+For specifics, see the LICENSE file contained in the repository or distribution.
+
+# Requirements
+
+### Primary tested compilers on X86 are:
+  * GCC 4.8.4
+  * GCC 4.9.3
+  * GCC 5.1.0
+  * GCC 5.3.0
+  * GCC 6.1.0
+  * Intel 15.0.2
+  * Intel 16.0.1
+  * Intel 17.1.043
+  * Intel 17.4.196
+  * Intel 18.0.128
+  * Clang 3.6.1
+  * Clang 3.7.1
+  * Clang 3.8.1
+  * Clang 3.9.0
+  * Clang 4.0.0
+  * Clang 4.0.0 for CUDA (CUDA Toolkit 8.0.44)
+  * Clang 6.0.0 for CUDA (CUDA Toolkit 9.1)
+  * PGI 17.10
+  * NVCC 7.0 for CUDA (with gcc 4.8.4)
+  * NVCC 7.5 for CUDA (with gcc 4.8.4)
+  * NVCC 8.0.44 for CUDA (with gcc 5.3.0)
+  * NVCC 9.1 for CUDA (with gcc 6.1.0)
+
+### Primary tested compilers on Power 8 are:
+  * GCC 5.4.0 (OpenMP,Serial)
+  * IBM XL 13.1.6 (OpenMP, Serial)
+  * NVCC 8.0.44 for CUDA (with gcc 5.4.0)
+  * NVCC 9.0.103 for CUDA (with gcc 6.3.0 and XL 13.1.6)
+
+### Primary tested compilers on Intel KNL are:
+  * GCC 6.2.0
+  * Intel 16.4.258 (with gcc 4.7.2)
+  * Intel 17.2.174 (with gcc 4.9.3)
+  * Intel 18.0.128 (with gcc 4.9.3)
+
+### Primary tested compilers on ARM are:
+  * GCC 6.1.0 
+  
+### Other compilers working:
+  * X86:
+   - Cygwin 2.1.0 64bit with gcc 4.9.3
+
+### Known non-working combinations:
+  * Power8:
+   - Pthreads backend
+  * ARM
+   - Pthreads backend
+
+
+The primary tested compilers are passing in release mode
+with warnings treated as errors. They are also tested with a comprehensive set of
+backend combinations (i.e. OpenMP, Pthreads, Serial, OpenMP+Serial, ...).
+We are using the following set of flags:
+GCC:   -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits
+       -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
+Intel: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
+Clang: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
+NVCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
+
+Other compilers are tested occasionally, in particular when pushing from develop to 
+master branch, without -Werror and only for a select set of backends.
+
+# Running Unit Tests
+
+To run the unit tests, create a build directory and run the following commands:
+
+KOKKOS_PATH/generate_makefile.bash
+make build-test
+make test
+
+Run KOKKOS_PATH/generate_makefile.bash --help for more detailed options such as
+changing the device type for which to build.
+
+# Installing the library
+
+To install Kokkos as a library, create a build directory and run the following:
+
+KOKKOS_PATH/generate_makefile.bash --prefix=INSTALL_PATH
+make kokkoslib
+make install
+
+Run KOKKOS_PATH/generate_makefile.bash --help for more detailed options such as
+changing the device type for which to build.
+
+Note that in many cases it is preferable to build Kokkos inline with an 
+application. The main reason is that you may otherwise need many different
+configurations of Kokkos installed depending on the required compile time
+features an application needs. For example there is only one default 
+execution space, which means you need different installations to have OpenMP
+or Pthreads as the default space. Also for the CUDA backend there are certain
+choices, such as allowing relocatable device code, which must be made at 
+installation time. Building Kokkos inline uses largely the same process
+as compiling an application against an installed Kokkos library. See for
+example benchmarks/bytes_and_flops/Makefile, which can be used both with an
+installed library and for an inline build.
+
+### CMake
+
+Kokkos supports being built as part of a CMake application. An example can
+be found in example/cmake_build. 
+
+# Kokkos and CUDA UVM
+
+Kokkos does support UVM as a specific memory space called CudaUVMSpace. 
+Allocations made with that space are accessible from host and device. 
+You can tell Kokkos to use that as the default space for Cuda allocations.
+In either case UVM comes with a number of restrictions:
+(i) You can't access allocations on the host while a kernel is potentially
+running; doing so will lead to segfaults. To avoid that, either call
+Kokkos::Cuda::fence() (or just Kokkos::fence()) after kernels, or set the
+environment variable CUDA_LAUNCH_BLOCKING=1.
+(ii) In multi-socket, multi-GPU machines without NVLINK, UVM defaults to
+using zero-copy allocations for technical reasons related to using multiple
+GPUs from the same process. If an executable does not use multiple GPUs from
+the same process (e.g. each MPI rank of an application uses a single GPU,
+which can be the same GPU for multiple MPI ranks), you can set
+CUDA_MANAGED_FORCE_DEVICE_ALLOC=1. This will enforce proper UVM allocations,
+but can lead to errors if more than a single GPU is used by a single process.
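+
+For illustration (a sketch only; the view name and size are arbitrary), a UVM
+allocation is obtained by selecting CudaUVMSpace for a View, and host access
+after a kernel is guarded by a fence:
+
+  // Accessible from both host and device.
+  Kokkos::View<double*, Kokkos::CudaUVMSpace> a("a", 1000);
+  Kokkos::parallel_for(1000, KOKKOS_LAMBDA(const int i) { a(i) = i; });
+  Kokkos::fence();        // make sure the kernel finished before host access
+  double first = a(0);    // safe host access after the fence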
+
+
+# Citing Kokkos
+
+If you publish work which mentions Kokkos, please cite the following paper:
+
+@article{CarterEdwards20143202,
+title = "Kokkos: Enabling manycore performance portability through polymorphic memory access patterns ",
+journal = "Journal of Parallel and Distributed Computing ",
+volume = "74",
+number = "12",
+pages = "3202 - 3216",
+year = "2014",
+note = "Domain-Specific Languages and High-Level Frameworks for High-Performance Computing ",
+issn = "0743-7315",
+doi = "https://doi.org/10.1016/j.jpdc.2014.07.003",
+url = "http://www.sciencedirect.com/science/article/pii/S0743731514001257",
+author = "H. Carter Edwards and Christian R. Trott and Daniel Sunderland"
+}
diff --git a/packages/kokkos/algorithms/CMakeLists.txt b/packages/kokkos/algorithms/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..507c9f2fdb70662c80f591d04737d45752be81a1
--- /dev/null
+++ b/packages/kokkos/algorithms/CMakeLists.txt
@@ -0,0 +1,12 @@
+
+
+TRIBITS_SUBPACKAGE(Algorithms)
+
+IF(KOKKOS_HAS_TRILINOS)
+  ADD_SUBDIRECTORY(src)
+ENDIF()
+
+TRIBITS_ADD_TEST_DIRECTORIES(unit_tests)
+#TRIBITS_ADD_TEST_DIRECTORIES(performance_tests)
+
+TRIBITS_SUBPACKAGE_POSTPROCESS()
diff --git a/packages/kokkos/algorithms/cmake/Dependencies.cmake b/packages/kokkos/algorithms/cmake/Dependencies.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..c36b62523fadb628e970b6eccf57a9caaa317f1e
--- /dev/null
+++ b/packages/kokkos/algorithms/cmake/Dependencies.cmake
@@ -0,0 +1,5 @@
+TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
+  LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
+  TEST_OPTIONAL_TPLS CUSPARSE
+  )
diff --git a/packages/kokkos/algorithms/cmake/KokkosAlgorithms_config.h.in b/packages/kokkos/algorithms/cmake/KokkosAlgorithms_config.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..67334b70f36b6db55b225f25c91d8a8c4cb3aaab
--- /dev/null
+++ b/packages/kokkos/algorithms/cmake/KokkosAlgorithms_config.h.in
@@ -0,0 +1,4 @@
+#ifndef KOKKOS_ALGORITHMS_CONFIG_H
+#define KOKKOS_ALGORITHMS_CONFIG_H
+
+#endif
diff --git a/packages/kokkos/algorithms/src/CMakeLists.txt b/packages/kokkos/algorithms/src/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dfbf3323c2d51953a12d8e82371d9f971aaa1e13
--- /dev/null
+++ b/packages/kokkos/algorithms/src/CMakeLists.txt
@@ -0,0 +1,21 @@
+
+TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+#-----------------------------------------------------------------------------
+
+FILE(GLOB HEADERS *.hpp)
+FILE(GLOB SOURCES *.cpp)
+LIST(APPEND HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
+
+#-----------------------------------------------------------------------------
+
+TRIBITS_ADD_LIBRARY(
+    kokkosalgorithms
+    HEADERS ${HEADERS}
+    SOURCES ${SOURCES}
+    DEPLIBS
+    )
+
diff --git a/packages/kokkos/algorithms/src/KokkosAlgorithms_dummy.cpp b/packages/kokkos/algorithms/src/KokkosAlgorithms_dummy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9c08a088b0e108f78728fcc00742baaab441d4e2
--- /dev/null
+++ b/packages/kokkos/algorithms/src/KokkosAlgorithms_dummy.cpp
@@ -0,0 +1 @@
+void KOKKOS_ALGORITHMS_SRC_DUMMY_PREVENT_LINK_ERROR() {}
diff --git a/packages/kokkos/algorithms/src/Kokkos_Random.hpp b/packages/kokkos/algorithms/src/Kokkos_Random.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1c659e44a45e5cd1d972f441f956ce4e47de2d2e
--- /dev/null
+++ b/packages/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -0,0 +1,2004 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_RANDOM_HPP
+#define KOKKOS_RANDOM_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Complex.hpp>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+
+/// \file Kokkos_Random.hpp
+/// \brief Pseudorandom number generators
+///
+/// These generators are based on Vigna, Sebastiano (2014). "An
+/// experimental exploration of Marsaglia's xorshift generators,
+/// scrambled."  See: http://arxiv.org/abs/1402.6246
+
+namespace Kokkos {
+
+  /*Template functions to get equidistributed random numbers from a generator for a specific Scalar type
+
+       template<class Generator,Scalar>
+       struct rand{
+
+         //Max value returned by draw(Generator& gen)
+         KOKKOS_INLINE_FUNCTION
+         static Scalar max();
+
+         //Returns a value between zero and max()
+         KOKKOS_INLINE_FUNCTION
+         static Scalar draw(Generator& gen);
+
+         //Returns a value between zero and range()
+         //Note: for floating point values range can be larger than max()
+         KOKKOS_INLINE_FUNCTION
+         static Scalar draw(Generator& gen, const Scalar& range){}
+
+         //Return value between start and end
+         KOKKOS_INLINE_FUNCTION
+         static Scalar draw(Generator& gen, const Scalar& start, const Scalar& end);
+      };
+
+    The random number generators themselves have two components: a state-pool and the actual generator.
+    A state-pool manages a number of generators, so that each active thread is able to grab its own.
+    This allows the generation of random numbers which are independent between threads. Note that
+    in contrast to CuRand none of the functions of the pool (or the generator) are collectives,
+    i.e. all functions can be called inside conditionals.
+
+    template<class Device>
+    class Pool {
+     public:
+      //The Kokkos device type
+      typedef Device device_type;
+      //The actual generator type
+      typedef Generator<Device> generator_type;
+
+      //Default constructor: does not initialize a pool
+      Pool();
+
+      //Initializing constructor: calls init(seed,Device_Specific_Number);
+      Pool(unsigned int seed);
+
+      //Initialize the Pool with seed as a starting seed and a pool_size of num_states.
+      //The Random_XorShift64 generator is used in serial to initialize all states,
+      //thus the initialization process is platform independent and deterministic.
+      void init(unsigned int seed, int num_states);
+
+      //Get a generator. This will lock one of the states, guaranteeing that each thread
+      //will have its private generator. Note: on Cuda getting a state involves atomics,
+      //and is thus not deterministic!
+      generator_type get_state();
+
+      //Give a state back to the pool. This unlocks the state, and writes the modified
+      //state of the generator back to the pool.
+      void free_state(generator_type gen);
+
+    }
+
+    template<class Device>
+    class Generator {
+     public:
+     //The Kokkos device type
+    typedef DeviceType device_type;
+
+    //Max return values of respective [X]rand[S]() functions
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
+
+
+    //Init with a state and the idx with respect to pool. Note: in serial the
+    //Generator can be used by just giving it the necessary state arguments
+    KOKKOS_INLINE_FUNCTION
+    Generator (STATE_ARGUMENTS, int state_idx = 0);
+
+    //Draw an equidistributed uint32_t in the range (0,MAX_URAND]
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand();
+
+    //Draw an equidistributed uint64_t in the range (0,MAX_URAND64]
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64();
+
+    //Draw an equidistributed uint32_t in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range);
+
+    //Draw an equidistributed uint32_t in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end );
+
+    //Draw an equidistributed uint64_t in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range);
+
+    //Draw an equidistributed uint64_t in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end );
+
+    //Draw an equidistributed int in the range (0,MAX_RAND]
+    KOKKOS_INLINE_FUNCTION
+    int rand();
+
+    //Draw an equidistributed int in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range);
+
+    //Draw an equidistributed int in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end );
+
+    //Draw an equidistributed int64_t in the range (0,MAX_RAND64]
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64();
+
+    //Draw an equidistributed int64_t in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range);
+
+    //Draw an equidistributed int64_t in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end );
+
+    //Draw an equidistributed float in the range (0,1.0]
+    KOKKOS_INLINE_FUNCTION
+    float frand();
+
+    //Draw an equidistributed float in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range);
+
+    //Draw an equidistributed float in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end );
+
+    //Draw an equidistributed double in the range (0,1.0]
+    KOKKOS_INLINE_FUNCTION
+    double drand();
+
+    //Draw an equidistributed double in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range);
+
+    //Draw an equidistributed double in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end );
+
+    //Draw a standard normally distributed double
+    KOKKOS_INLINE_FUNCTION
+    double normal();
+
+    //Draw a normally distributed double with given mean and standard deviation
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0);
+    }
+
+    //Additional Functions:
+
+    //Fills view with random numbers in the range (0,range]
+    template<class ViewType, class PoolType>
+    void fill_random(ViewType view, PoolType pool, ViewType::value_type range);
+
+    //Fills view with random numbers in the range (start,end]
+    template<class ViewType, class PoolType>
+    void fill_random(ViewType view, PoolType pool,
+                     ViewType::value_type start, ViewType::value_type end);
+
+*/
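+
+/* Illustrative usage (a sketch only, kept as a comment; the view name, size,
+   and seed below are arbitrary):
+
+     Kokkos::Random_XorShift64_Pool<> pool(12345);  // seeded pool of generators
+
+     // Fill a view with uniform random numbers in (0,range] via the pool.
+     Kokkos::View<double*> v("v", 1000);
+     Kokkos::fill_random(v, pool, 1.0);
+
+     // Or draw numbers manually inside a kernel.
+     Kokkos::parallel_for(1000, KOKKOS_LAMBDA(const int i) {
+       auto gen = pool.get_state();  // lock a per-thread generator state
+       v(i) = gen.drand();           // uniform double in (0,1.0]
+       pool.free_state(gen);         // write the state back and unlock it
+     });
+*/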
+
+  template<class Generator, class Scalar>
+  struct rand;
+
+
+  template<class Generator>
+  struct rand<Generator,char> {
+
+    KOKKOS_INLINE_FUNCTION
+    static short max(){return 127;}
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen)
+                          {return short((gen.rand()&0xff+256)%256);}
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen, const char& range)
+                          {return char(gen.rand(range));}
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen, const char& start, const char& end)
+                          {return char(gen.rand(start,end));}
+
+  };
+
+  template<class Generator>
+  struct rand<Generator,short> {
+    KOKKOS_INLINE_FUNCTION
+    static short max(){return 32767;}
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen)
+                          {return short((gen.rand()&0xffff+65536)%32768);}
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen, const short& range)
+                          {return short(gen.rand(range));}
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen, const short& start, const short& end)
+                          {return short(gen.rand(start,end));}
+
+  };
+
+  template<class Generator>
+  struct rand<Generator,int> {
+    KOKKOS_INLINE_FUNCTION
+    static int max(){return Generator::MAX_RAND;}
+    KOKKOS_INLINE_FUNCTION
+    static int draw(Generator& gen)
+                          {return gen.rand();}
+    KOKKOS_INLINE_FUNCTION
+    static int draw(Generator& gen, const int& range)
+                          {return gen.rand(range);}
+    KOKKOS_INLINE_FUNCTION
+    static int draw(Generator& gen, const int& start, const int& end)
+                          {return gen.rand(start,end);}
+
+  };
+
+  template<class Generator>
+  struct rand<Generator,unsigned int> {
+    KOKKOS_INLINE_FUNCTION
+    static unsigned int max () {
+      return Generator::MAX_URAND;
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned int draw (Generator& gen) {
+      return gen.urand ();
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned int draw(Generator& gen, const unsigned int& range) {
+      return gen.urand (range);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned int
+    draw (Generator& gen, const unsigned int& start, const unsigned int& end) {
+      return gen.urand (start, end);
+    }
+  };
+
+  template<class Generator>
+  struct rand<Generator,long> {
+    KOKKOS_INLINE_FUNCTION
+    static long max () {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (long) == 4 ?
+        static_cast<long> (Generator::MAX_RAND) :
+        static_cast<long> (Generator::MAX_RAND64);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long draw (Generator& gen) {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (long) == 4 ?
+        static_cast<long> (gen.rand ()) :
+        static_cast<long> (gen.rand64 ());
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long draw (Generator& gen, const long& range) {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (long) == 4 ?
+        static_cast<long> (gen.rand (static_cast<int> (range))) :
+        static_cast<long> (gen.rand64 (range));
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long draw (Generator& gen, const long& start, const long& end) {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (long) == 4 ?
+        static_cast<long> (gen.rand (static_cast<int> (start),
+                                     static_cast<int> (end))) :
+        static_cast<long> (gen.rand64 (start, end));
+    }
+  };
+
+  template<class Generator>
+  struct rand<Generator,unsigned long> {
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long max () {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (unsigned long) == 4 ?
+        static_cast<unsigned long> (Generator::MAX_URAND) :
+        static_cast<unsigned long> (Generator::MAX_URAND64);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long draw (Generator& gen) {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (unsigned long) == 4 ?
+        static_cast<unsigned long> (gen.urand ()) :
+        static_cast<unsigned long> (gen.urand64 ());
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long draw(Generator& gen, const unsigned long& range) {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (unsigned long) == 4 ?
+        static_cast<unsigned long> (gen.urand (static_cast<unsigned int> (range))) :
+        static_cast<unsigned long> (gen.urand64 (range));
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long
+    draw (Generator& gen, const unsigned long& start, const unsigned long& end) {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (unsigned long) == 4 ?
+        static_cast<unsigned long> (gen.urand (static_cast<unsigned int> (start),
+                                               static_cast<unsigned int> (end))) :
+        static_cast<unsigned long> (gen.urand64 (start, end));
+    }
+  };
+
+  // NOTE (mfh 26 oct 2014) This is a partial specialization for long
+  // long, a C99 / C++11 signed type which is guaranteed to be at
+  // least 64 bits.  Do NOT write a partial specialization for
+  // int64_t!!!  This is just a typedef!  It could be either long or
+  // long long.  We don't know which a priori, and I've seen both.
+  // The types long and long long are guaranteed to differ, so it's
+  // always safe to specialize for both.
+  template<class Generator>
+  struct rand<Generator, long long> {
+    KOKKOS_INLINE_FUNCTION
+    static long long max () {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
+      return Generator::MAX_RAND64;
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long long draw (Generator& gen) {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
+      return gen.rand64 ();
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long long draw (Generator& gen, const long long& range) {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
+      return gen.rand64 (range);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long long draw (Generator& gen, const long long& start, const long long& end) {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
+      return gen.rand64 (start, end);
+    }
+  };
+
+  // NOTE (mfh 26 oct 2014) This is a partial specialization for
+  // unsigned long long, a C99 / C++11 unsigned type which is
+  // guaranteed to be at least 64 bits.  Do NOT write a partial
+  // specialization for uint64_t!!!  This is just a typedef!  It could
+  // be either unsigned long or unsigned long long.  We don't know
+  // which a priori, and I've seen both.  The types unsigned long and
+  // unsigned long long are guaranteed to differ, so it's always safe
+  // to specialize for both.
+  template<class Generator>
+  struct rand<Generator,unsigned long long> {
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long long max () {
+      // FIXME (mfh 26 Oct 2014) It's legal for unsigned long long to be > 64 bits.
+      return Generator::MAX_URAND64;
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long long draw (Generator& gen) {
+      // FIXME (mfh 26 Oct 2014) It's legal for unsigned long long to be > 64 bits.
+      return gen.urand64 ();
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long long draw (Generator& gen, const unsigned long long& range) {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
+      return gen.urand64 (range);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long long
+    draw (Generator& gen, const unsigned long long& start, const unsigned long long& end) {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
+      return gen.urand64 (start, end);
+    }
+  };
+
+  template<class Generator>
+  struct rand<Generator,float> {
+    KOKKOS_INLINE_FUNCTION
+    static float max(){return 1.0f;}
+    KOKKOS_INLINE_FUNCTION
+    static float draw(Generator& gen)
+                          {return gen.frand();}
+    KOKKOS_INLINE_FUNCTION
+    static float draw(Generator& gen, const float& range)
+                          {return gen.frand(range);}
+    KOKKOS_INLINE_FUNCTION
+    static float draw(Generator& gen, const float& start, const float& end)
+                          {return gen.frand(start,end);}
+
+  };
+
+  template<class Generator>
+  struct rand<Generator,double> {
+    KOKKOS_INLINE_FUNCTION
+    static double max(){return 1.0;}
+    KOKKOS_INLINE_FUNCTION
+    static double draw(Generator& gen)
+                          {return gen.drand();}
+    KOKKOS_INLINE_FUNCTION
+    static double draw(Generator& gen, const double& range)
+                          {return gen.drand(range);}
+    KOKKOS_INLINE_FUNCTION
+    static double draw(Generator& gen, const double& start, const double& end)
+                          {return gen.drand(start,end);}
+
+  };
+
+  template<class Generator>
+  struct rand<Generator, Kokkos::complex<float> > {
+    KOKKOS_INLINE_FUNCTION
+    static Kokkos::complex<float> max () {
+      return Kokkos::complex<float> (1.0, 1.0);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static Kokkos::complex<float> draw (Generator& gen) {
+      const float re = gen.frand ();
+      const float im = gen.frand ();
+      return Kokkos::complex<float> (re, im);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static Kokkos::complex<float> draw (Generator& gen, const Kokkos::complex<float>& range) {
+      const float re = gen.frand (real (range));
+      const float im = gen.frand (imag (range));
+      return Kokkos::complex<float> (re, im);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static Kokkos::complex<float> draw (Generator& gen, const Kokkos::complex<float>& start, const Kokkos::complex<float>& end) {
+      const float re = gen.frand (real (start), real (end));
+      const float im = gen.frand (imag (start), imag (end));
+      return Kokkos::complex<float> (re, im);
+    }
+  };
+
+  template<class Generator>
+  struct rand<Generator, Kokkos::complex<double> > {
+    KOKKOS_INLINE_FUNCTION
+    static Kokkos::complex<double> max () {
+      return Kokkos::complex<double> (1.0, 1.0);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static Kokkos::complex<double> draw (Generator& gen) {
+      const double re = gen.drand ();
+      const double im = gen.drand ();
+      return Kokkos::complex<double> (re, im);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static Kokkos::complex<double> draw (Generator& gen, const Kokkos::complex<double>& range) {
+      const double re = gen.drand (real (range));
+      const double im = gen.drand (imag (range));
+      return Kokkos::complex<double> (re, im);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static Kokkos::complex<double> draw (Generator& gen, const Kokkos::complex<double>& start, const Kokkos::complex<double>& end) {
+      const double re = gen.drand (real (start), real (end));
+      const double im = gen.drand (imag (start), imag (end));
+      return Kokkos::complex<double> (re, im);
+    }
+  };
+
+  template<class DeviceType>
+  class Random_XorShift64_Pool;
+
+  template<class DeviceType>
+  class Random_XorShift64 {
+  private:
+    uint64_t state_;
+    const int state_idx_;
+    friend class Random_XorShift64_Pool<DeviceType>;
+  public:
+
+    typedef DeviceType device_type;
+
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffff/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffLL/2-1)};
+
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift64 (uint64_t state, int state_idx = 0)
+     : state_(state==0?uint64_t(1318319):state),state_idx_(state_idx){}
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand() {
+      state_ ^= state_ >> 12;
+      state_ ^= state_ << 25;
+      state_ ^= state_ >> 27;
+
+      uint64_t tmp = state_ * 2685821657736338717ULL;
+      tmp = tmp>>16;
+      return static_cast<uint32_t>(tmp&MAX_URAND);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64() {
+      state_ ^= state_ >> 12;
+      state_ ^= state_ << 25;
+      state_ ^= state_ >> 27;
+      return (state_ * 2685821657736338717ULL) - 1;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range) {
+      const uint32_t max_val = (MAX_URAND/range)*range;
+      uint32_t tmp = urand();
+      while(tmp>=max_val)
+        tmp = urand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end ) {
+      return urand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range) {
+      const uint64_t max_val = (MAX_URAND64/range)*range;
+      uint64_t tmp = urand64();
+      while(tmp>=max_val)
+        tmp = urand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
+      return urand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand() {
+      return static_cast<int>(urand()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range) {
+      const int max_val = (MAX_RAND/range)*range;
+      int tmp = rand();
+      while(tmp>=max_val)
+        tmp = rand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end ) {
+      return rand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64() {
+      return static_cast<int64_t>(urand64()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range) {
+      const int64_t max_val = (MAX_RAND64/range)*range;
+      int64_t tmp = rand64();
+      while(tmp>=max_val)
+        tmp = rand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end ) {
+      return rand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand() {
+      return 1.0f * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end ) {
+      return frand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand() {
+      return 1.0 * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end ) {
+      return drand(end-start)+start;
+    }
+
+    //Marsaglia polar method for drawing a standard normal distributed random number
+    KOKKOS_INLINE_FUNCTION
+    double normal() {
+      double S = 2.0;
+      double U;
+      while(S>=1.0) {
+        U = 2.0*drand() - 1.0;
+        const double V = 2.0*drand() - 1.0;
+        S = U*U+V*V;
+      }
+      return U*std::sqrt(-2.0*log(S)/S);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0) {
+      return mean + normal()*std_dev;
+    }
+
+  };
+
+  template<class DeviceType = Kokkos::DefaultExecutionSpace>
+  class Random_XorShift64_Pool {
+  private:
+    typedef View<int*,DeviceType> lock_type;
+    typedef View<uint64_t*,DeviceType> state_data_type;
+    lock_type locks_;
+    state_data_type state_;
+    int num_states_;
+
+  public:
+    typedef Random_XorShift64<DeviceType> generator_type;
+    typedef DeviceType device_type;
+
+    Random_XorShift64_Pool() {
+      num_states_ = 0;
+    }
+    Random_XorShift64_Pool(uint64_t seed) {
+      num_states_ = 0;
+      init(seed,DeviceType::max_hardware_threads());
+    }
+
+    Random_XorShift64_Pool(const Random_XorShift64_Pool& src):
+      locks_(src.locks_),
+      state_(src.state_),
+      num_states_(src.num_states_)
+    {}
+
+    Random_XorShift64_Pool operator = (const Random_XorShift64_Pool& src) {
+      locks_ = src.locks_;
+      state_ = src.state_;
+      num_states_ = src.num_states_;
+      return *this;
+    }
+
+    void init(uint64_t seed, int num_states) {
+      if(seed==0)
+        seed = uint64_t(1318319);
+
+      num_states_ = num_states;
+
+      locks_ = lock_type("Kokkos::Random_XorShift64::locks",num_states_);
+      state_ = state_data_type("Kokkos::Random_XorShift64::state",num_states_);
+
+      typename state_data_type::HostMirror h_state = create_mirror_view(state_);
+      typename lock_type::HostMirror h_lock = create_mirror_view(locks_);
+
+      // Execute on the HostMirror's default execution space.
+      Random_XorShift64<typename state_data_type::HostMirror::execution_space> gen(seed,0);
+      for(int i = 0; i < 17; i++)
+        gen.rand();
+      for(int i = 0; i < num_states_; i++) {
+        int n1 = gen.rand();
+        int n2 = gen.rand();
+        int n3 = gen.rand();
+        int n4 = gen.rand();
+        h_state(i) = (((static_cast<uint64_t>(n1)) & 0xffff)<<00) |
+                     (((static_cast<uint64_t>(n2)) & 0xffff)<<16) |
+                     (((static_cast<uint64_t>(n3)) & 0xffff)<<32) |
+                     (((static_cast<uint64_t>(n4)) & 0xffff)<<48);
+        h_lock(i) = 0;
+      }
+      deep_copy(state_,h_state);
+      deep_copy(locks_,h_lock);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift64<DeviceType> get_state() const {
+      const int i = DeviceType::hardware_thread_id();
+      return Random_XorShift64<DeviceType>(state_(i),i);
+    }
+
+    // NOTE: state_idx MUST be unique and less than num_states
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift64<DeviceType> get_state(const int state_idx) const {
+      return Random_XorShift64<DeviceType>(state_(state_idx),state_idx);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void free_state(const Random_XorShift64<DeviceType>& state) const {
+      state_(state.state_idx_) = state.state_;
+    }
+  };
+
+
+  template<class DeviceType>
+  class Random_XorShift1024_Pool;
+
+  template<class DeviceType>
+  class Random_XorShift1024 {
+  private:
+    int p_;
+    const int state_idx_;
+    uint64_t state_[16];
+    friend class Random_XorShift1024_Pool<DeviceType>;
+  public:
+
+    typedef Random_XorShift1024_Pool<DeviceType> pool_type;
+    typedef DeviceType device_type;
+
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
+
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
+      p_(p),state_idx_(state_idx){
+      for(int i=0 ; i<16; i++)
+        state_[i] = state(state_idx,i);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand() {
+      uint64_t state_0 = state_[ p_ ];
+      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      uint64_t tmp = ( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
+      tmp = tmp>>16;
+      return static_cast<uint32_t>(tmp&MAX_URAND);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64() {
+      uint64_t state_0 = state_[ p_ ];
+      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      return (( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range) {
+      const uint32_t max_val = (MAX_URAND/range)*range;
+      uint32_t tmp = urand();
+      while(tmp>=max_val)
+        tmp = urand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end ) {
+      return urand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range) {
+      const uint64_t max_val = (MAX_URAND64/range)*range;
+      uint64_t tmp = urand64();
+      while(tmp>=max_val)
+        tmp = urand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
+      return urand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand() {
+      return static_cast<int>(urand()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range) {
+      const int max_val = (MAX_RAND/range)*range;
+      int tmp = rand();
+      while(tmp>=max_val)
+        tmp = rand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end ) {
+      return rand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64() {
+      return static_cast<int64_t>(urand64()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range) {
+      const int64_t max_val = (MAX_RAND64/range)*range;
+      int64_t tmp = rand64();
+      while(tmp>=max_val)
+        tmp = rand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end ) {
+      return rand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand() {
+      return 1.0f * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end ) {
+      return frand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand() {
+      return 1.0 * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end ) {
+      return drand(end-start)+start;
+    }
+
+    //Marsaglia polar method for drawing a standard normal distributed random number
+    KOKKOS_INLINE_FUNCTION
+    double normal() {
+      double S = 2.0;
+      double U;
+      while(S>=1.0) {
+        U = 2.0*drand() - 1.0;
+        const double V = 2.0*drand() - 1.0;
+        S = U*U+V*V;
+      }
+      return U*std::sqrt(-2.0*log(S)/S);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0) {
+      return mean + normal()*std_dev;
+    }
+  };
+
+
+  template<class DeviceType = Kokkos::DefaultExecutionSpace>
+  class Random_XorShift1024_Pool {
+  private:
+    typedef View<int*,DeviceType> int_view_type;
+    typedef View<uint64_t*[16],DeviceType> state_data_type;
+
+    int_view_type locks_;
+    state_data_type state_;
+    int_view_type p_;
+    int num_states_;
+    friend class Random_XorShift1024<DeviceType>;
+
+  public:
+    typedef Random_XorShift1024<DeviceType> generator_type;
+
+    typedef DeviceType device_type;
+
+    Random_XorShift1024_Pool() {
+      num_states_ = 0;
+    }
+
+    inline
+    Random_XorShift1024_Pool(uint64_t seed){
+      num_states_ = 0;
+      init(seed,DeviceType::max_hardware_threads());
+    }
+
+    Random_XorShift1024_Pool(const Random_XorShift1024_Pool& src):
+      locks_(src.locks_),
+      state_(src.state_),
+      p_(src.p_),
+      num_states_(src.num_states_)
+    {}
+
+    Random_XorShift1024_Pool operator = (const Random_XorShift1024_Pool& src) {
+      locks_ = src.locks_;
+      state_ = src.state_;
+      p_ = src.p_;
+      num_states_ = src.num_states_;
+      return *this;
+    }
+
+    inline
+    void init(uint64_t seed, int num_states) {
+      if(seed==0)
+        seed = uint64_t(1318319);
+      num_states_ = num_states;
+      locks_ = int_view_type("Kokkos::Random_XorShift1024::locks",num_states_);
+      state_ = state_data_type("Kokkos::Random_XorShift1024::state",num_states_);
+      p_ = int_view_type("Kokkos::Random_XorShift1024::p",num_states_);
+
+      typename state_data_type::HostMirror h_state = create_mirror_view(state_);
+      typename int_view_type::HostMirror h_lock = create_mirror_view(locks_);
+      typename int_view_type::HostMirror h_p = create_mirror_view(p_);
+
+      // Execute on the HostMirror's default execution space.
+      Random_XorShift64<typename state_data_type::HostMirror::execution_space> gen(seed,0);
+      for(int i = 0; i < 17; i++)
+        gen.rand();
+      for(int i = 0; i < num_states_; i++) {
+        for(int j = 0; j < 16 ; j++) {
+          int n1 = gen.rand();
+          int n2 = gen.rand();
+          int n3 = gen.rand();
+          int n4 = gen.rand();
+          h_state(i,j) = (((static_cast<uint64_t>(n1)) & 0xffff)<<00) |
+                         (((static_cast<uint64_t>(n2)) & 0xffff)<<16) |
+                         (((static_cast<uint64_t>(n3)) & 0xffff)<<32) |
+                         (((static_cast<uint64_t>(n4)) & 0xffff)<<48);
+        }
+        h_p(i) = 0;
+        h_lock(i) = 0;
+      }
+      deep_copy(state_,h_state);
+      deep_copy(locks_,h_lock);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024<DeviceType> get_state() const {
+      const int i = DeviceType::hardware_thread_id();
+      return Random_XorShift1024<DeviceType>(state_,p_(i),i);
+    }
+
+    // NOTE: state_idx MUST be unique and less than num_states
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024<DeviceType> get_state(const int state_idx) const {
+      return Random_XorShift1024<DeviceType>(state_,p_(state_idx),state_idx);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void free_state(const Random_XorShift1024<DeviceType>& state) const {
+      for(int i = 0; i<16; i++)
+        state_(state.state_idx_,i) = state.state_[i];
+      p_(state.state_idx_) = state.p_;
+    }
+  };
+
+#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__)
+
+  template<>
+  class Random_XorShift1024<Kokkos::Cuda> {
+  private:
+    int p_;
+    const int state_idx_;
+    uint64_t* state_;
+    const int stride_;
+    friend class Random_XorShift1024_Pool<Kokkos::Cuda>;
+  public:
+
+    typedef Kokkos::Cuda device_type;
+    typedef Random_XorShift1024_Pool<device_type> pool_type;
+
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
+
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
+      p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand() {
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
+      tmp = tmp>>16;
+      return static_cast<uint32_t>(tmp&MAX_URAND);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64() {
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range) {
+      const uint32_t max_val = (MAX_URAND/range)*range;
+      uint32_t tmp = urand();
+      while(tmp>=max_val)
+        tmp = urand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end ) {
+      return urand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range) {
+      const uint64_t max_val = (MAX_URAND64/range)*range;
+      uint64_t tmp = urand64();
+      while(tmp>=max_val)
+        tmp = urand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
+      return urand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand() {
+      return static_cast<int>(urand()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range) {
+      const int max_val = (MAX_RAND/range)*range;
+      int tmp = rand();
+      while(tmp>=max_val)
+        tmp = rand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end ) {
+      return rand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64() {
+      return static_cast<int64_t>(urand64()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range) {
+      const int64_t max_val = (MAX_RAND64/range)*range;
+      int64_t tmp = rand64();
+      while(tmp>=max_val)
+        tmp = rand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end ) {
+      return rand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand() {
+      return 1.0f * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end ) {
+      return frand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand() {
+      return 1.0 * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end ) {
+      return drand(end-start)+start;
+    }
+
+    //Marsaglia polar method for drawing a standard normal distributed random number
+    KOKKOS_INLINE_FUNCTION
+    double normal() {
+      double S = 2.0;
+      double U;
+      while(S>=1.0) {
+        U = 2.0*drand() - 1.0;
+        const double V = 2.0*drand() - 1.0;
+        S = U*U+V*V;
+      }
+      return U*std::sqrt(-2.0*log(S)/S);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0) {
+      return mean + normal()*std_dev;
+    }
+  };
+
+template<>
+inline
+Random_XorShift64_Pool<Kokkos::Cuda>::Random_XorShift64_Pool(uint64_t seed) {
+  num_states_ = 0;
+  init(seed,4*32768);
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+Random_XorShift64<Kokkos::Cuda> Random_XorShift64_Pool<Kokkos::Cuda>::get_state() const {
+#ifdef __CUDA_ARCH__
+  const int i_offset = (threadIdx.x*blockDim.y + threadIdx.y)*blockDim.z+threadIdx.z;
+  int i = (((blockIdx.x*gridDim.y+blockIdx.y)*gridDim.z + blockIdx.z) *
+           blockDim.x*blockDim.y*blockDim.z + i_offset)%num_states_;
+  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
+      i+=blockDim.x*blockDim.y*blockDim.z;
+      if(i>=num_states_) {i = i_offset;}
+  }
+
+  return Random_XorShift64<Kokkos::Cuda>(state_(i),i);
+#else
+  return Random_XorShift64<Kokkos::Cuda>(state_(0),0);
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void Random_XorShift64_Pool<Kokkos::Cuda>::free_state(const Random_XorShift64<Kokkos::Cuda> &state) const {
+  state_(state.state_idx_) = state.state_;
+#ifdef __CUDA_ARCH__
+  locks_(state.state_idx_) = 0;
+  return;
+#endif
+}
+
+
+template<>
+inline
+Random_XorShift1024_Pool<Kokkos::Cuda>::Random_XorShift1024_Pool(uint64_t seed) {
+  num_states_ = 0;
+  init(seed,4*32768);
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+Random_XorShift1024<Kokkos::Cuda> Random_XorShift1024_Pool<Kokkos::Cuda>::get_state() const {
+#ifdef __CUDA_ARCH__
+  const int i_offset = (threadIdx.x*blockDim.y + threadIdx.y)*blockDim.z+threadIdx.z;
+  int i = (((blockIdx.x*gridDim.y+blockIdx.y)*gridDim.z + blockIdx.z) *
+           blockDim.x*blockDim.y*blockDim.z + i_offset)%num_states_;
+  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
+      i+=blockDim.x*blockDim.y*blockDim.z;
+      if(i>=num_states_) {i = i_offset;}
+  }
+
+  return Random_XorShift1024<Kokkos::Cuda>(state_, p_(i), i);
+#else
+  return Random_XorShift1024<Kokkos::Cuda>(state_, p_(0), 0);
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift1024<Kokkos::Cuda> &state) const {
+  for(int i=0; i<16; i++)
+    state_(state.state_idx_,i) = state.state_[i];
+#ifdef __CUDA_ARCH__
+  locks_(state.state_idx_) = 0;
+  return;
+#endif
+}
+
+
+#endif
+
+#if defined(KOKKOS_ENABLE_ROCM) 
+
+  template<>
+  class Random_XorShift1024<Kokkos::Experimental::ROCm> {
+  private:
+    int p_;
+    const int state_idx_;
+    uint64_t* state_;
+    const int stride_;
+    friend class Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>;
+  public:
+
+    typedef Kokkos::Experimental::ROCm device_type;
+    typedef Random_XorShift1024_Pool<device_type> pool_type;
+
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
+
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
+      p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand() {
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
+      tmp = tmp>>16;
+      return static_cast<uint32_t>(tmp&MAX_URAND);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64() {
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range) {
+      const uint32_t max_val = (MAX_URAND/range)*range;
+      uint32_t tmp = urand();
+      while(tmp>=max_val)
+        tmp = urand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end ) {
+      return urand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range) {
+      const uint64_t max_val = (MAX_URAND64/range)*range;
+      uint64_t tmp = urand64();
+      while(tmp>=max_val)
+        tmp = urand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
+      return urand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand() {
+      return static_cast<int>(urand()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range) {
+      const int max_val = (MAX_RAND/range)*range;
+      int tmp = rand();
+      while(tmp>=max_val)
+        tmp = rand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end ) {
+      return rand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64() {
+      return static_cast<int64_t>(urand64()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range) {
+      const int64_t max_val = (MAX_RAND64/range)*range;
+      int64_t tmp = rand64();
+      while(tmp>=max_val)
+        tmp = rand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end ) {
+      return rand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand() {
+      return 1.0f * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end ) {
+      return frand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand() {
+      return 1.0 * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end ) {
+      return drand(end-start)+start;
+    }
+
+    //Marsaglia polar method for drawing a standard normal distributed random number
+    KOKKOS_INLINE_FUNCTION
+    double normal() {
+      double S = 2.0;
+      double U;
+      while(S>=1.0) {
+        U = 2.0*drand() - 1.0;
+        const double V = 2.0*drand() - 1.0;
+        S = U*U+V*V;
+      }
+      return U*std::sqrt(-2.0*log(S)/S);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0) {
+      return mean + normal()*std_dev;
+    }
+  };
+
+template<>
+inline
+Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::Random_XorShift64_Pool(uint64_t seed) {
+  num_states_ = 0;
+  init(seed,4*32768);
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+Random_XorShift64<Kokkos::Experimental::ROCm> Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::get_state() const {
+#ifdef __HCC_ACCELERATOR__
+  const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
+  int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
+           blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
+  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
+      i+=blockDim_x*blockDim_y*blockDim_z;
+      if(i>=num_states_) {i = i_offset;}
+  }
+
+  return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(i),i);
+#else
+  return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(0),0);
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift64<Kokkos::Experimental::ROCm> &state) const {
+#ifdef __HCC_ACCELERATOR__
+  state_(state.state_idx_) = state.state_;
+  locks_(state.state_idx_) = 0;
+  return;
+#endif
+}
+
+
+template<>
+inline
+Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::Random_XorShift1024_Pool(uint64_t seed) {
+  num_states_ = 0;
+  init(seed,4*32768);
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+Random_XorShift1024<Kokkos::Experimental::ROCm> Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::get_state() const {
+#ifdef __HCC_ACCELERATOR__
+  const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
+  int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
+           blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
+  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
+      i+=blockDim_x*blockDim_y*blockDim_z;
+      if(i>=num_states_) {i = i_offset;}
+  }
+
+  return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(i), i);
+#else
+  return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(0), 0);
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift1024<Kokkos::Experimental::ROCm> &state) const {
+#ifdef __HCC_ACCELERATOR__
+  for(int i=0; i<16; i++)
+    state_(state.state_idx_,i) = state.state_[i];
+  locks_(state.state_idx_) = 0;
+  return;
+#endif
+}
+
+
+#endif
+
+
+namespace Impl {
+
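+// Helper functors used by fill_random() below.  For each parallel_for index i,
+// a functor checks a generator out of the pool, fills up to `loops` consecutive
+// rows of the view with random draws, and returns the generator to the pool.
+// One specialization exists per view rank (1 through 8).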
+template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
+struct fill_random_functor_range;
+template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
+struct fill_random_functor_begin_end;
+
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,1,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const IndexType& i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0)))
+        a(idx) = Rand::draw(gen,range);
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,2,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0))) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.extent(1));k++)
+          a(idx,k) = Rand::draw(gen,range);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,3,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0))) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.extent(1));k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.extent(2));l++)
+            a(idx,k,l) = Rand::draw(gen,range);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,4, IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0))) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.extent(1));k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.extent(2));l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.extent(3));m++)
+              a(idx,k,l,m) = Rand::draw(gen,range);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,5,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0))) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.extent(1));k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.extent(2));l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.extent(3));m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.extent(4));n++)
+                a(idx,k,l,m,n) = Rand::draw(gen,range);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,6,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0))) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.extent(1));k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.extent(2));l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.extent(3));m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.extent(4));n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.extent(5));o++)
+                  a(idx,k,l,m,n,o) = Rand::draw(gen,range);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,7,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0))) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.extent(1));k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.extent(2));l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.extent(3));m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.extent(4));n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.extent(5));o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.extent(6));p++)
+                    a(idx,k,l,m,n,o,p) = Rand::draw(gen,range);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,8,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0))) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.extent(1));k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.extent(2));l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.extent(3));m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.extent(4));n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.extent(5));o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.extent(6));p++)
+                    for(IndexType q=0;q<static_cast<IndexType>(a.extent(7));q++)
+                      a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,range);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0)))
+        a(idx) = Rand::draw(gen,begin,end);
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0))) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.extent(1));k++)
+          a(idx,k) = Rand::draw(gen,begin,end);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0))) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.extent(1));k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.extent(2));l++)
+            a(idx,k,l) = Rand::draw(gen,begin,end);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0))) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.extent(1));k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.extent(2));l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.extent(3));m++)
+              a(idx,k,l,m) = Rand::draw(gen,begin,end);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0))){
+        for(IndexType l=0;l<static_cast<IndexType>(a.extent(1));l++)
+          for(IndexType m=0;m<static_cast<IndexType>(a.extent(2));m++)
+            for(IndexType n=0;n<static_cast<IndexType>(a.extent(3));n++)
+              for(IndexType o=0;o<static_cast<IndexType>(a.extent(4));o++)
+                a(idx,l,m,n,o) = Rand::draw(gen,begin,end);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0))) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.extent(1));k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.extent(2));l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.extent(3));m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.extent(4));n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.extent(5));o++)
+                  a(idx,k,l,m,n,o) = Rand::draw(gen,begin,end);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0))) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.extent(1));k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.extent(2));l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.extent(3));m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.extent(4));n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.extent(5));o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.extent(6));p++)
+                    a(idx,k,l,m,n,o,p) = Rand::draw(gen,begin,end);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.extent(0))) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.extent(1));k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.extent(2));l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.extent(3));m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.extent(4));n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.extent(5));o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.extent(6));p++)
+                    for(IndexType q=0;q<static_cast<IndexType>(a.extent(7));q++)
+                      a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,begin,end);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+}
+
+template<class ViewType, class RandomPool, class IndexType = int64_t>
+void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) {
+  int64_t LDA = a.extent(0);
+  if(LDA>0)
+    parallel_for((LDA+127)/128,Impl::fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,range));
+}
+
+template<class ViewType, class RandomPool, class IndexType = int64_t>
+void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin,typename ViewType::const_value_type end ) {
+  int64_t LDA = a.extent(0);
+  if(LDA>0)
+    parallel_for((LDA+127)/128,Impl::fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,begin,end));
+}
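+
+// A minimal usage sketch (illustrative only: the view type, pool type, seed,
+// and ranges are arbitrary choices, and the pool's default device is assumed):
+//
+//   Kokkos::View<double*> v("v", 1000);
+//   Kokkos::Random_XorShift64_Pool<> pool(12345 /* seed */);
+//   Kokkos::fill_random(v, pool, 10.0);        // entries roughly uniform in [0, 10)
+//   Kokkos::fill_random(v, pool, -1.0, 1.0);   // entries roughly uniform in [-1, 1)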
+}
+
+#endif
diff --git a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..888476045b8ee4a424e668a5da71567c408af934
--- /dev/null
+++ b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
@@ -0,0 +1,562 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+
+#ifndef KOKKOS_SORT_HPP_
+#define KOKKOS_SORT_HPP_
+
+#include <Kokkos_Core.hpp>
+
+#include <algorithm>
+
+namespace Kokkos {
+
+  namespace Impl {
+
+  template< class DstViewType , class SrcViewType
+          , int Rank = DstViewType::Rank >
+  struct CopyOp;
+
+  template< class DstViewType , class SrcViewType >
+  struct CopyOp<DstViewType,SrcViewType,1> {
+    KOKKOS_INLINE_FUNCTION
+    static void copy(DstViewType const& dst, size_t i_dst,
+                     SrcViewType const& src, size_t i_src ) {
+      dst(i_dst) = src(i_src);
+    }
+  };
+
+  template< class DstViewType , class SrcViewType >
+  struct CopyOp<DstViewType,SrcViewType,2> {
+    KOKKOS_INLINE_FUNCTION
+    static void copy(DstViewType const& dst, size_t i_dst,
+                     SrcViewType const& src, size_t i_src ) {
+      for(int j = 0;j< (int) dst.extent(1); j++)
+        dst(i_dst,j) = src(i_src,j);
+    }
+  };
+
+  template< class DstViewType , class SrcViewType >
+  struct CopyOp<DstViewType,SrcViewType,3> {
+    KOKKOS_INLINE_FUNCTION
+    static void copy(DstViewType const& dst, size_t i_dst,
+                     SrcViewType const& src, size_t i_src ) {
+      for(int j = 0; j< (int) dst.extent(1); j++)
+        for(int k = 0; k< (int) dst.extent(2); k++)
+          dst(i_dst,j,k) = src(i_src,j,k);
+    }
+  };
+  }
+
+//----------------------------------------------------------------------------
+
+template< class KeyViewType
+        , class BinSortOp
+        , class Space = typename KeyViewType::device_type
+        , class SizeType = typename KeyViewType::memory_space::size_type
+        >
+class BinSort {
+public:
+
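+  // Copies src_values(i) into dst_values(i + dst_offset).  Used by sort() to
+  // copy the permuted scratch values back into the caller's view.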
+  template< class DstViewType , class SrcViewType >
+  struct copy_functor {
+
+    typedef typename SrcViewType::const_type  src_view_type ;
+
+    typedef Impl::CopyOp< DstViewType , src_view_type > copy_op ;
+
+    DstViewType     dst_values ;
+    src_view_type   src_values ;
+    int             dst_offset ;
+
+    copy_functor( DstViewType  const & dst_values_
+                , int          const & dst_offset_
+                , SrcViewType  const & src_values_
+                )
+      : dst_values( dst_values_ )
+      , src_values( src_values_ )
+      , dst_offset( dst_offset_ )
+      {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i) const {
+      copy_op::copy(dst_values,i+dst_offset,src_values,i);
+    }
+  };
+
+  template< class DstViewType
+          , class PermuteViewType
+          , class SrcViewType
+          >
+  struct copy_permute_functor {
+
+    // If SrcViewType is a Kokkos::View, we can wrap it in a const,
+    // RandomAccess view; otherwise we can only use its const type.
+
+    typedef typename std::conditional
+      < Kokkos::is_view< SrcViewType >::value
+      , Kokkos::View< typename SrcViewType::const_data_type
+                    , typename SrcViewType::array_layout
+                    , typename SrcViewType::device_type
+                    , Kokkos::MemoryTraits<Kokkos::RandomAccess>
+                    >
+      , typename SrcViewType::const_type
+      >::type src_view_type ;
+
+    typedef typename PermuteViewType::const_type  perm_view_type ;
+
+    typedef Impl::CopyOp< DstViewType , src_view_type > copy_op ;
+
+    DstViewType     dst_values ;
+    perm_view_type  sort_order ;
+    src_view_type   src_values ;
+    int             src_offset ;
+
+    copy_permute_functor( DstViewType     const & dst_values_
+                        , PermuteViewType const & sort_order_
+                        , SrcViewType     const & src_values_
+                        , int             const & src_offset_
+                        )
+      : dst_values( dst_values_ )
+      , sort_order( sort_order_ )
+      , src_values( src_values_ )
+      , src_offset( src_offset_ )
+      {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i)  const {
+      copy_op::copy(dst_values,i,src_values,src_offset+sort_order(i));
+    }
+  };
+
+  typedef typename Space::execution_space  execution_space;
+  typedef BinSortOp bin_op_type;
+
+  struct bin_count_tag {};
+  struct bin_offset_tag {};
+  struct bin_binning_tag {};
+  struct bin_sort_bins_tag {};
+
+public:
+
+  typedef SizeType size_type;
+  typedef size_type value_type;
+
+  typedef Kokkos::View<size_type*, Space> offset_type;
+  typedef Kokkos::View<const int*, Space> bin_count_type;
+
+  typedef typename KeyViewType::const_type  const_key_view_type ;
+
+  // If KeyViewType is a Kokkos::View, we can wrap it in a const,
+  // RandomAccess view; otherwise we can only use its const type.
+
+  typedef typename std::conditional
+    < Kokkos::is_view< KeyViewType >::value
+    , Kokkos::View< typename KeyViewType::const_data_type,
+                    typename KeyViewType::array_layout,
+                    typename KeyViewType::device_type,
+                    Kokkos::MemoryTraits<Kokkos::RandomAccess> >
+    , const_key_view_type
+    >::type const_rnd_key_view_type;
+
+  typedef typename KeyViewType::non_const_value_type non_const_key_scalar;
+  typedef typename KeyViewType::const_value_type     const_key_scalar;
+
+  typedef Kokkos::View<int*, Space, Kokkos::MemoryTraits<Kokkos::Atomic> > bin_count_atomic_type ;
+
+private:
+
+  const_key_view_type keys;
+  const_rnd_key_view_type keys_rnd;
+
+public:
+
+  BinSortOp             bin_op ;
+  offset_type           bin_offsets ;
+  bin_count_atomic_type bin_count_atomic ;
+  bin_count_type        bin_count_const ;
+  offset_type           sort_order ;
+
+  int                   range_begin ;
+  int                   range_end ;
+  bool                  sort_within_bins ;
+
+public:
+
+  BinSort() {}
+
+  //----------------------------------------
+  // Constructor: takes the keys, the binning operator, and optionally whether to sort within bins (default false)
+  BinSort( const_key_view_type  keys_
+         , int                  range_begin_
+         , int                  range_end_
+         , BinSortOp            bin_op_
+         , bool                 sort_within_bins_ = false
+         )
+     : keys(keys_)
+     , keys_rnd(keys_)
+     , bin_op(bin_op_)
+     , bin_offsets()
+     , bin_count_atomic()
+     , bin_count_const()
+     , sort_order()
+     , range_begin( range_begin_ )
+     , range_end( range_end_ )
+     , sort_within_bins( sort_within_bins_ )
+  {
+    bin_count_atomic = Kokkos::View<int*, Space >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
+    bin_count_const =  bin_count_atomic;
+    bin_offsets =      offset_type("Kokkos::SortImpl::BinSortFunctor::bin_offsets",bin_op.max_bins());
+    sort_order =       offset_type("PermutationVector",range_end-range_begin);
+  }
+
+  BinSort( const_key_view_type  keys_
+         , BinSortOp            bin_op_
+         , bool                 sort_within_bins_ = false
+         )
+     : BinSort( keys_ , 0 , keys_.extent(0), bin_op_ , sort_within_bins_ ) {}
+
+  //----------------------------------------
+  // Create the permutation vector, the bin_offset array, and the bin_count array. Can be called again if the keys change.
+  void create_permute_vector() {
+    const size_t len = range_end - range_begin ;
+    Kokkos::parallel_for ("Kokkos::Sort::BinCount",Kokkos::RangePolicy<execution_space,bin_count_tag>    (0,len),*this);
+    Kokkos::parallel_scan("Kokkos::Sort::BinOffset",Kokkos::RangePolicy<execution_space,bin_offset_tag>   (0,bin_op.max_bins()) ,*this);
+
+    Kokkos::deep_copy(bin_count_atomic,0);
+    Kokkos::parallel_for ("Kokkos::Sort::BinBinning",Kokkos::RangePolicy<execution_space,bin_binning_tag>  (0,len),*this);
+
+    if(sort_within_bins)
+      Kokkos::parallel_for ("Kokkos::Sort::BinSort",Kokkos::RangePolicy<execution_space,bin_sort_bins_tag>(0,bin_op.max_bins()) ,*this);
+  }
+
+  // Sort a subset of a view with respect to the first dimension using the permutation array
+  template<class ValuesViewType>
+  void sort( ValuesViewType const & values
+           , int values_range_begin
+           , int values_range_end) const
+  {
+    typedef
+      Kokkos::View< typename ValuesViewType::data_type,
+                    typename ValuesViewType::array_layout,
+                    typename ValuesViewType::device_type >
+        scratch_view_type ;
+
+    const size_t len = range_end - range_begin ;
+    const size_t values_len = values_range_end - values_range_begin ;
+    if (len != values_len) {
+      Kokkos::abort("BinSort::sort: values range length != permutation vector length");
+    }
+
+    scratch_view_type
+      sorted_values("Scratch",
+                    len,
+                    values.extent(1),
+                    values.extent(2),
+                    values.extent(3),
+                    values.extent(4),
+                    values.extent(5),
+                    values.extent(6),
+                    values.extent(7));
+
+    {
+      copy_permute_functor< scratch_view_type /* DstViewType */
+                          , offset_type       /* PermuteViewType */
+                          , ValuesViewType    /* SrcViewType */
+                          >
+        functor( sorted_values , sort_order , values, values_range_begin - range_begin );
+
+      parallel_for("Kokkos::Sort::CopyPermute", Kokkos::RangePolicy<execution_space>(0,len),functor);
+    }
+
+    {
+      copy_functor< ValuesViewType , scratch_view_type >
+        functor( values , range_begin , sorted_values );
+
+      parallel_for("Kokkos::Sort::Copy", Kokkos::RangePolicy<execution_space>(0,len),functor);
+    }
+  }
+
+  template<class ValuesViewType>
+  void sort( ValuesViewType const & values ) const
+  {
+    this->sort( values, 0, /*values.extent(0)*/ range_end - range_begin );
+  }
+
+  // Get the permutation vector
+  KOKKOS_INLINE_FUNCTION
+  offset_type get_permute_vector() const { return sort_order;}
+
+  // Get the start offsets for each bin
+  KOKKOS_INLINE_FUNCTION
+  offset_type get_bin_offsets() const { return bin_offsets;}
+
+  // Get the count for each bin
+  KOKKOS_INLINE_FUNCTION
+  bin_count_type get_bin_count() const {return bin_count_const;}
+
+public:
+
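+  // bin_count_tag pass: count how many keys fall into each bin.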
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const bin_count_tag& tag, const int& i) const {
+    const int j = range_begin + i ;
+    bin_count_atomic(bin_op.bin(keys, j))++;
+  }
+
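+  // bin_offset_tag pass: exclusive prefix sum over the bin counts, recording
+  // each bin's starting offset in the permutation vector.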
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const bin_offset_tag& tag, const int& i, value_type& offset, const bool& final)  const {
+    if(final) {
+      bin_offsets(i) = offset;
+    }
+    offset+=bin_count_const(i);
+  }
+
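+  // bin_binning_tag pass: scatter each key's index into its bin, using an
+  // atomic per-bin counter to pick the slot.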
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const bin_binning_tag& tag, const int& i)  const {
+    const int j     = range_begin + i ;
+    const int bin   = bin_op.bin(keys,j);
+    const int count = bin_count_atomic(bin)++;
+
+    sort_order(bin_offsets(bin) + count) = j ;
+  }
+
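+  // bin_sort_bins_tag pass: bubble-sort the indices within a single bin so the
+  // entries are ordered according to bin_op's comparison operator.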
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const bin_sort_bins_tag& tag, const int&i )  const {
+    bool sorted = false;
+    int upper_bound = bin_offsets(i)+bin_count_const(i);
+    while(!sorted) {
+      sorted = true;
+      int old_idx = sort_order(bin_offsets(i));
+      int new_idx;
+      for(int k=bin_offsets(i)+1; k<upper_bound; k++) {
+        new_idx = sort_order(k);
+
+        if(!bin_op(keys_rnd,old_idx,new_idx)) {
+          sort_order(k-1) = new_idx;
+          sort_order(k) = old_idx;
+          sorted = false;
+        } else {
+          old_idx = new_idx;
+        }
+      }
+      upper_bound--;
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template<class KeyViewType>
+struct BinOp1D {
+  int max_bins_;
+  double mul_;
+  typename KeyViewType::const_value_type range_;
+  typename KeyViewType::const_value_type min_;
+
+  BinOp1D():max_bins_(0),mul_(0.0),
+            range_(typename KeyViewType::const_value_type()),
+            min_(typename KeyViewType::const_value_type()) {}
+
+  // Construct BinOp with the number of bins, the minimum value, and the maximum value
+  BinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
+                               typename KeyViewType::const_value_type max )
+     :max_bins_(max_bins__+1),mul_(1.0*max_bins__/(max-min)),range_(max-min),min_(min) {}
+
+  //Determine bin index from key value
+  template<class ViewType>
+  KOKKOS_INLINE_FUNCTION
+  int bin(ViewType& keys, const int& i) const {
+    return int(mul_*(keys(i)-min_));
+  }
+
+  //Return maximum bin index + 1
+  KOKKOS_INLINE_FUNCTION
+  int max_bins() const {
+    return max_bins_;
+  }
+
+  // Compare two keys within a bin; if true, new_val will be placed before old_val
+  template<class ViewType, typename iType1, typename iType2>
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ViewType& keys, iType1& i1, iType2& i2) const {
+    return keys(i1)<keys(i2);
+  }
+};
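+
+// A minimal usage sketch for BinSort with BinOp1D (illustrative only: the key
+// view, bin count, and min/max values are placeholders):
+//
+//   Kokkos::View<float*> keys("keys", n);
+//   // ... fill keys and compute min_val / max_val ...
+//   BinOp1D<decltype(keys)> op(n / 2, min_val, max_val);
+//   BinSort<decltype(keys), BinOp1D<decltype(keys)> > sorter(keys, op, true);
+//   sorter.create_permute_vector();
+//   sorter.sort(keys);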
+
+template<class KeyViewType>
+struct BinOp3D {
+  int max_bins_[3];
+  double mul_[3];
+  typename KeyViewType::non_const_value_type range_[3];
+  typename KeyViewType::non_const_value_type min_[3];
+
+  BinOp3D() {}
+
+  BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[],
+                               typename KeyViewType::const_value_type max[] )
+  {
+    max_bins_[0] = max_bins__[0];
+    max_bins_[1] = max_bins__[1];
+    max_bins_[2] = max_bins__[2];
+    mul_[0] = 1.0*max_bins__[0]/(max[0]-min[0]);
+    mul_[1] = 1.0*max_bins__[1]/(max[1]-min[1]);
+    mul_[2] = 1.0*max_bins__[2]/(max[2]-min[2]);
+    range_[0] = max[0]-min[0];
+    range_[1] = max[1]-min[1];
+    range_[2] = max[2]-min[2];
+    min_[0] = min[0];
+    min_[1] = min[1];
+    min_[2] = min[2];
+  }
+
+  template<class ViewType>
+  KOKKOS_INLINE_FUNCTION
+  int bin(ViewType& keys, const int& i) const {
+    return int( (((int(mul_[0]*(keys(i,0)-min_[0]))*max_bins_[1]) +
+                   int(mul_[1]*(keys(i,1)-min_[1])))*max_bins_[2]) +
+                   int(mul_[2]*(keys(i,2)-min_[2])));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int max_bins() const {
+    return max_bins_[0]*max_bins_[1]*max_bins_[2];
+  }
+
+  template<class ViewType, typename iType1, typename iType2>
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ViewType& keys, iType1& i1 , iType2& i2) const {
+    if (keys(i1,0)>keys(i2,0)) return true;
+    else if (keys(i1,0)==keys(i2,0)) {
+      if (keys(i1,1)>keys(i2,1)) return true;
+      else if (keys(i1,1)==keys(i2,1)) {
+        if (keys(i1,2)>keys(i2,2)) return true;
+      }
+    }
+    return false;
+  }
+};
+
+namespace Impl {
+
+template<class ViewType>
+bool try_std_sort(ViewType view) {
+  bool possible = true;
+  size_t stride[8] = { view.stride_0()
+                     , view.stride_1()
+                     , view.stride_2()
+                     , view.stride_3()
+                     , view.stride_4()
+                     , view.stride_5()
+                     , view.stride_6()
+                     , view.stride_7()
+                     };
+  possible  = possible && std::is_same<typename ViewType::memory_space, HostSpace>::value;
+  possible  = possible && (ViewType::Rank == 1);
+  possible  = possible && (stride[0] == 1);
+  if(possible)  {
+   std::sort(view.data(),view.data()+view.extent(0));
+  }
+  return possible;
+}
+
+template<class ViewType>
+struct min_max_functor {
+  typedef Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> minmax_scalar;
+
+  ViewType view;
+  min_max_functor(const ViewType& view_):view(view_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const size_t& i, minmax_scalar& minmax) const {
+    if(view(i) < minmax.min_val) minmax.min_val = view(i);
+    if(view(i) > minmax.max_val) minmax.max_val = view(i);
+  }
+};
+
+}
+
+template<class ViewType>
+void sort( ViewType const & view , bool const always_use_kokkos_sort = false)
+{
+  if(!always_use_kokkos_sort) {
+    if(Impl::try_std_sort(view)) return;
+  }
+  typedef BinOp1D<ViewType> CompType;
+
+  Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
+  Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
+  parallel_reduce("Kokkos::Sort::FindExtent",Kokkos::RangePolicy<typename ViewType::execution_space>(0,view.extent(0)),
+                  Impl::min_max_functor<ViewType>(view),reducer);
+  if(result.min_val == result.max_val) return;
+  BinSort<ViewType, CompType> bin_sort(view,CompType(view.extent(0)/2,result.min_val,result.max_val),true);
+  bin_sort.create_permute_vector();
+  bin_sort.sort(view);
+}
+
+template<class ViewType>
+void sort( ViewType view
+         , size_t const begin
+         , size_t const end
+         )
+{
+  typedef Kokkos::RangePolicy<typename ViewType::execution_space> range_policy ;
+  typedef BinOp1D<ViewType> CompType;
+
+  Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
+  Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
+
+  parallel_reduce("Kokkos::Sort::FindExtent", range_policy( begin , end )
+                 , Impl::min_max_functor<ViewType>(view),reducer );
+
+  if(result.min_val == result.max_val) return;
+
+  BinSort<ViewType, CompType>
+    bin_sort(view,begin,end,CompType((end-begin)/2,result.min_val,result.max_val),true);
+
+  bin_sort.create_permute_vector();
+  bin_sort.sort(view,begin,end);
+}
+
+}
+
+#endif
diff --git a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f5aa24e9beda5eb528d9779f55d8860536629363
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt
@@ -0,0 +1,64 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
+
+IF(NOT KOKKOS_HAS_TRILINOS)
+  IF(KOKKOS_SEPARATE_LIBS)
+    set(TEST_LINK_TARGETS kokkoscore)
+  ELSE()
+    set(TEST_LINK_TARGETS kokkos)
+  ENDIF()
+ENDIF()
+
+SET(GTEST_SOURCE_DIR ${${PARENT_PACKAGE_NAME}_SOURCE_DIR}/tpls/gtest)
+INCLUDE_DIRECTORIES(${GTEST_SOURCE_DIR})
+
+# mfh 03 Nov 2017: The gtest library used here must have a different
+# name than that of the gtest library built in KokkosCore.  We can't
+# just refer to the library in KokkosCore's tests, because it's
+# possible to build only (e.g.,) KokkosAlgorithms tests, without
+# building KokkosCore tests.
+
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGTEST_HAS_PTHREAD=0")
+
+TRIBITS_ADD_LIBRARY(
+  kokkosalgorithms_gtest
+  HEADERS ${GTEST_SOURCE_DIR}/gtest/gtest.h
+  SOURCES ${GTEST_SOURCE_DIR}/gtest/gtest-all.cc
+  TESTONLY
+  )
+
+SET(SOURCES
+  UnitTestMain.cpp 
+  TestCuda.cpp
+  )
+
+SET(LIBRARIES kokkoscore)
+
+IF(Kokkos_ENABLE_OpenMP)
+  LIST( APPEND SOURCES
+    TestOpenMP.cpp
+  )
+ENDIF()
+
+IF(Kokkos_ENABLE_Serial)
+  LIST( APPEND SOURCES
+    TestSerial.cpp
+  )
+ENDIF()
+
+IF(Kokkos_ENABLE_Pthread)
+  LIST( APPEND SOURCES
+    TestThreads.cpp
+  )
+ENDIF()
+
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkosalgorithms_gtest ${TEST_LINK_TARGETS}
+  )
diff --git a/packages/kokkos/algorithms/unit_tests/Makefile b/packages/kokkos/algorithms/unit_tests/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..b5848c451e6e1b4f817f3a446911d4d246f2e6d5
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/Makefile
@@ -0,0 +1,101 @@
+KOKKOS_PATH = ../..
+
+GTEST_PATH = ../../TPL/gtest
+
+vpath %.cpp ${KOKKOS_PATH}/algorithms/unit_tests
+
+default: build_all
+	echo "End Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
+else
+  CXX = g++
+endif
+
+CXXFLAGS = -O3
+LINK ?= $(CXX)
+LDFLAGS ?=
+override LDFLAGS += -lpthread
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests
+
+TEST_TARGETS =
+TARGETS =
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_Cuda
+	TEST_TARGETS += test-cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+	OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_ROCm
+	TEST_TARGETS += test-rocm
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_Threads
+	TEST_TARGETS += test-threads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	OBJ_OPENMP = TestOpenMP.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_OpenMP
+	TEST_TARGETS += test-openmp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+	OBJ_SERIAL = TestSerial.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_Serial
+	TEST_TARGETS += test-serial
+endif
+
+KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda
+
+KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm
+
+KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads
+
+KokkosAlgorithms_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_OpenMP
+
+KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Serial
+
+test-cuda: KokkosAlgorithms_UnitTest_Cuda
+	./KokkosAlgorithms_UnitTest_Cuda
+
+test-rocm: KokkosAlgorithms_UnitTest_ROCm
+	./KokkosAlgorithms_UnitTest_ROCm
+
+test-threads: KokkosAlgorithms_UnitTest_Threads
+	./KokkosAlgorithms_UnitTest_Threads
+
+test-openmp: KokkosAlgorithms_UnitTest_OpenMP
+	./KokkosAlgorithms_UnitTest_OpenMP
+
+test-serial: KokkosAlgorithms_UnitTest_Serial
+	./KokkosAlgorithms_UnitTest_Serial
+
+build_all: $(TARGETS)
+
+test: $(TEST_TARGETS)
+
+clean: kokkos-clean
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
+gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
diff --git a/packages/kokkos/algorithms/unit_tests/TestCuda.cpp b/packages/kokkos/algorithms/unit_tests/TestCuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..86fdccd0e784e8bc56472782a87a31a050f468fe
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestCuda.cpp
@@ -0,0 +1,107 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <cstdint>
+#include <iostream>
+#include <iomanip>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+
+namespace Test {
+
+class cuda : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+  }
+  static void TearDownTestCase()
+  {
+  }
+};
+
+void cuda_test_random_xorshift64( int num_draws  )
+{
+  Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Cuda> >(num_draws);
+}
+
+void cuda_test_random_xorshift1024( int num_draws  )
+{
+  Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Cuda> >(num_draws);
+}
+
+
+#define CUDA_RANDOM_XORSHIFT64( num_draws )                                \
+  TEST_F( cuda, Random_XorShift64 ) {   \
+  cuda_test_random_xorshift64(num_draws);                                   \
+  }
+
+#define CUDA_RANDOM_XORSHIFT1024( num_draws )                                \
+  TEST_F( cuda, Random_XorShift1024 ) {   \
+  cuda_test_random_xorshift1024(num_draws);                                   \
+  }
+
+#define CUDA_SORT_UNSIGNED( size )                                \
+  TEST_F( cuda, SortUnsigned ) {   \
+      Impl::test_sort< Kokkos::Cuda, unsigned >(size);                                   \
+  }
+
+CUDA_RANDOM_XORSHIFT64(  132141141 )
+CUDA_RANDOM_XORSHIFT1024( 52428813 )
+CUDA_SORT_UNSIGNED(171)
+
+#undef CUDA_RANDOM_XORSHIFT64
+#undef CUDA_RANDOM_XORSHIFT1024
+#undef CUDA_SORT_UNSIGNED
+}
+#else
+void KOKKOS_ALGORITHMS_UNITTESTS_TESTCUDA_PREVENT_LINK_ERROR() {}
+#endif  /* #ifdef KOKKOS_ENABLE_CUDA */
+
diff --git a/packages/kokkos/algorithms/unit_tests/TestOpenMP.cpp b/packages/kokkos/algorithms/unit_tests/TestOpenMP.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c4ddde7b7f71995bb25655cbbb5270b671db7f3e
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestOpenMP.cpp
@@ -0,0 +1,96 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_OPENMP
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+#include <iomanip>
+
+namespace Test {
+
+class openmp : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+  }
+
+  static void TearDownTestCase()
+  {
+  }
+};
+
+#define OPENMP_RANDOM_XORSHIFT64( num_draws )                                         \
+  TEST_F( openmp, Random_XorShift64 ) {                                               \
+    Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::OpenMP> >(num_draws);    \
+  }
+
+#define OPENMP_RANDOM_XORSHIFT1024( num_draws )                                       \
+  TEST_F( openmp, Random_XorShift1024 ) {                                             \
+    Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::OpenMP> >(num_draws);  \
+  }
+
+#define OPENMP_SORT_UNSIGNED( size )                                                  \
+  TEST_F( openmp, SortUnsigned ) {                                                    \
+    Impl::test_sort< Kokkos::OpenMP, unsigned >(size);                                \
+  }
+
+OPENMP_RANDOM_XORSHIFT64( 10240000 )
+OPENMP_RANDOM_XORSHIFT1024( 10130144 )
+OPENMP_SORT_UNSIGNED(171)
+
+#undef OPENMP_RANDOM_XORSHIFT64
+#undef OPENMP_RANDOM_XORSHIFT1024
+#undef OPENMP_SORT_UNSIGNED
+} // namespace Test
+#else
+void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_PREVENT_LINK_ERROR() {}
+#endif
+
diff --git a/packages/kokkos/algorithms/unit_tests/TestROCm.cpp b/packages/kokkos/algorithms/unit_tests/TestROCm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..15179509bbfc1fe1e193081c8387e237dfa2525c
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestROCm.cpp
@@ -0,0 +1,108 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_ROCM
+
+#include <cstdint>
+#include <iostream>
+#include <iomanip>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+
+namespace Test {
+
+class rocm : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+  }
+  static void TearDownTestCase()
+  {
+  }
+};
+
+void rocm_test_random_xorshift64( int num_draws  )
+{
+  Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::ROCm> >(num_draws);
+}
+
+void rocm_test_random_xorshift1024( int num_draws  )
+{
+  Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::ROCm> >(num_draws);
+}
+
+
+#define ROCM_RANDOM_XORSHIFT64( num_draws )  \
+  TEST_F( rocm, Random_XorShift64 ) {        \
+  rocm_test_random_xorshift64(num_draws);    \
+  }
+
+#define ROCM_RANDOM_XORSHIFT1024( num_draws )  \
+  TEST_F( rocm, Random_XorShift1024 ) {        \
+  rocm_test_random_xorshift1024(num_draws);    \
+  }
+
+#define ROCM_SORT_UNSIGNED( size )                                    \
+  TEST_F( rocm, SortUnsigned ) {                                      \
+      Impl::test_sort< Kokkos::Experimental::ROCm, unsigned >(size);  \
+  }
+
+ROCM_RANDOM_XORSHIFT64(  132141141 )
+ROCM_RANDOM_XORSHIFT1024( 52428813 )
+ROCM_SORT_UNSIGNED(171)
+
+#undef ROCM_RANDOM_XORSHIFT64
+#undef ROCM_RANDOM_XORSHIFT1024
+#undef ROCM_SORT_UNSIGNED
+} // namespace Test
+#else
+void KOKKOS_ALGORITHMS_UNITTESTS_TESTROCM_PREVENT_LINK_ERROR() {}
+#endif  /* #ifdef KOKKOS_ENABLE_ROCM */
+
diff --git a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..73bd416f2aba49e311884096c31db295b41f1066
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
@@ -0,0 +1,481 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_RANDOM_HPP
+#define KOKKOS_TEST_RANDOM_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Random.hpp>
+#include <cmath>
+#include <chrono>
+
+namespace Test {
+
+namespace Impl{
+
+// This test runs the random number generators and uses some statistical tests to
+// check the 'goodness' of the random numbers:
+//    (i)   mean:          the mean is expected to be 0.5*RAND_MAX
+//    (ii)  variance:      the variance is 1/3*mean*mean
+//    (iii) covariance:    the covariance is 0
+//    (iv)  1-tuple distr: the mean, variance and covariance of a 1-D histogram of the random numbers
+//    (v)   3-tuple distr: the mean, variance and covariance of a 3-D histogram of the random numbers
+
+#define HIST_DIM3D 24
+#define HIST_DIM1D (HIST_DIM3D*HIST_DIM3D*HIST_DIM3D)
+
+struct RandomProperties {
+  uint64_t count;
+  double mean;
+  double variance;
+  double covariance;
+  double min;
+  double max;
+
+  KOKKOS_INLINE_FUNCTION
+  RandomProperties() {
+    count = 0;
+    mean = 0.0;
+    variance = 0.0;
+    covariance = 0.0;
+    min = 1e64;
+    max = -1e64;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  RandomProperties& operator+=(const RandomProperties& add) {
+    count      += add.count;
+    mean       += add.mean;
+    variance   += add.variance;
+    covariance += add.covariance;
+    min         = add.min<min?add.min:min;
+    max         = add.max>max?add.max:max;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator+=(const volatile RandomProperties& add) volatile {
+    count      += add.count;
+    mean       += add.mean;
+    variance   += add.variance;
+    covariance += add.covariance;
+    min         = add.min<min?add.min:min;
+    max         = add.max>max?add.max:max;
+  }
+};
+
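+// Reduction functor: each work item checks out a generator from the pool,
+// draws 3*1024 values, accumulates count/mean/variance/covariance in the
+// RandomProperties reduction value, and bins the draws into the shared 1-D
+// and 3-D histograms via atomic increments.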
+template<class GeneratorPool, class Scalar>
+struct test_random_functor {
+  typedef typename GeneratorPool::generator_type rnd_type;
+
+  typedef RandomProperties value_type;
+  typedef typename GeneratorPool::device_type device_type;
+
+  GeneratorPool rand_pool;
+  const double mean;
+
+  // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define
+  // an exclusive upper bound on the range of random numbers that
+  // draw() can generate.  However, for the float specialization, some
+  // implementations might violate this upper bound, due to rounding
+  // error.  Just in case, we leave an extra space at the end of each
+  // dimension, in the View types below.
+  typedef Kokkos::View<int[HIST_DIM1D+1],typename GeneratorPool::device_type> type_1d;
+  type_1d density_1d;
+  typedef Kokkos::View<int[HIST_DIM3D+1][HIST_DIM3D+1][HIST_DIM3D+1],typename GeneratorPool::device_type> type_3d;
+  type_3d density_3d;
+
+  test_random_functor (GeneratorPool rand_pool_, type_1d d1d, type_3d d3d) :
+    rand_pool (rand_pool_),
+    mean (0.5*Kokkos::rand<rnd_type,Scalar>::max ()),
+    density_1d (d1d),
+    density_3d (d3d)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, RandomProperties& prop) const {
+    using Kokkos::atomic_fetch_add;
+
+    rnd_type rand_gen = rand_pool.get_state();
+    for (int k = 0; k < 1024; ++k) {
+      const Scalar tmp = Kokkos::rand<rnd_type,Scalar>::draw(rand_gen);
+      prop.count++;
+      prop.mean += tmp;
+      prop.variance += (tmp-mean)*(tmp-mean);
+      const Scalar tmp2 = Kokkos::rand<rnd_type,Scalar>::draw(rand_gen);
+      prop.count++;
+      prop.mean += tmp2;
+      prop.variance += (tmp2-mean)*(tmp2-mean);
+      prop.covariance += (tmp-mean)*(tmp2-mean);
+      const Scalar tmp3 = Kokkos::rand<rnd_type,Scalar>::draw(rand_gen);
+      prop.count++;
+      prop.mean += tmp3;
+      prop.variance += (tmp3-mean)*(tmp3-mean);
+      prop.covariance += (tmp2-mean)*(tmp3-mean);
+
+      // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to
+      // define an exclusive upper bound on the range of random
+      // numbers that draw() can generate.  However, for the float
+      // specialization, some implementations might violate this upper
+      // bound, due to rounding error.  Just in case, we have left an
+      // extra space at the end of each dimension of density_1d and
+      // density_3d.
+      //
+      // Please note that those extra entries might not get counted in
+      // the histograms.  However, if Kokkos::rand is broken and only
+      // returns values of max(), the histograms will still catch this
+      // indirectly, since none of the other values will be filled in.
+
+      const Scalar theMax = Kokkos::rand<rnd_type, Scalar>::max ();
+
+      const uint64_t ind1_1d = static_cast<uint64_t> (1.0 * HIST_DIM1D * tmp / theMax);
+      const uint64_t ind2_1d = static_cast<uint64_t> (1.0 * HIST_DIM1D * tmp2 / theMax);
+      const uint64_t ind3_1d = static_cast<uint64_t> (1.0 * HIST_DIM1D * tmp3 / theMax);
+
+      const uint64_t ind1_3d = static_cast<uint64_t> (1.0 * HIST_DIM3D * tmp / theMax);
+      const uint64_t ind2_3d = static_cast<uint64_t> (1.0 * HIST_DIM3D * tmp2 / theMax);
+      const uint64_t ind3_3d = static_cast<uint64_t> (1.0 * HIST_DIM3D * tmp3 / theMax);
+
+      atomic_fetch_add (&density_1d(ind1_1d), 1);
+      atomic_fetch_add (&density_1d(ind2_1d), 1);
+      atomic_fetch_add (&density_1d(ind3_1d), 1);
+      atomic_fetch_add (&density_3d(ind1_3d, ind2_3d, ind3_3d), 1);
+    }
+    rand_pool.free_state(rand_gen);
+  }
+};
+
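+// Reduction functor over the 1-D histogram: each bin is expected to hold
+// 3*num_draws/HIST_DIM1D entries; accumulates the deviation from that mean
+// as well as the covariance of neighboring bins.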
+template<class DeviceType>
+struct test_histogram1d_functor {
+  typedef RandomProperties value_type;
+  typedef typename DeviceType::execution_space execution_space;
+  typedef typename DeviceType::memory_space memory_space;
+
+  // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define
+  // an exclusive upper bound on the range of random numbers that
+  // draw() can generate.  However, for the float specialization, some
+  // implementations might violate this upper bound, due to rounding
+  // error.  Just in case, we leave an extra space at the end of each
+  // dimension, in the View type below.
+  typedef Kokkos::View<int[HIST_DIM1D+1], memory_space> type_1d;
+  type_1d density_1d;
+  double mean;
+
+  test_histogram1d_functor (type_1d d1d, int num_draws) :
+    density_1d (d1d),
+    mean (1.0*num_draws/HIST_DIM1D*3)
+  {
+  }
+
+  KOKKOS_INLINE_FUNCTION void
+  operator() (const typename memory_space::size_type i,
+              RandomProperties& prop) const
+  {
+    typedef typename memory_space::size_type size_type;
+    const double count = density_1d(i);
+    prop.mean += count;
+    prop.variance += 1.0 * (count - mean) * (count - mean);
+    //prop.covariance += 1.0*count*count;
+    prop.min = count < prop.min ? count : prop.min;
+    prop.max = count > prop.max ? count : prop.max;
+    if (i < static_cast<size_type> (HIST_DIM1D-1)) {
+      prop.covariance += (count - mean) * (density_1d(i+1) - mean);
+    }
+  }
+};
+
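+// Same check for the 3-D histogram: the linear index i is unpacked into
+// (x,y,z) bin coordinates of density_3d; each bin is expected to hold
+// num_draws/HIST_DIM1D entries.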
+template<class DeviceType>
+struct test_histogram3d_functor {
+  typedef RandomProperties value_type;
+  typedef typename DeviceType::execution_space execution_space;
+  typedef typename DeviceType::memory_space memory_space;
+
+  // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define
+  // an exclusive upper bound on the range of random numbers that
+  // draw() can generate.  However, for the float specialization, some
+  // implementations might violate this upper bound, due to rounding
+  // error.  Just in case, we leave an extra space at the end of each
+  // dimension, in the View type below.
+  typedef Kokkos::View<int[HIST_DIM3D+1][HIST_DIM3D+1][HIST_DIM3D+1], memory_space> type_3d;
+  type_3d density_3d;
+  double mean;
+
+  test_histogram3d_functor (type_3d d3d, int num_draws) :
+    density_3d (d3d),
+    mean (1.0*num_draws/HIST_DIM1D)
+  {}
+
+  KOKKOS_INLINE_FUNCTION void
+  operator() (const typename memory_space::size_type i,
+              RandomProperties& prop) const
+  {
+    typedef typename memory_space::size_type size_type;
+    const double count = density_3d(i/(HIST_DIM3D*HIST_DIM3D),
+                                    (i % (HIST_DIM3D*HIST_DIM3D))/HIST_DIM3D,
+                                    i % HIST_DIM3D);
+    prop.mean += count;
+    prop.variance += (count - mean) * (count - mean);
+    if (i < static_cast<size_type> (HIST_DIM1D-1)) {
+      const double count_next = density_3d((i+1)/(HIST_DIM3D*HIST_DIM3D),
+                                           ((i+1)%(HIST_DIM3D*HIST_DIM3D))/HIST_DIM3D,
+                                           (i+1)%HIST_DIM3D);
+      prop.covariance += (count - mean) * (count_next - mean);
+    }
+  }
+};
+
+//
+// Templated test that uses the above functors.
+//
+template <class RandomGenerator,class Scalar>
+struct test_random_scalar {
+  typedef typename RandomGenerator::generator_type rnd_type;
+
+  int pass_mean,pass_var,pass_covar;
+  int pass_hist1d_mean,pass_hist1d_var,pass_hist1d_covar;
+  int pass_hist3d_mean,pass_hist3d_var,pass_hist3d_covar;
+
+  test_random_scalar (typename test_random_functor<RandomGenerator,int>::type_1d& density_1d,
+                      typename test_random_functor<RandomGenerator,int>::type_3d& density_3d,
+                      RandomGenerator& pool,
+                      unsigned int num_draws)
+  {
+    using std::cout;
+    using std::endl;
+    using Kokkos::parallel_reduce;
+
+    {
+      cout << " -- Testing randomness properties" << endl;
+
+      RandomProperties result;
+      typedef test_random_functor<RandomGenerator, Scalar> functor_type;
+      parallel_reduce (num_draws/1024, functor_type (pool, density_1d, density_3d), result);
+
+      //printf("Result: %lf %lf %lf\n",result.mean/num_draws/3,result.variance/num_draws/3,result.covariance/num_draws/2);
+      double tolerance = 1.6*std::sqrt(1.0/num_draws);
+      double mean_expect = 0.5*Kokkos::rand<rnd_type,Scalar>::max();
+      double variance_expect = 1.0/3.0*mean_expect*mean_expect;
+      double mean_eps = mean_expect/(result.mean/num_draws/3)-1.0;
+      double variance_eps = variance_expect/(result.variance/num_draws/3)-1.0;
+      double covariance_eps = result.covariance/num_draws/2/variance_expect;
+      pass_mean  = ((-tolerance < mean_eps) &&
+                    ( tolerance > mean_eps)) ? 1:0;
+      pass_var   = ((-1.5*tolerance < variance_eps) &&
+                    ( 1.5*tolerance > variance_eps)) ? 1:0;
+      pass_covar = ((-2.0*tolerance < covariance_eps) &&
+                    ( 2.0*tolerance > covariance_eps)) ? 1:0;
+      cout << "Pass: " << pass_mean
+           << " " << pass_var
+           << " " << mean_eps
+           << " " << variance_eps
+           << " " << covariance_eps
+           << " || " << tolerance << endl;
+    }
+    {
+      cout << " -- Testing 1-D histogram" << endl;
+
+      RandomProperties result;
+      typedef test_histogram1d_functor<typename RandomGenerator::device_type> functor_type;
+      parallel_reduce (HIST_DIM1D, functor_type (density_1d, num_draws), result);
+
+      double tolerance = 6*std::sqrt(1.0/HIST_DIM1D);
+      double mean_expect = 1.0*num_draws*3/HIST_DIM1D;
+      double variance_expect = 1.0*num_draws*3/HIST_DIM1D*(1.0-1.0/HIST_DIM1D);
+      double covariance_expect = -1.0*num_draws*3/HIST_DIM1D/HIST_DIM1D;
+      double mean_eps = mean_expect/(result.mean/HIST_DIM1D)-1.0;
+      double variance_eps = variance_expect/(result.variance/HIST_DIM1D)-1.0;
+      double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
+      pass_hist1d_mean  = ((-0.0001 < mean_eps) &&
+                           ( 0.0001 > mean_eps)) ? 1:0;
+      pass_hist1d_var   = ((-0.07 < variance_eps) &&
+                           ( 0.07 > variance_eps)) ? 1:0;
+      pass_hist1d_covar = ((-0.06 < covariance_eps) &&
+                           ( 0.06 > covariance_eps)) ? 1:0;
+
+      cout << "Density 1D: " << mean_eps
+           << " " << variance_eps
+           << " " << (result.covariance/HIST_DIM1D/HIST_DIM1D)
+           << " || " << tolerance
+           << " " << result.min
+           << " " << result.max
+           << " || " << result.variance/HIST_DIM1D
+           << " " << 1.0*num_draws*3/HIST_DIM1D*(1.0-1.0/HIST_DIM1D)
+           << " || " << result.covariance/HIST_DIM1D
+           << " " << -1.0*num_draws*3/HIST_DIM1D/HIST_DIM1D
+           << endl;
+    }
+    {
+      cout << " -- Testing 3-D histogram" << endl;
+
+      RandomProperties result;
+      typedef test_histogram3d_functor<typename RandomGenerator::device_type> functor_type;
+      parallel_reduce (HIST_DIM1D, functor_type (density_3d, num_draws), result);
+
+      double tolerance = 6*std::sqrt(1.0/HIST_DIM1D);
+      double mean_expect = 1.0*num_draws/HIST_DIM1D;
+      double variance_expect = 1.0*num_draws/HIST_DIM1D*(1.0-1.0/HIST_DIM1D);
+      double covariance_expect = -1.0*num_draws/HIST_DIM1D/HIST_DIM1D;
+      double mean_eps = mean_expect/(result.mean/HIST_DIM1D)-1.0;
+      double variance_eps = variance_expect/(result.variance/HIST_DIM1D)-1.0;
+      double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
+      pass_hist3d_mean  = ((-tolerance < mean_eps) &&
+                           ( tolerance > mean_eps)) ? 1:0;
+      pass_hist3d_var   = ((-1.2*tolerance < variance_eps) &&
+                           ( 1.2*tolerance > variance_eps)) ? 1:0;
+      pass_hist3d_covar = ((-tolerance < covariance_eps) &&
+                           ( tolerance > covariance_eps)) ? 1:0;
+
+      cout << "Density 3D: " << mean_eps
+           << " " << variance_eps
+           << " " << result.covariance/HIST_DIM1D/HIST_DIM1D
+           << " || " << tolerance
+           << " " << result.min
+           << " " << result.max << endl;
+    }
+  }
+};
+
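+// Driver: seeds the generator pool from the high-resolution clock, then runs
+// the statistical checks above for every supported scalar type, clearing the
+// histograms between runs.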
+template <class RandomGenerator>
+void test_random(unsigned int num_draws)
+{
+  using std::cout;
+  using std::endl;
+  typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d");
+  typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d");
+
+
+  uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count();
+  cout << "Test Seed:" << ticks << endl;
+
+  RandomGenerator pool(ticks);
+
+  cout << "Test Scalar=int" << endl;
+  test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_int.pass_mean,1);
+  ASSERT_EQ( test_int.pass_var,1);
+  ASSERT_EQ( test_int.pass_covar,1);
+  ASSERT_EQ( test_int.pass_hist1d_mean,1);
+  ASSERT_EQ( test_int.pass_hist1d_var,1);
+  ASSERT_EQ( test_int.pass_hist1d_covar,1);
+  ASSERT_EQ( test_int.pass_hist3d_mean,1);
+  ASSERT_EQ( test_int.pass_hist3d_var,1);
+  ASSERT_EQ( test_int.pass_hist3d_covar,1);
+  deep_copy(density_1d,0);
+  deep_copy(density_3d,0);
+
+  cout << "Test Scalar=unsigned int" << endl;
+  test_random_scalar<RandomGenerator,unsigned int> test_uint(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_uint.pass_mean,1);
+  ASSERT_EQ( test_uint.pass_var,1);
+  ASSERT_EQ( test_uint.pass_covar,1);
+  ASSERT_EQ( test_uint.pass_hist1d_mean,1);
+  ASSERT_EQ( test_uint.pass_hist1d_var,1);
+  ASSERT_EQ( test_uint.pass_hist1d_covar,1);
+  ASSERT_EQ( test_uint.pass_hist3d_mean,1);
+  ASSERT_EQ( test_uint.pass_hist3d_var,1);
+  ASSERT_EQ( test_uint.pass_hist3d_covar,1);
+  deep_copy(density_1d,0);
+  deep_copy(density_3d,0);
+
+  cout << "Test Scalar=int64_t" << endl;
+  test_random_scalar<RandomGenerator,int64_t> test_int64(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_int64.pass_mean,1);
+  ASSERT_EQ( test_int64.pass_var,1);
+  ASSERT_EQ( test_int64.pass_covar,1);
+  ASSERT_EQ( test_int64.pass_hist1d_mean,1);
+  ASSERT_EQ( test_int64.pass_hist1d_var,1);
+  ASSERT_EQ( test_int64.pass_hist1d_covar,1);
+  ASSERT_EQ( test_int64.pass_hist3d_mean,1);
+  ASSERT_EQ( test_int64.pass_hist3d_var,1);
+  ASSERT_EQ( test_int64.pass_hist3d_covar,1);
+  deep_copy(density_1d,0);
+  deep_copy(density_3d,0);
+
+  cout << "Test Scalar=uint64_t" << endl;
+  test_random_scalar<RandomGenerator,uint64_t> test_uint64(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_uint64.pass_mean,1);
+  ASSERT_EQ( test_uint64.pass_var,1);
+  ASSERT_EQ( test_uint64.pass_covar,1);
+  ASSERT_EQ( test_uint64.pass_hist1d_mean,1);
+  ASSERT_EQ( test_uint64.pass_hist1d_var,1);
+  ASSERT_EQ( test_uint64.pass_hist1d_covar,1);
+  ASSERT_EQ( test_uint64.pass_hist3d_mean,1);
+  ASSERT_EQ( test_uint64.pass_hist3d_var,1);
+  ASSERT_EQ( test_uint64.pass_hist3d_covar,1);
+  deep_copy(density_1d,0);
+  deep_copy(density_3d,0);
+
+  cout << "Test Scalar=float" << endl;
+  test_random_scalar<RandomGenerator,float> test_float(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_float.pass_mean,1);
+  ASSERT_EQ( test_float.pass_var,1);
+  ASSERT_EQ( test_float.pass_covar,1);
+  ASSERT_EQ( test_float.pass_hist1d_mean,1);
+  ASSERT_EQ( test_float.pass_hist1d_var,1);
+  ASSERT_EQ( test_float.pass_hist1d_covar,1);
+  ASSERT_EQ( test_float.pass_hist3d_mean,1);
+  ASSERT_EQ( test_float.pass_hist3d_var,1);
+  ASSERT_EQ( test_float.pass_hist3d_covar,1);
+  deep_copy(density_1d,0);
+  deep_copy(density_3d,0);
+
+  cout << "Test Scalar=double" << endl;
+  test_random_scalar<RandomGenerator,double> test_double(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_double.pass_mean,1);
+  ASSERT_EQ( test_double.pass_var,1);
+  ASSERT_EQ( test_double.pass_covar,1);
+  ASSERT_EQ( test_double.pass_hist1d_mean,1);
+  ASSERT_EQ( test_double.pass_hist1d_var,1);
+  ASSERT_EQ( test_double.pass_hist1d_covar,1);
+  ASSERT_EQ( test_double.pass_hist3d_mean,1);
+  ASSERT_EQ( test_double.pass_hist3d_var,1);
+  ASSERT_EQ( test_double.pass_hist3d_covar,1);
+}
+} // namespace Impl
+
+} // namespace Test
+
+#endif // KOKKOS_TEST_RANDOM_HPP
diff --git a/packages/kokkos/algorithms/unit_tests/TestSerial.cpp b/packages/kokkos/algorithms/unit_tests/TestSerial.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9cf998f7732628bf71e8bc8ea90fcf6e41a27f3d
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestSerial.cpp
@@ -0,0 +1,100 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_SERIAL
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+#include <iomanip>
+
+
+//----------------------------------------------------------------------------
+
+
+namespace Test {
+
+class serial : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+  }
+
+  static void TearDownTestCase ()
+  {
+  }
+};
+
+#define SERIAL_RANDOM_XORSHIFT64( num_draws )  \
+  TEST_F( serial, Random_XorShift64 ) {                                \
+    Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Serial> >(num_draws); \
+  }
+
+#define SERIAL_RANDOM_XORSHIFT1024( num_draws )        \
+  TEST_F( serial, Random_XorShift1024 ) {                              \
+    Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Serial> >(num_draws); \
+  }
+
+#define SERIAL_SORT_UNSIGNED( size )                    \
+  TEST_F( serial, SortUnsigned ) {                      \
+    Impl::test_sort< Kokkos::Serial, unsigned >(size);  \
+  }
+
+SERIAL_RANDOM_XORSHIFT64( 10240000 )
+SERIAL_RANDOM_XORSHIFT1024( 10130144 )
+SERIAL_SORT_UNSIGNED(171)
+
+#undef SERIAL_RANDOM_XORSHIFT64
+#undef SERIAL_RANDOM_XORSHIFT1024
+#undef SERIAL_SORT_UNSIGNED
+
+} // namespace Test
+#else
+void KOKKOS_ALGORITHMS_UNITTESTS_TESTSERIAL_PREVENT_LINK_ERROR() {}
+#endif // KOKKOS_ENABLE_SERIAL
+
+
diff --git a/packages/kokkos/algorithms/unit_tests/TestSort.hpp b/packages/kokkos/algorithms/unit_tests/TestSort.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e0c646c199be04841fbf0354905231ec00abf322
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestSort.hpp
@@ -0,0 +1,347 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_HPP
+#define KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_HPP
+
+#include <gtest/gtest.h>
+#include<Kokkos_Core.hpp>
+#include<Kokkos_DynamicView.hpp>
+#include<Kokkos_Random.hpp>
+#include<Kokkos_Sort.hpp>
+
+namespace Test {
+
+namespace Impl{
+
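+// Counts adjacent key pairs that are out of order; a correctly sorted view
+// reduces to zero.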
+template<class ExecutionSpace, class Scalar>
+struct is_sorted_struct {
+  typedef unsigned int value_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<Scalar*,ExecutionSpace> keys;
+
+  is_sorted_struct(Kokkos::View<Scalar*,ExecutionSpace> keys_):keys(keys_) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, unsigned int& count) const {
+    if(keys(i)>keys(i+1)) count++;
+  }
+};
+
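+// Sums all keys; the sum is invariant under sorting, so it is used to check
+// that the sort only permutes the data.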
+template<class ExecutionSpace, class Scalar>
+struct sum {
+  typedef double value_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<Scalar*,ExecutionSpace> keys;
+
+  sum(Kokkos::View<Scalar*,ExecutionSpace> keys_):keys(keys_) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, double& count) const {
+    count+=keys(i);
+  }
+};
+
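+// Counts consecutive 3-D keys that fall into a lexicographically earlier bin;
+// used to verify the result of BinSort with BinOp3D.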
+template<class ExecutionSpace, class Scalar>
+struct bin3d_is_sorted_struct {
+  typedef unsigned int value_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<Scalar*[3],ExecutionSpace> keys;
+
+  int max_bins;
+  Scalar min;
+  Scalar max;
+
+  bin3d_is_sorted_struct(Kokkos::View<Scalar*[3],ExecutionSpace> keys_,int max_bins_,Scalar min_,Scalar max_):
+    keys(keys_),max_bins(max_bins_),min(min_),max(max_) {
+  }
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, unsigned int& count) const {
+    int ix1 = int ((keys(i,0)-min)/max * max_bins);
+    int iy1 = int ((keys(i,1)-min)/max * max_bins);
+    int iz1 = int ((keys(i,2)-min)/max * max_bins);
+    int ix2 = int ((keys(i+1,0)-min)/max * max_bins);
+    int iy2 = int ((keys(i+1,1)-min)/max * max_bins);
+    int iz2 = int ((keys(i+1,2)-min)/max * max_bins);
+
+    if (ix1>ix2)  count++;
+    else if(ix1==ix2) {
+      if (iy1>iy2)  count++;
+      else if ((iy1==iy2) && (iz1>iz2))  count++;
+    }
+  }
+};
+
+template<class ExecutionSpace, class Scalar>
+struct sum3D {
+  typedef double value_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<Scalar*[3],ExecutionSpace> keys;
+
+  sum3D(Kokkos::View<Scalar*[3],ExecutionSpace> keys_):keys(keys_) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, double& count) const {
+    count+=keys(i,0);
+    count+=keys(i,1);
+    count+=keys(i,2);
+  }
+};
+
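+// 1-D sort test: sorts an all-equal array first (degenerate case), then random
+// keys, and checks that no adjacent inversions remain and that the key sum is
+// preserved to within a relative tolerance of 1e-10.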
+template<class ExecutionSpace, typename KeyType>
+void test_1D_sort(unsigned int n,bool force_kokkos) {
+  typedef Kokkos::View<KeyType*,ExecutionSpace> KeyViewType;
+  KeyViewType keys("Keys",n);
+
+  // Test sorting array with all numbers equal
+  Kokkos::deep_copy(keys,KeyType(1));
+  Kokkos::sort(keys,force_kokkos);
+
+  Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
+  Kokkos::fill_random(keys,g,Kokkos::Random_XorShift64_Pool<ExecutionSpace>::generator_type::MAX_URAND);
+
+  double sum_before = 0.0;
+  double sum_after = 0.0;
+  unsigned int sort_fails = 0;
+
+  Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys),sum_before);
+
+  Kokkos::sort(keys,force_kokkos);
+
+  Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys),sum_after);
+  Kokkos::parallel_reduce(n-1,is_sorted_struct<ExecutionSpace, KeyType>(keys),sort_fails);
+
+  double ratio = sum_before/sum_after;
+  double epsilon = 1e-10;
+  unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
+
+  ASSERT_EQ(sort_fails,0);
+  ASSERT_EQ(equal_sum,1);
+}
+
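+// 3-D sort test: bins n*n*n random 3-D keys with BinOp3D/BinSort and verifies
+// that consecutive entries never fall into a lexicographically earlier bin and
+// that the coordinate sum is preserved.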
+template<class ExecutionSpace, typename KeyType>
+void test_3D_sort(unsigned int n) {
+  typedef Kokkos::View<KeyType*[3],ExecutionSpace > KeyViewType;
+
+  KeyViewType keys("Keys",n*n*n);
+
+  Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
+  Kokkos::fill_random(keys,g,100.0);
+
+  double sum_before = 0.0;
+  double sum_after = 0.0;
+  unsigned int sort_fails = 0;
+
+  Kokkos::parallel_reduce(keys.extent(0),sum3D<ExecutionSpace, KeyType>(keys),sum_before);
+
+  int bin_1d = 1;
+  while( bin_1d*bin_1d*bin_1d*4< (int) keys.extent(0) ) bin_1d*=2;
+  int bin_max[3] = {bin_1d,bin_1d,bin_1d};
+  typename KeyViewType::value_type min[3] = {0,0,0};
+  typename KeyViewType::value_type max[3] = {100,100,100};
+
+  typedef Kokkos::BinOp3D< KeyViewType > BinOp;
+  BinOp bin_op(bin_max,min,max);
+  Kokkos::BinSort< KeyViewType , BinOp >
+    Sorter(keys,bin_op,false);
+  Sorter.create_permute_vector();
+  Sorter.template sort< KeyViewType >(keys);
+
+  Kokkos::parallel_reduce(keys.extent(0),sum3D<ExecutionSpace, KeyType>(keys),sum_after);
+  Kokkos::parallel_reduce(keys.extent(0)-1,bin3d_is_sorted_struct<ExecutionSpace, KeyType>(keys,bin_1d,min[0],max[0]),sort_fails);
+
+  double ratio = sum_before/sum_after;
+  double epsilon = 1e-10;
+  unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
+
+  if ( sort_fails )
+    printf("3D Sort Sum: %f %f Fails: %u\n",sum_before,sum_after,sort_fails);
+
+  ASSERT_EQ(sort_fails,0);
+  ASSERT_EQ(equal_sum,1);
+}
+
+//----------------------------------------------------------------------------
+
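+// Same checks as the 1-D test, but for a Kokkos::Experimental::DynamicView:
+// data is staged through an ordinary View, sorted over the range [0, n), and
+// copied back for verification.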
+template<class ExecutionSpace, typename KeyType>
+void test_dynamic_view_sort(unsigned int n )
+{
+  typedef Kokkos::Experimental::DynamicView<KeyType*,ExecutionSpace> KeyDynamicViewType;
+  typedef Kokkos::View<KeyType*,ExecutionSpace> KeyViewType;
+
+  const size_t upper_bound = 2 * n ;
+  const size_t min_chunk_size = 1024;
+
+  KeyDynamicViewType keys("Keys", min_chunk_size, upper_bound);
+
+  keys.resize_serial(n);
+
+  KeyViewType keys_view("KeysTmp", n );
+
+  // Test sorting array with all numbers equal
+  Kokkos::deep_copy(keys_view,KeyType(1));
+  Kokkos::deep_copy(keys,keys_view);
+  Kokkos::sort(keys, 0 /* begin */ , n /* end */ );
+
+  Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
+  Kokkos::fill_random(keys_view,g,Kokkos::Random_XorShift64_Pool<ExecutionSpace>::generator_type::MAX_URAND);
+
+  ExecutionSpace::fence();
+  Kokkos::deep_copy(keys,keys_view);
+  //ExecutionSpace::fence();
+
+  double sum_before = 0.0;
+  double sum_after = 0.0;
+  unsigned int sort_fails = 0;
+
+  Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys_view),sum_before);
+
+  Kokkos::sort(keys, 0 /* begin */ , n /* end */ );
+
+  ExecutionSpace::fence(); // Need this fence to prevent BusError with Cuda
+  Kokkos::deep_copy( keys_view , keys );
+  //ExecutionSpace::fence();
+
+  Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys_view),sum_after);
+  Kokkos::parallel_reduce(n-1,is_sorted_struct<ExecutionSpace, KeyType>(keys_view),sort_fails);
+
+  double ratio = sum_before/sum_after;
+  double epsilon = 1e-10;
+  unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
+
+  if ( sort_fails != 0 || equal_sum != 1 ) {
+    std::cout << " N = " << n
+              << " ; sum_before = " << sum_before
+              << " ; sum_after = " << sum_after
+              << " ; ratio = " << ratio
+              << std::endl ;
+  }
+
+  ASSERT_EQ(sort_fails,0);
+  ASSERT_EQ(equal_sum,1);
+}
+
+//----------------------------------------------------------------------------
+
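+// Regression test for kokkos/kokkos issue #1160: a BinSort constructed over
+// the sub-range [3, 8) must permute only that range, leave the remaining
+// entries untouched, and apply the identical permutation to the auxiliary
+// x_ and v_ views.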
+template<class ExecutionSpace>
+void test_issue_1160()
+{
+  Kokkos::View<int*, ExecutionSpace> element_("element", 10);
+  Kokkos::View<double*, ExecutionSpace> x_("x", 10);
+  Kokkos::View<double*, ExecutionSpace> v_("y", 10);
+
+  auto h_element = Kokkos::create_mirror_view(element_);
+  auto h_x = Kokkos::create_mirror_view(x_);
+  auto h_v = Kokkos::create_mirror_view(v_);
+
+  h_element(0) = 9;
+  h_element(1) = 8;
+  h_element(2) = 7;
+  h_element(3) = 6;
+  h_element(4) = 5;
+  h_element(5) = 4;
+  h_element(6) = 3;
+  h_element(7) = 2;
+  h_element(8) = 1;
+  h_element(9) = 0;
+
+  for (int i = 0; i < 10; ++i) {
+    h_v.access(i, 0) = h_x.access(i, 0) = double(h_element(i));
+  }
+  Kokkos::deep_copy(element_, h_element);
+  Kokkos::deep_copy(x_, h_x);
+  Kokkos::deep_copy(v_, h_v);
+
+  typedef decltype(element_) KeyViewType;
+  typedef Kokkos::BinOp1D< KeyViewType > BinOp;
+
+  int begin = 3;
+  int end = 8;
+  auto max = h_element(begin);
+  auto min = h_element(end - 1);
+  BinOp binner(end - begin, min, max);
+
+  Kokkos::BinSort<KeyViewType , BinOp > Sorter(element_,begin,end,binner,false);
+  Sorter.create_permute_vector();
+  Sorter.sort(element_,begin,end);
+
+  Sorter.sort(x_,begin,end);
+  Sorter.sort(v_,begin,end);
+
+  Kokkos::deep_copy(h_element, element_);
+  Kokkos::deep_copy(h_x, x_);
+  Kokkos::deep_copy(h_v, v_);
+
+  ASSERT_EQ(h_element(0), 9);
+  ASSERT_EQ(h_element(1), 8);
+  ASSERT_EQ(h_element(2), 7);
+  ASSERT_EQ(h_element(3), 2);
+  ASSERT_EQ(h_element(4), 3);
+  ASSERT_EQ(h_element(5), 4);
+  ASSERT_EQ(h_element(6), 5);
+  ASSERT_EQ(h_element(7), 6);
+  ASSERT_EQ(h_element(8), 1);
+  ASSERT_EQ(h_element(9), 0);
+
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_EQ(h_element(i), int(h_x.access(i, 0)));
+    ASSERT_EQ(h_element(i), int(h_v.access(i, 0)));
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template<class ExecutionSpace, typename KeyType>
+void test_sort(unsigned int N)
+{
+  test_1D_sort<ExecutionSpace,KeyType>(N*N*N, true);
+  test_1D_sort<ExecutionSpace,KeyType>(N*N*N, false);
+#if !defined(KOKKOS_ENABLE_ROCM)
+  test_3D_sort<ExecutionSpace,KeyType>(N);
+  test_dynamic_view_sort<ExecutionSpace,KeyType>(N*N);
+#endif
+  test_issue_1160<ExecutionSpace>();
+}
+
+} // namespace Impl
+} // namespace Test
+#endif /* KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_HPP */
diff --git a/packages/kokkos/algorithms/unit_tests/TestThreads.cpp b/packages/kokkos/algorithms/unit_tests/TestThreads.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..99cdb7da92a253d9469b956c27ee06f212421c0d
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/TestThreads.cpp
@@ -0,0 +1,102 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_THREADS
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+#include <iomanip>
+
+
+//----------------------------------------------------------------------------
+
+
+namespace Test {
+
+class threads : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+  }
+
+  static void TearDownTestCase()
+  {
+  }
+};
+
+#define THREADS_RANDOM_XORSHIFT64( num_draws )                                          \
+  TEST_F( threads, Random_XorShift64 ) {                                                \
+    Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Threads> >(num_draws);     \
+  }
+
+#define THREADS_RANDOM_XORSHIFT1024( num_draws )                                        \
+  TEST_F( threads, Random_XorShift1024 ) {                                              \
+    Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Threads> >(num_draws);   \
+  }
+
+#define THREADS_SORT_UNSIGNED( size )                     \
+  TEST_F( threads, SortUnsigned ) {                       \
+    Impl::test_sort< Kokkos::Threads, unsigned >(size);   \
+  }
+
+
+THREADS_RANDOM_XORSHIFT64( 10240000 )
+THREADS_RANDOM_XORSHIFT1024( 10130144 )
+THREADS_SORT_UNSIGNED(171)
+
+#undef THREADS_RANDOM_XORSHIFT64
+#undef THREADS_RANDOM_XORSHIFT1024
+#undef THREADS_SORT_UNSIGNED
+
+} // namespace Test
+#else
+void KOKKOS_ALGORITHMS_UNITTESTS_TESTTHREADS_PREVENT_LINK_ERROR() {}
+#endif
+
+
diff --git a/packages/kokkos/algorithms/unit_tests/UnitTestMain.cpp b/packages/kokkos/algorithms/unit_tests/UnitTestMain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8feb08332fa4d976395edfaf59b8e8653484c0f2
--- /dev/null
+++ b/packages/kokkos/algorithms/unit_tests/UnitTestMain.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+
+int main(int argc, char *argv[]) {
+  Kokkos::initialize(argc,argv);
+  ::testing::InitGoogleTest(&argc,argv);
+  int result = RUN_ALL_TESTS();
+  Kokkos::finalize();
+  return result;
+}
+
diff --git a/packages/kokkos/benchmarks/atomic/Makefile b/packages/kokkos/benchmarks/atomic/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..64b43917de2eadf2f2f96419e83b74cee05265fa
--- /dev/null
+++ b/packages/kokkos/benchmarks/atomic/Makefile
@@ -0,0 +1,44 @@
+KOKKOS_PATH = ${HOME}/kokkos
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+EXE_NAME = "test"
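+
+# The assignments above are defaults; they can be overridden on the make
+# command line, e.g. (illustrative invocation, adjust path and architecture
+# for your system):
+#   make KOKKOS_PATH=/path/to/kokkos KOKKOS_DEVICES=Cuda KOKKOS_ARCH=Kepler35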
+
+SRC = $(wildcard *.cpp)
+
+default: build
+	echo "Start Build"
+
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+EXE = ${EXE_NAME}.cuda
+KOKKOS_CUDA_OPTIONS = "enable_lambda"
+else
+CXX = g++
+EXE = ${EXE_NAME}.host
+endif
+
+CXXFLAGS = -O3
+
+LINK = ${CXX}
+LINKFLAGS = -O3
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/packages/kokkos/benchmarks/atomic/main.cpp b/packages/kokkos/benchmarks/atomic/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d86d196249b337882993544101d93df35e6ccb22
--- /dev/null
+++ b/packages/kokkos/benchmarks/atomic/main.cpp
@@ -0,0 +1,124 @@
+#include<Kokkos_Core.hpp>
+#include<impl/Kokkos_Timer.hpp>
+#include<Kokkos_Random.hpp>
+
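+// Scattered-atomic benchmark kernel: each of the L iterations performs M
+// atomic_add updates into a length-N output array at randomized offsets, with
+// K multiply-add operations of work per update; the whole kernel is repeated
+// R times and timed.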
+template<class Scalar>
+double test_atomic(int L, int N, int M,int K,int R,Kokkos::View<const int**> offsets) {
+  Kokkos::View<Scalar*> output("Output",N);
+  Kokkos::Impl::Timer timer;
+
+  for(int r = 0; r<R; r++)
+  Kokkos::parallel_for(L, KOKKOS_LAMBDA (const int&i) {
+    Scalar s = 2;
+    for(int m=0;m<M;m++) {
+      for(int k=0;k<K;k++)
+        s=s*s+s;
+      const int idx = (i+offsets(i,m))%N;
+      Kokkos::atomic_add(&output(idx),s);
+    }
+  }); 
+  Kokkos::fence();
+  double time = timer.seconds();
+ 
+  return time;
+}
+
+template<class Scalar>
+double test_no_atomic(int L, int N, int M,int K,int R,Kokkos::View<const int**> offsets) {
+  Kokkos::View<Scalar*> output("Output",N);
+  Kokkos::Impl::Timer timer;
+  for(int r = 0; r<R; r++)
+  Kokkos::parallel_for(L, KOKKOS_LAMBDA (const int&i) {
+    Scalar s = 2;
+    for(int m=0;m<M;m++) {
+      for(int k=0;k<K;k++)
+        s=s*s+s;
+      const int idx = (i+offsets(i,m))%N;
+      output(idx) += s;
+    }
+  });
+  Kokkos::fence();
+  double time =  timer.seconds();
+  return time;
+}
+
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc,argv);
+{
+  if(argc<8) {
+    printf("Arguments: L N M D K R T\n");
+    printf("  L:   Number of iterations to run\n");
+    printf("  N:   Length of array to do atomics into\n");
+    printf("  M:   Number of atomics per iteration to do\n");
+    printf("  D:   Distance from index i to do atomics into (randomly)\n");
+    printf("  K:   Number of FMAD per atomic\n");
+    printf("  R:   Number of repeats of the experiments\n");
+    printf("  T:   Type of atomic\n");
+    printf("       1 - int\n");
+    printf("       2 - long\n");
+    printf("       3 - float\n");
+    printf("       4 - double\n");
+    printf("       5 - complex<double>\n");
+    printf("Example Input GPU:\n");
+    printf("  Histogram : 1000000 1000 1 1000 1 10 1\n");
+    printf("  MD Force : 100000 100000 100 1000 20 10 4\n");
+    printf("  Matrix Assembly : 100000 1000000 50 1000 20 10 4\n");
+    Kokkos::finalize();
+    return 0;
+  }
+
+
+  int L = atoi(argv[1]);
+  int N = atoi(argv[2]);
+  int M = atoi(argv[3]);
+  int D = atoi(argv[4]); 
+  int K = atoi(argv[5]);
+  int R = atoi(argv[6]); 
+  int type = atoi(argv[7]);
+ 
+  Kokkos::View<int**> offsets("Offsets",L,M);
+  Kokkos::Random_XorShift64_Pool<> pool(12371);
+  Kokkos::fill_random(offsets,pool,D);
+  double time = 0;
+  if(type==1)
+    time  = test_atomic<int>(L,N,M,K,R,offsets);
+  if(type==2)
+    time = test_atomic<long>(L,N,M,K,R,offsets);
+  if(type==3)
+    time = test_atomic<float>(L,N,M,K,R,offsets);
+  if(type==4)
+    time = test_atomic<double>(L,N,M,K,R,offsets);
+  if(type==5)
+    time = test_atomic<Kokkos::complex<double> >(L,N,M,K,R,offsets);
+
+  double time2 = 1;
+  if(type==1)
+    time2 = test_no_atomic<int>(L,N,M,K,R,offsets);
+  if(type==2)
+    time2 = test_no_atomic<long>(L,N,M,K,R,offsets);
+  if(type==3)
+    time2 = test_no_atomic<float>(L,N,M,K,R,offsets);
+  if(type==4)
+    time2 = test_no_atomic<double>(L,N,M,K,R,offsets);
+  if(type==5)
+    time2 = test_no_atomic<Kokkos::complex<double> >(L,N,M,K,R,offsets);
+
+  int size = 0;
+  if(type==1) size = sizeof(int);
+  if(type==2) size = sizeof(long);
+  if(type==3) size = sizeof(float);
+  if(type==4) size = sizeof(double);
+  if(type==5) size = sizeof(Kokkos::complex<double>);
+
+  printf("%i\n",size);
+  printf("Time: %s %i %i %i %i %i %i (t_atomic: %e t_nonatomic: %e ratio: %lf )( GUpdates/s: %lf GB/s: %lf )\n",
+    (type==1)?"int": (
+    (type==2)?"long": (
+    (type==3)?"float": (
+    (type==4)?"double":"complex"))),
+    L,N,M,D,K,R,time,time2,time/time2,
+    1.e-9*L*R*M/time, 1.0*L*R*M*2*size/time/1024/1024/1024);
+}
+  Kokkos::finalize();
+}
+
diff --git a/packages/kokkos/benchmarks/benchmark_suite/scripts/build_code.bash b/packages/kokkos/benchmarks/benchmark_suite/scripts/build_code.bash
new file mode 100755
index 0000000000000000000000000000000000000000..0b885293e27ad4810ecc89e2f0ffdc4cc61b2f2f
--- /dev/null
+++ b/packages/kokkos/benchmarks/benchmark_suite/scripts/build_code.bash
@@ -0,0 +1,84 @@
+#!/bin/bash
+
+# ---- Default Settings -----
+
+# Paths
+KOKKOS_PATH=${PWD}/kokkos
+KOKKOS_KERNELS_PATH=${PWD}/kokkos-kernels
+MINIMD_PATH=${PWD}/miniMD/kokkos
+MINIFE_PATH=${PWD}/miniFE/kokkos
+
+# Kokkos Configure Options
+KOKKOS_DEVICES=OpenMP
+KOKKOS_ARCH=SNB
+
+# Compiler Options
+CXX=mpicxx
+OPT_FLAG="-O3"
+
+while [[ $# -gt 0 ]]
+do
+  key="$1"
+
+  case $key in
+    --kokkos-path*)
+      KOKKOS_PATH="${key#*=}"
+      ;;
+    --kokkos-kernels-path*)
+      KOKKOS_KERNELS_PATH="${key#*=}"
+      ;;
+    --minimd-path*)
+      MINIMD_PATH="${key#*=}"
+      ;;
+    --minife-path*)
+      MINIFE_PATH="${key#*=}"
+      ;;
+    --device-list*)
+      KOKKOS_DEVICES="${key#*=}"
+      ;;
+    --arch*)
+      KOKKOS_ARCH="${key#*=}"
+      ;;
+    --opt-flag*)
+      OPT_FLAG="${key#*=}"
+      ;;
+    --compiler*)
+      CXX="${key#*=}"
+      ;;
+    --with-cuda-options*)
+      KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}"
+      ;;
+    --help*)
+      PRINT_HELP=True
+      ;;
+    *)
+      # args, just append
+      ARGS="$ARGS $1"
+      ;;
+  esac
+
+  shift
+done
+
+mkdir build
+
+# Build BytesAndFlops
+mkdir build/bytes_and_flops
+cd build/bytes_and_flops
+make KOKKOS_ARCH=${KOKKOS_ARCH} KOKKOS_DEVICES=${KOKKOS_DEVICES} CXX=${CXX} KOKKOS_PATH=${KOKKOS_PATH} \
+     CXXFLAGS="${OPT_FLAG}" -f ${KOKKOS_PATH}/benchmarks/bytes_and_flops/Makefile -j 16
+cd ../..
+
+mkdir build/miniMD
+cd build/miniMD
+make KOKKOS_ARCH=${KOKKOS_ARCH} KOKKOS_DEVICES=${KOKKOS_DEVICES} CXX=${CXX} KOKKOS_PATH=${KOKKOS_PATH} \
+     CXXFLAGS="${OPT_FLAG}" -f ${MINIMD_PATH}/Makefile -j 16
+cd ../../
+
+mkdir build/miniFE
+cd build/miniFE
+make KOKKOS_ARCH=${KOKKOS_ARCH} KOKKOS_DEVICES=${KOKKOS_DEVICES} CXX=${CXX} KOKKOS_PATH=${KOKKOS_PATH} \
+     CXXFLAGS="${OPT_FLAG}" -f ${MINIFE_PATH}/src/Makefile -j 16
+cd ../../
+
+
diff --git a/packages/kokkos/benchmarks/benchmark_suite/scripts/checkout_repos.bash b/packages/kokkos/benchmarks/benchmark_suite/scripts/checkout_repos.bash
new file mode 100755
index 0000000000000000000000000000000000000000..9b52a36d89ac107297c9c0501027c946b0cdf55a
--- /dev/null
+++ b/packages/kokkos/benchmarks/benchmark_suite/scripts/checkout_repos.bash
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Kokkos
+if [ ! -d "kokkos" ]; then
+  git clone https://github.com/kokkos/kokkos
+fi
+cd kokkos
+git checkout develop
+git pull
+cd ..
+
+# KokkosKernels
+if [ ! -d "kokkos-kernels" ]; then
+git clone https://github.com/kokkos/kokkos-kernels
+fi
+cd kokkos-kernels
+git pull
+cd ..
+
+# MiniMD
+if [ ! -d "miniMD" ]; then
+  git clone https://github.com/mantevo/miniMD
+fi
+cd miniMD
+git pull
+cd ..
+
+# MiniFE
+if [ ! -d "miniFE" ]; then
+  git clone https://github.com/mantevo/miniFE
+fi
+cd miniFE
+git pull
+cd ..
+
+
+
diff --git a/packages/kokkos/benchmarks/benchmark_suite/scripts/run_benchmark.bash b/packages/kokkos/benchmarks/benchmark_suite/scripts/run_benchmark.bash
new file mode 100755
index 0000000000000000000000000000000000000000..6afa05f5fcfbdd1ff3a55b529edd8d52d92bf2d8
--- /dev/null
+++ b/packages/kokkos/benchmarks/benchmark_suite/scripts/run_benchmark.bash
@@ -0,0 +1,14 @@
+#!/bin/bash
+SCRIPT_PATH=$1
+KOKKOS_DEVICES=$2
+KOKKOS_ARCH=$3
+COMPILER=$4
+if [[ $# -lt 4 ]]; then
+  echo "Usage: ./run_benchmark.bash PATH_TO_SCRIPTS KOKKOS_DEVICES KOKKOS_ARCH COMPILER"
+else
+
+${SCRIPT_PATH}/checkout_repos.bash
+${SCRIPT_PATH}/build_code.bash --arch=${KOKKOS_ARCH} --device-list=${KOKKOS_DEVICES} --compiler=${COMPILER}
+${SCRIPT_PATH}/run_tests.bash
+
+fi
\ No newline at end of file
diff --git a/packages/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash b/packages/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash
new file mode 100755
index 0000000000000000000000000000000000000000..9dded535e8bfe20677d302d8b4f8e2cca1bf02ea
--- /dev/null
+++ b/packages/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# BytesAndFlops
+cd build/bytes_and_flops
+
+USE_CUDA=`grep "_CUDA" KokkosCore_config.h | wc -l`
+
+if [[ ${USE_CUDA} -gt 0 ]]; then
+  BAF_EXE=bytes_and_flops.cuda
+  TEAM_SIZE=256
+else
+  BAF_EXE=bytes_and_flops.host
+  TEAM_SIZE=1
+fi
+
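+# The fixed divisors in the awk expressions below (174.5, 1142.65, and the miniMD/miniFE values
+# further down) appear to be reference results; a score near 1.0 indicates performance on par
+# with the reference machine (interpretation of the script, not an upstream comment).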
+BAF_PERF_1=`./${BAF_EXE} 2 100000 1024 1 1 1 1 ${TEAM_SIZE} 6000 | awk '{print $12/174.5}'`
+BAF_PERF_2=`./${BAF_EXE} 2 100000 1024 16 1 8 64 ${TEAM_SIZE} 6000 | awk '{print $14/1142.65}'`
+
+echo "BytesAndFlops: ${BAF_PERF_1} ${BAF_PERF_2}"
+cd ../..
+
+
+# MiniMD
+cd build/miniMD
+cp ../../miniMD/kokkos/Cu_u6.eam ./
+MD_PERF_1=`./miniMD --half_neigh 0 -s 60 --ntypes 1 -t ${OMP_NUM_THREADS} -i ../../miniMD/kokkos/in.eam.miniMD | grep PERF_SUMMARY | awk '{print $10/21163341}'`
+MD_PERF_2=`./miniMD --half_neigh 0 -s 20 --ntypes 1 -t ${OMP_NUM_THREADS} -i ../../miniMD/kokkos/in.eam.miniMD | grep PERF_SUMMARY | awk '{print $10/13393417}'`
+
+echo "MiniMD: ${MD_PERF_1} ${MD_PERF_2}"
+cd ../..
+
+# MiniFE
+cd build/miniFE
+rm *.yaml
+./miniFE.x -nx 100 &> /dev/null
+FE_PERF_1=`grep "CG Mflop" *.yaml | awk '{print $4/14174}'`
+rm *.yaml
+./miniFE.x -nx 50 &> /dev/null
+FE_PERF_2=`grep "CG Mflop" *.yaml | awk '{print $4/11897}'`
+cd ../..
+echo "MiniFE: ${FE_PERF_1} ${FE_PERF_2}"
+
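+# Overall score: the arithmetic mean of the six normalized results computed above.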
+PERF_RESULT=`echo "${BAF_PERF_1} ${BAF_PERF_2} ${MD_PERF_1} ${MD_PERF_2} ${FE_PERF_1} ${FE_PERF_2}" | awk '{print ($1+$2+$3+$4+$5+$6)/6}'`
+echo "Total Result: " ${PERF_RESULT}
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/Makefile b/packages/kokkos/benchmarks/bytes_and_flops/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..6cbef56ff066101b3aa7bba2094bd14b93757f80
--- /dev/null
+++ b/packages/kokkos/benchmarks/bytes_and_flops/Makefile
@@ -0,0 +1,51 @@
+KOKKOS_DEVICES=Cuda
+KOKKOS_CUDA_OPTIONS=enable_lambda
+KOKKOS_ARCH = "SNB,Kepler35"
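+# The three settings above are defaults; build_code.bash in benchmarks/benchmark_suite/scripts
+# overrides them on the make command line, e.g.: make KOKKOS_DEVICES=OpenMP KOKKOS_ARCH=SNB CXX=mpicxx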
+
+
+MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST))))
+
+ifndef KOKKOS_PATH
+  KOKKOS_PATH = $(MAKEFILE_PATH)../..
+endif
+
+SRC = $(wildcard $(MAKEFILE_PATH)*.cpp)
+HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp)
+
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+EXE = bytes_and_flops.cuda
+else
+CXX = g++
+EXE = bytes_and_flops.host
+endif
+
+CXXFLAGS ?= -O3 -g
+override CXXFLAGS += -I$(MAKEFILE_PATH)
+
+DEPFLAGS = -M
+LINK = ${CXX}
+LINKFLAGS =
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..59b4d50c441eb5d4c58c99c53eee7f9c6b9adbd1
--- /dev/null
+++ b/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp
@@ -0,0 +1,99 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include<Kokkos_Core.hpp>
+#include<impl/Kokkos_Timer.hpp>
+
+template<class Scalar, int Unroll,int Stride>
+struct Run {
+static void run(int N, int K, int R, int F, int T, int S);
+};
+
+template<class Scalar, int Stride>
+struct RunStride {
+static void run_1(int N, int K, int R, int F, int T, int S);
+static void run_2(int N, int K, int R, int F, int T, int S);
+static void run_3(int N, int K, int R, int F, int T, int S);
+static void run_4(int N, int K, int R, int F, int T, int S);
+static void run_5(int N, int K, int R, int F, int T, int S);
+static void run_6(int N, int K, int R, int F, int T, int S);
+static void run_7(int N, int K, int R, int F, int T, int S);
+static void run_8(int N, int K, int R, int F, int T, int S);
+static void run(int N, int K, int R, int U, int F, int T, int S);
+};
+
+#define STRIDE 1
+#include<bench_stride.hpp>
+#undef STRIDE
+#define STRIDE 2
+#include<bench_stride.hpp>
+#undef STRIDE
+#define STRIDE 4
+#include<bench_stride.hpp>
+#undef STRIDE
+#define STRIDE 8
+#include<bench_stride.hpp>
+#undef STRIDE
+#define STRIDE 16
+#include<bench_stride.hpp>
+#undef STRIDE
+#define STRIDE 32
+#include<bench_stride.hpp>
+#undef STRIDE
+
+template<class Scalar>
+void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S) {
+ if(D == 1)
+   RunStride<Scalar,1>::run(N,K,R,U,F,T,S);
+ if(D == 2)
+   RunStride<Scalar,2>::run(N,K,R,U,F,T,S);
+ if(D == 4)
+   RunStride<Scalar,4>::run(N,K,R,U,F,T,S);
+ if(D == 8)
+   RunStride<Scalar,8>::run(N,K,R,U,F,T,S);
+ if(D == 16)
+   RunStride<Scalar,16>::run(N,K,R,U,F,T,S);
+ if(D == 32)
+   RunStride<Scalar,32>::run(N,K,R,U,F,T,S);
+}
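+// Example dispatch (values taken from the "Bandwidth Bound" sample input in main.cpp, where
+// P=2 selects double): run_stride_unroll<double>(100000, 1024, 1, 1, 1, 1, 256, 6000);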
+
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6509c654e71b4fdfd5366e4699972d7d19fe7807
--- /dev/null
+++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp
@@ -0,0 +1,124 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+#define UNROLL 1
+#include<bench_unroll_stride.hpp>
+#undef UNROLL
+#define UNROLL 2
+#include<bench_unroll_stride.hpp>
+#undef UNROLL
+#define UNROLL 3
+#include<bench_unroll_stride.hpp>
+#undef UNROLL
+#define UNROLL 4
+#include<bench_unroll_stride.hpp>
+#undef UNROLL
+#define UNROLL 5
+#include<bench_unroll_stride.hpp>
+#undef UNROLL
+#define UNROLL 6
+#include<bench_unroll_stride.hpp>
+#undef UNROLL
+#define UNROLL 7
+#include<bench_unroll_stride.hpp>
+#undef UNROLL
+#define UNROLL 8
+#include<bench_unroll_stride.hpp>
+#undef UNROLL
+
+template<class Scalar>
+struct RunStride<Scalar,STRIDE> {
+static void run_1(int N, int K, int R, int F, int T, int S) {
+  Run<Scalar,1,STRIDE>::run(N,K,R,F,T,S);
+}
+static void run_2(int N, int K, int R, int F, int T, int S) {
+  Run<Scalar,2,STRIDE>::run(N,K,R,F,T,S);
+}
+static void run_3(int N, int K, int R, int F, int T, int S) {
+  Run<Scalar,3,STRIDE>::run(N,K,R,F,T,S);
+}
+static void run_4(int N, int K, int R, int F, int T, int S) {
+  Run<Scalar,4,STRIDE>::run(N,K,R,F,T,S);
+}
+static void run_5(int N, int K, int R, int F, int T, int S) {
+  Run<Scalar,5,STRIDE>::run(N,K,R,F,T,S);
+}
+static void run_6(int N, int K, int R, int F, int T, int S) {
+  Run<Scalar,6,STRIDE>::run(N,K,R,F,T,S);
+}
+static void run_7(int N, int K, int R, int F, int T, int S) {
+  Run<Scalar,7,STRIDE>::run(N,K,R,F,T,S);
+}
+static void run_8(int N, int K, int R, int F, int T, int S) {
+  Run<Scalar,8,STRIDE>::run(N,K,R,F,T,S);
+}
+
+static void run(int N, int K, int R, int U, int F, int T, int S) {
+  if(U==1) {
+    run_1(N,K,R,F,T,S);
+  }
+  if(U==2) {
+    run_2(N,K,R,F,T,S);
+  }
+  if(U==3) {
+    run_3(N,K,R,F,T,S);
+  }
+  if(U==4) {
+    run_4(N,K,R,F,T,S);
+  }
+  if(U==5) {
+    run_5(N,K,R,F,T,S);
+  }
+  if(U==6) {
+    run_6(N,K,R,F,T,S);
+  }
+  if(U==7) {
+    run_7(N,K,R,F,T,S);
+  }
+  if(U==8) {
+    run_8(N,K,R,F,T,S);
+  } 
+}
+};
+
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c6651da1e7cf2e6d1a233dc90e9adf1211decf69
--- /dev/null
+++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp
@@ -0,0 +1,148 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+template<class Scalar>
+struct Run<Scalar,UNROLL,STRIDE> {
+static void run(int N, int K, int R, int F, int T, int S) {
+  Kokkos::View<Scalar**[STRIDE],Kokkos::LayoutRight> A("A",N,K);
+  Kokkos::View<Scalar**[STRIDE],Kokkos::LayoutRight> B("B",N,K);
+  Kokkos::View<Scalar**[STRIDE],Kokkos::LayoutRight> C("C",N,K);
+
+  Kokkos::deep_copy(A,Scalar(1.5));
+  Kokkos::deep_copy(B,Scalar(2.5));
+  Kokkos::deep_copy(C,Scalar(3.5));
+
+  Kokkos::Timer timer;
+  Kokkos::parallel_for("BenchmarkKernel",Kokkos::TeamPolicy<>(N,T).set_scratch_size(0,Kokkos::PerTeam(S)),
+    KOKKOS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type& team) {
+    const int n = team.league_rank();
+    for(int r=0; r<R; r++) {
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,K), [&] (const int& i) {
+        Scalar a1 = A(n,i,0); 
+        const Scalar b = B(n,i,0);
+#if(UNROLL>1)
+        Scalar a2 = a1*1.3;
+#endif
+#if(UNROLL>2)
+        Scalar a3 = a2*1.1;
+#endif
+#if(UNROLL>3)
+        Scalar a4 = a3*1.1;
+#endif
+#if(UNROLL>4)
+        Scalar a5 = a4*1.3;
+#endif
+#if(UNROLL>5)
+        Scalar a6 = a5*1.1;
+#endif
+#if(UNROLL>6)
+        Scalar a7 = a6*1.1;
+#endif
+#if(UNROLL>7)
+        Scalar a8 = a7*1.1;
+#endif
+
+
+        for(int f = 0; f<F; f++) {
+          a1 += b*a1;
+#if(UNROLL>1)
+          a2 += b*a2;
+#endif
+#if(UNROLL>2)
+          a3 += b*a3;
+#endif
+#if(UNROLL>3)
+          a4 += b*a4;
+#endif
+#if(UNROLL>4)
+          a5 += b*a5;
+#endif
+#if(UNROLL>5)
+          a6 += b*a6;
+#endif
+#if(UNROLL>6)
+          a7 += b*a7;
+#endif
+#if(UNROLL>7)
+          a8 += b*a8;
+#endif
+
+
+        }
+#if(UNROLL==1)
+        C(n,i,0) = a1; 
+#endif
+#if(UNROLL==2)
+        C(n,i,0) = a1+a2; 
+#endif
+#if(UNROLL==3)
+        C(n,i,0) = a1+a2+a3; 
+#endif
+#if(UNROLL==4)
+        C(n,i,0) = a1+a2+a3+a4; 
+#endif
+#if(UNROLL==5)
+        C(n,i,0) = a1+a2+a3+a4+a5;
+#endif
+#if(UNROLL==6)
+        C(n,i,0) = a1+a2+a3+a4+a5+a6;
+#endif
+#if(UNROLL==7)
+        C(n,i,0) = a1+a2+a3+a4+a5+a6+a7;
+#endif
+#if(UNROLL==8)
+        C(n,i,0) = a1+a2+a3+a4+a5+a6+a7+a8;
+#endif
+
+      });
+    }
+  });
+  Kokkos::fence(); 
+  double seconds = timer.seconds();
+
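+  // Cost model (as read from the kernel above): every (n,i,r) iteration touches A, B and C once,
+  // giving 3*sizeof(Scalar) bytes, and performs F fused multiply-adds per accumulator
+  // (2*UNROLL flops each) plus UNROLL-1 initial scalings and UNROLL-1 final additions.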
+  double bytes = 1.0*N*K*R*3*sizeof(Scalar);
+  double flops = 1.0*N*K*R*(F*2*UNROLL + 2*(UNROLL-1));
+  printf("NKRUFTS: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: %lf\n",N,K,R,UNROLL,F,T,S,seconds,1.0*bytes/seconds/1024/1024/1024,1.e-9*flops/seconds);
+}
+};
+
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/main.cpp b/packages/kokkos/benchmarks/bytes_and_flops/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f46b38717df237f0d33c0ac101d105f41f3e9d8
--- /dev/null
+++ b/packages/kokkos/benchmarks/bytes_and_flops/main.cpp
@@ -0,0 +1,97 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include<Kokkos_Core.hpp>
+#include<impl/Kokkos_Timer.hpp>
+#include<bench.hpp>
+#include<cstdlib>
+
+int main(int argc, char* argv[]) {
+  Kokkos::initialize();
+
+
+  if(argc<10) {
+    printf("Arguments: P N K R D U F T S\n");
+    printf("  P:   Precision (1==float, 2==double)\n");
+    printf("  N,K: dimensions of the 2D array to allocate\n");
+    printf("  R:   how often to loop through the K dimension with each team\n");
+    printf("  D:   distance between loaded elements (stride)\n");
+    printf("  U:   how many independent flops to do per load\n");
+    printf("  F:   how many times to repeat the U unrolled operations before reading next element\n");
+    printf("  T:   team size\n");
+    printf("  S:   shared memory per team (used to control occupancy on GPUs)\n");
+    printf("Example Input GPU:\n");
+    printf("  Bandwidth Bound : 2 100000 1024 1 1 1 1 256 6000\n");
+    printf("  Cache Bound     : 2 100000 1024 64 1 1 1 512 20000\n");
+    printf("  Compute Bound   : 2 100000 1024 1 1 8 64 256 6000\n");
+    printf("  Load Slots Used : 2 20000 256 32 16 1 1 256 6000\n");
+    printf("  Inefficient Load: 2 20000 256 32 2 1 1 256 20000\n");
+    Kokkos::finalize();
+    return 0;
+  }
+
+
+  int P = atoi(argv[1]);
+  int N = atoi(argv[2]);
+  int K = atoi(argv[3]);
+  int R = atoi(argv[4]);
+  int D = atoi(argv[5]);
+  int U = atoi(argv[6]);
+  int F = atoi(argv[7]);
+  int T = atoi(argv[8]);
+  int S = atoi(argv[9]);
+
+  if(U<1 || U>8) {printf("U must be 1-8\n"); Kokkos::finalize(); return 0;}
+  if( (D!=1) && (D!=2) && (D!=4) && (D!=8) && (D!=16) && (D!=32)) {printf("D must be one of 1,2,4,8,16,32\n"); Kokkos::finalize(); return 0;}
+  if( (P!=1) && (P!=2) ) {printf("P must be one of 1,2\n"); Kokkos::finalize(); return 0;}
+
+  if(P==1) {
+    run_stride_unroll<float>(N,K,R,D,U,F,T,S);
+  }
+  if(P==2) {
+    run_stride_unroll<double>(N,K,R,D,U,F,T,S);
+  }
+
+  Kokkos::finalize();
+}
+
diff --git a/packages/kokkos/benchmarks/gather/Makefile b/packages/kokkos/benchmarks/gather/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..0ea9fb1dd27b47d0d35d578f9ff1fa862e20a8c6
--- /dev/null
+++ b/packages/kokkos/benchmarks/gather/Makefile
@@ -0,0 +1,44 @@
+KOKKOS_PATH = ${HOME}/kokkos
+SRC = $(wildcard *.cpp)
+KOKKOS_DEVICES=Cuda
+KOKKOS_CUDA_OPTIONS=enable_lambda
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+EXE = gather.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+EXE = gather.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+CXXFLAGS = -O3 -g
+
+DEPFLAGS = -M
+LINK = ${CXX}
+LINKFLAGS =
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+$(warning ${KOKKOS_CPPFLAGS})
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS) gather_unroll.hpp gather.hpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/packages/kokkos/benchmarks/gather/gather.hpp b/packages/kokkos/benchmarks/gather/gather.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bbbd65850f75c00a928383d3c6ae2f9af27d7d95
--- /dev/null
+++ b/packages/kokkos/benchmarks/gather/gather.hpp
@@ -0,0 +1,92 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+template<class Scalar, int UNROLL>
+struct RunGather {
+  static void run(int N, int K, int D, int R, int F);
+};
+
+#define UNROLL 1
+#include<gather_unroll.hpp>
+#undef UNROLL
+#define UNROLL 2
+#include<gather_unroll.hpp>
+#undef UNROLL
+#define UNROLL 3
+#include<gather_unroll.hpp>
+#undef UNROLL
+#define UNROLL 4
+#include<gather_unroll.hpp>
+#undef UNROLL
+#define UNROLL 5
+#include<gather_unroll.hpp>
+#undef UNROLL
+#define UNROLL 6
+#include<gather_unroll.hpp>
+#undef UNROLL
+#define UNROLL 7
+#include<gather_unroll.hpp>
+#undef UNROLL
+#define UNROLL 8
+#include<gather_unroll.hpp>
+#undef UNROLL
+
+template<class Scalar>
+void run_gather_test(int N, int K, int D, int R, int U, int F) {
+ if(U == 1)
+   RunGather<Scalar,1>::run(N,K,D,R,F);
+ if(U == 2)
+   RunGather<Scalar,2>::run(N,K,D,R,F);
+ if(U == 3)
+   RunGather<Scalar,3>::run(N,K,D,R,F);
+ if(U == 4)
+   RunGather<Scalar,4>::run(N,K,D,R,F);
+ if(U == 5)
+   RunGather<Scalar,5>::run(N,K,D,R,F);
+ if(U == 6)
+   RunGather<Scalar,6>::run(N,K,D,R,F);
+ if(U == 7)
+   RunGather<Scalar,7>::run(N,K,D,R,F);
+ if(U == 8)
+   RunGather<Scalar,8>::run(N,K,D,R,F);
+}
diff --git a/packages/kokkos/benchmarks/gather/gather_unroll.hpp b/packages/kokkos/benchmarks/gather/gather_unroll.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1d9c99adf9e9909a21be2899f60000713e0208a4
--- /dev/null
+++ b/packages/kokkos/benchmarks/gather/gather_unroll.hpp
@@ -0,0 +1,169 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include<Kokkos_Core.hpp>
+#include<Kokkos_Random.hpp>
+
+template<class Scalar>
+struct RunGather<Scalar,UNROLL> {
+static void run(int N, int K, int D, int R, int F) {
+  Kokkos::View<int**> connectivity("Connectivity",N,K);
+  Kokkos::View<Scalar*> A_in("Input",N);
+  Kokkos::View<Scalar*> B_in("Input",N);
+  Kokkos::View<Scalar*> C("Output",N);
+
+  Kokkos::Random_XorShift64_Pool<> rand_pool(12313);
+
+  Kokkos::deep_copy(A_in,1.5);
+  Kokkos::deep_copy(B_in,2.0);
+
+  Kokkos::View<const Scalar*, Kokkos::MemoryTraits<Kokkos::RandomAccess> > A(A_in);
+  Kokkos::View<const Scalar*, Kokkos::MemoryTraits<Kokkos::RandomAccess> > B(B_in);
+
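+  // The init kernel below assigns each connectivity(i,jj) a random index within D/2 of i,
+  // wrapped modulo N, so D controls how local the gathered accesses are.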
+  Kokkos::parallel_for("InitKernel",N,
+      KOKKOS_LAMBDA (const int& i) {
+    auto rand_gen = rand_pool.get_state();
+    for( int jj=0; jj<K; jj++) {
+      connectivity(i,jj) = (rand_gen.rand(D) + i - D/2 + N)%N;
+    }
+    rand_pool.free_state(rand_gen);
+  });
+  Kokkos::fence();
+
+
+  Kokkos::Timer timer;
+  for(int r = 0; r<R; r++) {
+  Kokkos::parallel_for("BenchmarkKernel",N,
+      KOKKOS_LAMBDA (const int& i) {
+      Scalar c = Scalar(0.0);
+      for( int jj=0; jj<K; jj++) {
+        const int j = connectivity(i,jj);
+        Scalar a1 = A(j);
+        const Scalar b = B(j);
+#if(UNROLL>1)
+        Scalar a2 = a1*Scalar(1.3);
+#endif
+#if(UNROLL>2)
+        Scalar a3 = a2*Scalar(1.1);
+#endif
+#if(UNROLL>3)
+        Scalar a4 = a3*Scalar(1.1);
+#endif
+#if(UNROLL>4)
+        Scalar a5 = a4*Scalar(1.3);
+#endif
+#if(UNROLL>5)
+        Scalar a6 = a5*Scalar(1.1);
+#endif
+#if(UNROLL>6)
+        Scalar a7 = a6*Scalar(1.1);
+#endif
+#if(UNROLL>7)
+        Scalar a8 = a7*Scalar(1.1);
+#endif
+
+
+        for(int f = 0; f<F; f++) {
+          a1 += b*a1;
+#if(UNROLL>1)
+          a2 += b*a2;
+#endif
+#if(UNROLL>2)
+          a3 += b*a3;
+#endif
+#if(UNROLL>3)
+          a4 += b*a4;
+#endif
+#if(UNROLL>4)
+          a5 += b*a5;
+#endif
+#if(UNROLL>5)
+          a6 += b*a6;
+#endif
+#if(UNROLL>6)
+          a7 += b*a7;
+#endif
+#if(UNROLL>7)
+          a8 += b*a8;
+#endif
+
+
+        }
+#if(UNROLL==1)
+        c += a1;
+#endif
+#if(UNROLL==2)
+        c += a1+a2;
+#endif
+#if(UNROLL==3)
+        c += a1+a2+a3;
+#endif
+#if(UNROLL==4)
+        c += a1+a2+a3+a4;
+#endif
+#if(UNROLL==5)
+        c += a1+a2+a3+a4+a5;
+#endif
+#if(UNROLL==6)
+        c += a1+a2+a3+a4+a5+a6;
+#endif
+#if(UNROLL==7)
+        c += a1+a2+a3+a4+a5+a6+a7;
+#endif
+#if(UNROLL==8)
+        c += a1+a2+a3+a4+a5+a6+a7+a8;
+#endif
+
+      }
+      C(i) = c ;
+  });
+  Kokkos::fence();
+  }
+  double seconds = timer.seconds();
+
+  double bytes = 1.0*N*K*R*(2*sizeof(Scalar)+sizeof(int)) + 1.0*N*R*sizeof(Scalar);
+  double flops = 1.0*N*K*R*(F*2*UNROLL + 2*(UNROLL-1));
+  double gather_ops = 1.0*N*K*R*2;
+  printf("SNKDRUF: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: %lf GGather/s: %lf\n",(int)(sizeof(Scalar)/4),N,K,D,R,UNROLL,F,seconds,1.0*bytes/seconds/1024/1024/1024,1.e-9*flops/seconds,1.e-9*gather_ops/seconds);
+}
+};
diff --git a/packages/kokkos/benchmarks/gather/main.cpp b/packages/kokkos/benchmarks/gather/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ca5238e7fdb527e56080f5b76eb6fa7c9487fcd3
--- /dev/null
+++ b/packages/kokkos/benchmarks/gather/main.cpp
@@ -0,0 +1,93 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include<Kokkos_Core.hpp>
+#include<impl/Kokkos_Timer.hpp>
+#include<gather.hpp>
+#include<cstdlib>
+
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc,argv);
+
+  if(argc<8) {
+    printf("Arguments: S N K D R U F\n");
+    printf("  S:   Scalar Type Size (1==float, 2==double, 4==complex<double>)\n");
+    printf("  N:   Number of entities\n");
+    printf("  K:   Number of things to gather per entity\n");
+    printf("  D:   Max distance of gathered things of an entity\n");
+    printf("  R:   how often to repeat the gather loop over the K dimension\n");
+    printf("  U:   how many independent flops to do per load\n");
+    printf("  F:   how many times to repeat the U unrolled operations before reading next element\n");
+    printf("Example Input GPU:\n");
+    printf("  Bandwidth Bound : 2 10000000 1 1 10 1 1\n");
+    printf("  Cache Bound     : 2 10000000 64 1 10 1 1\n");
+    printf("  Cache Gather    : 2 10000000 64 256 10 1 1\n");
+    printf("  Global Gather   : 2 100000000 16 100000000 1 1 1\n");
+    printf("  Typical MD      : 2 100000 32 512 1000 8 2\n");
+    Kokkos::finalize();
+    return 0;
+  }
+
+
+  int S = atoi(argv[1]);
+  int N = atoi(argv[2]);
+  int K = atoi(argv[3]);
+  int D = atoi(argv[4]);
+  int R = atoi(argv[5]);
+  int U = atoi(argv[6]);
+  int F = atoi(argv[7]);
+
+  if( (S!=1) && (S!=2) && (S!=4)) {printf("S must be one of 1,2,4\n"); Kokkos::finalize(); return 0;}
+  if( N<D ) {printf("N must be greater than or equal to D\n"); Kokkos::finalize(); return 0; }
+  if(S==1) {
+    run_gather_test<float>(N,K,D,R,U,F);
+  }
+  if(S==2) {
+    run_gather_test<double>(N,K,D,R,U,F);
+  }
+  if(S==4) {
+    run_gather_test<Kokkos::complex<double> >(N,K,D,R,U,F);
+  }
+  Kokkos::finalize();
+}
+
diff --git a/packages/kokkos/benchmarks/policy_performance/Makefile b/packages/kokkos/benchmarks/policy_performance/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..13aef3209cace8419138d946a919eb893ed9a8d2
--- /dev/null
+++ b/packages/kokkos/benchmarks/policy_performance/Makefile
@@ -0,0 +1,44 @@
+KOKKOS_PATH = ../..
+SRC = $(wildcard *.cpp)
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3 -g
+LINK = ${CXX}
+LINKFLAGS = 
+EXE = policy_performance.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+KOKKOS_CUDA_OPTIONS+=enable_lambda
+else
+CXX = g++
+CXXFLAGS = -O3 -g -Wall -Werror
+LINK = ${CXX}
+LINKFLAGS =  
+EXE = policy_performance.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS) main.cpp policy_perf_test.hpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/packages/kokkos/benchmarks/policy_performance/main.cpp b/packages/kokkos/benchmarks/policy_performance/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2f5395734afdcfce41cc43c9b2322de4615a74b1
--- /dev/null
+++ b/packages/kokkos/benchmarks/policy_performance/main.cpp
@@ -0,0 +1,170 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include "policy_perf_test.hpp"
+
+int main(int argc, char* argv[] ) {
+  Kokkos::initialize(argc,argv);
+
+  if(argc<11) {
+    printf("  Ten arguments are needed to run this program:\n");
+    printf("    (1)team_range, (2)thread_range, (3)vector_range, (4)outer_repeat, (5)thread_repeat, (6)vector_repeat, (7)team_size, (8)vector_size, (9)schedule, (10)test_type\n");
+    printf("  team_range:     number of teams (league_size)\n");
+    printf("  thread_range:   range for nested TeamThreadRange parallel_*\n");
+    printf("  vector_range:   range for nested ThreadVectorRange parallel_*\n");
+    printf("  outer_repeat:   number of repeats for outer parallel_* call\n");
+    printf("  thread_repeat:  number of repeats for TeamThreadRange parallel_* call\n");
+    printf("  vector_repeat:  number of repeats for ThreadVectorRange parallel_* call\n");
+    printf("  team_size:      number of team members (team_size)\n");
+    printf("  vector_size:    desired vectorization (if possible)\n");
+    printf("  schedule:       1 == Static  2 == Dynamic\n");
+    printf("  test_type:      3-digit code XYZ for testing (nested) parallel_*\n");
+    printf("  code key:       XYZ    X in {1,2,3,4,5}, Y in {0,1,2}, Z in {0,1,2}\n");
+    printf("                  TeamPolicy:\n");
+    printf("                    X: 0 = none (never used, makes no sense); 1 = parallel_for; 2 = parallel_reduce\n");
+    printf("                    Y: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n");
+    printf("                    Z: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n");
+    printf("                  RangePolicy:\n");
+    printf("                    X: 3 = parallel_for; 4 = parallel_reduce; 5 = parallel_scan\n");
+    printf("                    Y: 0 = none\n");
+    printf("                    Z: 0 = none\n");
+    printf("  Example Input:\n");
+    printf("  100000 32 32 100 100 100 8 1 1 100\n"); 
+    Kokkos::finalize();
+    return 0;
+  }
+
+  int team_range = atoi(argv[1]);
+  int thread_range = atoi(argv[2]);
+  int vector_range = atoi(argv[3]);
+
+  int outer_repeat = atoi(argv[4]);
+  int thread_repeat = atoi(argv[5]);
+  int vector_repeat = atoi(argv[6]);
+
+  int team_size = atoi(argv[7]);
+  int vector_size = atoi(argv[8]);
+  int schedule = atoi(argv[9]);
+  int test_type = atoi(argv[10]);
+
+  int disable_verbose_output = 0; 
+  if ( argc > 11 ) {
+    disable_verbose_output = atoi(argv[11]);
+  }
+
+  if ( schedule != 1 && schedule != 2 ) {
+    printf("schedule: %d\n", schedule);
+    printf("Options for schedule are: 1 == Static  2 == Dynamic\n");
+    Kokkos::finalize();
+    return -1;
+  }
+
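+  // For reference (derived from the branches in policy_perf_test.hpp): e.g. test_type 121 runs a
+  // TeamPolicy parallel_for containing a TeamThreadRange parallel_reduce that itself launches
+  // ThreadVectorRange parallel_for loops, while 300/400/500 use a flat RangePolicy.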
+  if ( test_type != 100 && test_type != 110 && test_type != 111 && test_type != 112 && test_type != 120  && test_type != 121  && test_type != 122
+     && test_type != 200 && test_type != 210 && test_type != 211 && test_type != 212 && test_type != 220  && test_type != 221  && test_type != 222
+     && test_type != 300 && test_type != 400 && test_type != 500
+     )
+  {
+    printf("Incorrect test_type option\n");
+    Kokkos::finalize();
+    return -2;
+  }
+
+  double result = 0.0;
+
+  Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10,1), 
+    KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, double& lval) {
+      lval += 1;
+    }, result);
+
+  typedef Kokkos::View<double*, Kokkos::LayoutRight>   view_type_1d;
+  typedef Kokkos::View<double**, Kokkos::LayoutRight>  view_type_2d;
+  typedef Kokkos::View<double***, Kokkos::LayoutRight> view_type_3d;
+
+  // Allocate view without initializing
+  // Call a 'warmup' test with 1 repeat - this will initialize the corresponding view appropriately for test and should obey first-touch etc
+  // Second call to test is the one we actually care about and time
+  view_type_1d v_1( Kokkos::ViewAllocateWithoutInitializing("v_1"), team_range*team_size);
+  view_type_2d v_2( Kokkos::ViewAllocateWithoutInitializing("v_2"), team_range*team_size, thread_range);
+  view_type_3d v_3( Kokkos::ViewAllocateWithoutInitializing("v_3"), team_range*team_size, thread_range, vector_range);
+
+  double result_computed = 0.0;
+  double result_expect = 0.0;
+  double time = 0.0;
+
+  if(schedule==1) {
+    if ( test_type != 500 ) {
+      // warmup - no repeat of loops
+      test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
+      test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
+    }
+    else {
+      // parallel_scan: initialize 1d view for parallel_scan
+      test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time);
+      test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
+    }
+  }
+  if(schedule==2) {
+    if ( test_type != 500 ) {
+      // warmup - no repeat of loops
+      test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
+      test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
+    }
+    else {
+      // parallel_scan: initialize 1d view for parallel_scan
+      test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time);
+      test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
+    }
+  }
+
+  if ( disable_verbose_output == 0 ) {
+    printf("%7i %4i %2i %9i %4i %4i %4i %2i %1i %3i %e %e %lf\n",team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,schedule,test_type,result_computed,result_expect,time);
+  }
+  else {
+    printf("%lf\n",time);
+  }
+
+  Kokkos::finalize();
+
+  return 0;
+}
diff --git a/packages/kokkos/benchmarks/policy_performance/policy_perf_test.hpp b/packages/kokkos/benchmarks/policy_performance/policy_perf_test.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1ab437928de761bed614efc8a22d8c4d38fcf38f
--- /dev/null
+++ b/packages/kokkos/benchmarks/policy_performance/policy_perf_test.hpp
@@ -0,0 +1,355 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+template < class ViewType >
+struct ParallelScanFunctor {
+  using value_type = double;
+  ViewType v;
+
+  ParallelScanFunctor( const ViewType & v_ )
+    : v(v_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+    void operator()( const int idx, value_type& val, const bool& final ) const
+    {
+      // inclusive scan
+      val += v(idx);
+      if ( final ) {
+        v(idx) = val;
+      }
+    }
+};
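+// Presumably used by the test_type == 500 (RangePolicy parallel_scan) path; main.cpp first runs
+// a test_type 100 pass to initialize v_1 before timing the scan.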
+
+template<class ScheduleType,class IndexType,class ViewType1, class ViewType2, class ViewType3>
+void test_policy(int team_range, int thread_range, int vector_range,
+          int outer_repeat, int thread_repeat, int inner_repeat,
+          int team_size, int vector_size, int test_type,
+          ViewType1 &v1, ViewType2 &v2, ViewType3 &v3,
+          double &result, double &result_expect, double &time) {
+
+  typedef Kokkos::TeamPolicy<ScheduleType,IndexType> t_policy;
+  typedef typename t_policy::member_type t_team;
+  Kokkos::Timer timer;
+
+  for(int orep = 0; orep<outer_repeat; orep++) {
+
+    if (test_type == 100) {
+      Kokkos::parallel_for("100 outer for", t_policy(team_range,team_size),
+        KOKKOS_LAMBDA (const t_team& team) {
+          long idx = team.league_rank()*team.team_size() + team.team_rank();
+          v1(idx) = idx;
+          // prevent compiler optimizing loop away
+      });
+    }
+
+    if (test_type == 110) {
+      Kokkos::parallel_for("110 outer for", t_policy(team_range,team_size),
+        KOKKOS_LAMBDA (const t_team& team) {
+          long idx = team.league_rank()*team.team_size() + team.team_rank();
+          for (int tr = 0; tr<thread_repeat; ++tr) {
+            // Each team launches a parallel_for; thread_range is partitioned among team members
+            Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
+              v2( idx, t ) = t;
+              // prevent compiler optimizing loop away
+            });
+          }
+      });
+    }
+    if (test_type == 111) {
+      Kokkos::parallel_for("111 outer for", t_policy(team_range,team_size,vector_size),
+        KOKKOS_LAMBDA (const t_team& team) {
+          long idx = team.league_rank()*team.team_size() + team.team_rank();
+          for (int tr = 0; tr<thread_repeat; ++tr) {
+            // Each team launches a parallel_for; thread_range is partitioned among team members
+            Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
+              for (int vr = 0; vr<inner_repeat; ++vr)
+                Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) {
+                  v3( idx, t, vi ) = vi;
+                  // prevent compiler optimizing loop away
+                });
+            });
+          }
+      });
+    }
+    if (test_type == 112) {
+      Kokkos::parallel_for("112 outer for", t_policy(team_range,team_size,vector_size),
+        KOKKOS_LAMBDA (const t_team& team) {
+          long idx = team.league_rank()*team.team_size() + team.team_rank();
+          for (int tr = 0; tr<thread_repeat; ++tr) {
+            // Each team launches a parallel_for; thread_range is partitioned among team members
+            Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
+              double vector_result = 0.0;
+              for (int vr = 0; vr<inner_repeat; ++vr) {
+                vector_result = 0.0;
+                Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) {
+                  vval += 1;
+                }, vector_result);
+              }
+              v2( idx, t ) = vector_result;
+              // prevent compiler optimizing loop away
+            });
+          }
+      });
+    }
+    if (test_type == 120) {
+      Kokkos::parallel_for("120 outer for", t_policy(team_range,team_size),
+        KOKKOS_LAMBDA (const t_team& team) {
+          long idx = team.league_rank()*team.team_size() + team.team_rank();
+          double team_result = 0.0;
+          for (int tr = 0; tr<thread_repeat; ++tr) {
+            team_result = 0.0;
+            Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
+              lval += 1;
+            }, team_result);
+          }
+          v1(idx) = team_result;
+          // prevent compiler optimizing loop away
+      });
+    }
+    if (test_type == 121) {
+      Kokkos::parallel_for("121 outer for", t_policy(team_range,team_size,vector_size),
+        KOKKOS_LAMBDA (const t_team& team) {
+          long idx = team.league_rank()*team.team_size() + team.team_rank();
+          double team_result = 0.0;
+          for (int tr = 0; tr<thread_repeat; ++tr) {
+            team_result = 0.0;
+            Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
+              lval += 1;
+              for (int vr = 0; vr<inner_repeat; ++vr) {
+                Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) {
+                  v3( idx, t, vi ) = vi;
+                  // prevent compiler optimizing loop away
+                });
+              }
+            }, team_result);
+          }
+          v3( idx, 0, 0 ) = team_result;
+          // prevent compiler optimizing loop away
+      });
+    }
+    if (test_type == 122) {
+      Kokkos::parallel_for("122 outer for", t_policy(team_range,team_size,vector_size),
+        KOKKOS_LAMBDA (const t_team& team) {
+          long idx = team.league_rank()*team.team_size() + team.team_rank();
+          double team_result = 0.0;
+          for (int tr = 0; tr<thread_repeat; ++tr) {
+            Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
+              double vector_result = 0.0;
+              for (int vr = 0; vr<inner_repeat; ++vr) {
+                vector_result = 0.0;
+                Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) {
+                  vval += 1;
+                }, vector_result);
+                lval += vector_result;
+              }
+            }, team_result);
+          }
+          v1(idx) = team_result;
+          // prevent compiler optimizing loop away
+      });
+    }
+    if (test_type == 200) {
+      Kokkos::parallel_reduce("200 outer reduce", t_policy(team_range,team_size),
+        KOKKOS_LAMBDA (const t_team& team, double& lval) {
+          lval+=team.team_size()*team.league_rank() + team.team_rank();
+      },result);
+      result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1);
+      // sum ( seq( [0, team_range*team_size) )
+    }
+    if (test_type == 210) {
+      Kokkos::parallel_reduce("210 outer reduce", t_policy(team_range,team_size),
+        KOKKOS_LAMBDA (const t_team& team, double& lval) {
+        long idx = team.league_rank()*team.team_size() + team.team_rank();
+        double thread_for = 1.0;
+        for(int tr = 0; tr<thread_repeat; tr++) {
+          Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
+            v2(idx,t) = t;
+            // prevent compiler optimizing loop away
+          });
+        }
+        lval+=(team.team_size()*team.league_rank() + team.team_rank() + thread_for);
+      },result);
+      result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1) + (team_range*team_size);
+      // sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) )
+    }
+    if (test_type == 211) {
+      Kokkos::parallel_reduce("211 outer reduce", t_policy(team_range,team_size,vector_size),
+        KOKKOS_LAMBDA (const t_team& team, double& lval) {
+        long idx = team.league_rank()*team.team_size() + team.team_rank();
+        double thread_for = 1.0;
+        for(int tr = 0; tr<thread_repeat; tr++) {
+          Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
+            for (int vr = 0; vr<inner_repeat; ++vr)
+              Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) {
+                v3(idx, t, vi) = vi;
+                // prevent compiler optimizing loop away
+              });
+          });
+        }
+        lval+=idx+thread_for;
+      },result);
+      result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (team_range*team_size);
+      // sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) )
+    }
+    if (test_type == 212) {
+      Kokkos::parallel_reduce("212 outer reduce", t_policy(team_range,team_size,vector_size),
+        KOKKOS_LAMBDA (const t_team& team, double& lval) {
+        long idx = team.league_rank()*team.team_size() + team.team_rank();
+        double vector_result = 0.0;
+        for(int tr = 0; tr<thread_repeat; tr++) {
+          // This parallel_for is executed by each team; the thread_range is partitioned among the team members
+          Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
+            v2(idx,t) = t;
+            // prevent compiler optimizing loop away
+            for (int vr = 0; vr<inner_repeat; ++vr) {
+              vector_result = 0.0;
+              Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double &vval) {
+                vval += vi;
+              }, vector_result );
+            }
+          });
+        }
+        lval+= idx + vector_result;
+      },result);
+      result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (0.5*vector_range*(vector_range-1)*team_range*team_size);
+      // sum ( seq( [0, team_range*team_size) + sum( seq( [0, vector_range) ) per team_member (total of team_range*team_size) )
+    }
+    if (test_type == 220) {
+      Kokkos::parallel_reduce("220 outer reduce", t_policy(team_range,team_size),
+        KOKKOS_LAMBDA (const t_team& team, double& lval) {
+        double team_result = 0.0;
+        for(int tr = 0; tr<thread_repeat; tr++) {
+          Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
+            tval += t;
+          },team_result);
+        }
+        lval+=team_result*team.league_rank(); // constant * league_rank
+      },result);
+      result_expect = 0.5*(team_range)*(team_range-1) * team_size * 0.5*(thread_range)*(thread_range-1);
+      // sum ( seq( [0, team_range) * constant ); constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
+    }
+    if (test_type == 221) {
+      Kokkos::parallel_reduce("221 outer reduce", t_policy(team_range,team_size,vector_size),
+        KOKKOS_LAMBDA (const t_team& team, double& lval) {
+        long idx = team.league_rank()*team.team_size() + team.team_rank();
+        double team_result = 0;
+        for(int tr = 0; tr<thread_repeat; tr++) {
+          Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
+            double vector_for = 1.0;
+            for (int vr = 0; vr<inner_repeat; ++vr) {
+              Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) {
+                v3(idx, t, vi) = vi;
+                // prevent compiler optimizing loop away
+              });
+            }
+            tval += t + vector_for;
+          },team_result);
+        }
+        lval+=team_result*team.league_rank();
+      },result);
+      result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range);
+      // sum ( seq( [0, team_range) * constant ) + 1 per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
+    }
+    if (test_type == 222) {
+      Kokkos::parallel_reduce("222 outer reduce", t_policy(team_range,team_size,vector_size),
+        KOKKOS_LAMBDA (const t_team& team, double& lval) {
+        double team_result = 0.0;
+        for(int tr = 0; tr<thread_repeat; tr++) {
+          Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
+            double vector_result = 0.0;
+            for (int vr = 0; vr<inner_repeat; ++vr) {
+              Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double& vval) {
+                vval += vi;
+              }, vector_result);
+            }
+            tval += t + vector_result;
+          },team_result);
+        }
+        lval+=team_result*team.league_rank();
+      },result);
+      result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range*0.5*(vector_range)*(vector_range-1));
+      // sum ( seq( [0, team_range) * constant ) + 1 + sum( seq([0,vector_range) ) per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
+    }
+
+    // parallel_for RangePolicy: range = team_size*team_range
+    if (test_type == 300) {
+      Kokkos::parallel_for("300 outer for", team_size*team_range,
+        KOKKOS_LAMBDA (const int idx) {
+          v1(idx) = idx;
+          // prevent compiler from optimizing away the loop
+      });
+    }
+    // parallel_reduce RangePolicy: range = team_size*team_range
+    if (test_type == 400) {
+      Kokkos::parallel_reduce("400 outer reduce", team_size*team_range,
+        KOKKOS_LAMBDA (const int idx, double& val) {
+          val += idx;
+      }, result);
+      result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1);
+    }
+    // parallel_scan RangePolicy: range = team_size*team_range
+    if (test_type == 500) {
+      Kokkos::parallel_scan("500 outer scan", team_size*team_range,
+        ParallelScanFunctor<ViewType1>(v1)
+#if 0
+        // This does not compile with pre-CUDA 8.0 - see GitHub issue #913 for an explanation
+        KOKKOS_LAMBDA (const int idx, double& val, const bool& final) {
+          // inclusive scan
+          val += v1(idx);
+          if ( final ) {
+            v1(idx) = val;
+          }
+        }
+#endif
+      );
+      // result = v1( team_size*team_range - 1 ); // won't work with Cuda - need to copy result back to host to print
+      // result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1);
+    }
+
+  } // end outer for loop
+
+  time = timer.seconds();
+} //end test_policy
diff --git a/packages/kokkos/benchmarks/policy_performance/script_basic_testing.sh b/packages/kokkos/benchmarks/policy_performance/script_basic_testing.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e621fffbd435bcbdedfc3244250a0330f83bb928
--- /dev/null
+++ b/packages/kokkos/benchmarks/policy_performance/script_basic_testing.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Script to check that the policy_perf_test code works with each possible combination of options
+
+echo "Performance test results for parallel_reduce code computing sum of sequence [0,N) with various (nested) policies"
+
+EXECUTABLE=policy_performance
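+# Runs whichever of $EXECUTABLE.host and $EXECUTABLE.cuda exists in the current directory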
+
+TEAMRANGE=1000
+THREADRANGE=4
+VECTORRANGE=32
+TEAMSIZE=4
+VECTORSIZE=1
+OREPEAT=1
+MREPEAT=1
+IREPEAT=1
+SCHEDULE=1
+
+SUFFIX=host
+if [ -e $EXECUTABLE.$SUFFIX ]
+then
+SCHEDULE=1
+echo "Host tests Static schedule"
+for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
+do
+  OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
+done
+
+SCHEDULE=2
+echo "Host tests Dynamic schedule"
+for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
+do
+  OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
+done
+fi
+
+SUFFIX=cuda
+if [ -e $EXECUTABLE.$SUFFIX ]
+then
+SCHEDULE=1
+echo "Cuda tests Static schedule"
+for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
+do
+  ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
+done
+
+SCHEDULE=2
+echo "Cuda tests Dynamic schedule"
+for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
+do
+  ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
+done
+fi
diff --git a/packages/kokkos/benchmarks/policy_performance/script_sample_usage.sh b/packages/kokkos/benchmarks/policy_performance/script_sample_usage.sh
new file mode 100755
index 0000000000000000000000000000000000000000..f4bfb87f8fed1d89a03289754281313aa1e83eeb
--- /dev/null
+++ b/packages/kokkos/benchmarks/policy_performance/script_sample_usage.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+
+# Sample script for benchmarking policy performance 
+
+# Suggested environment variables to export prior to executing this script:
+# KNL: 
+# OMP_NUM_THREADS=256 KMP_AFFINITY=compact
+# Power:
+# OMP_NUM_THREADS=64 OMP_PROC_BIND=true
+
+# Constants and Variables:
+# Vary:  TEAMSIZE and THREADRANGE
+#  for TEAMSIZE in {1,2,4,5,8}; do
+#  for THREADRANGE in {32,41,1000}; do
+# Fixed: TEAMRANGE, VECTORRANGE, VECTORSIZE
+# System specific: adjust REPEAT values to the architecture the tests are run on
+
+# Tests
+# Static SCHEDULE = 1
+# Tier 1: parallel_for + RangePolicy 300
+# Tier 2: parallel_reduce, parallel_scan + RangePolicy 400 500
+# Tier 3: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY
+# Tier 4: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY
+# Dynamic SCHEDULE = 2
+# Tier 5: parallel_for + RangePolicy 300
+# Tier 6: parallel_reduce, parallel_scan + RangePolicy 400 500
+# Tier 7: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY
+# Tier 8: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY
+
+# Results grouped by: 
+# 0) SCHEDULE  1) CODE (test)  2) TEAMRANGE  3) TEAMSIZE  4) THREADRANGE
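+# Each invocation below passes arguments in the order:
+# TEAMRANGE THREADRANGE VECTORRANGE OREPEAT MREPEAT IREPEAT TEAMSIZE VECTORSIZE SCHEDULE CODE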
+
+EXECUTABLE=policy_performance
+
+# Default defined values
+TEAMRANGE=1000
+THREADRANGE=1
+VECTORRANGE=32
+TEAMSIZE=1
+VECTORSIZE=1
+OREPEAT=1
+MREPEAT=1
+IREPEAT=1
+SCHEDULE=1
+
+# Host tests
+SUFFIX=host
+if [ -e $EXECUTABLE.$SUFFIX ]; then
+echo "Host"
+
+for SCHEDULE in {1,2}; do
+
+# Tier 1 and 2, 5 and 6
+for CODE in {300,400,500}; do
+    for TEAMSIZE in {1,2,4,5,8}; do
+    OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
+    done
+done
+
+# Tier 3, 7
+for CODE in {100,110,111,112,120,121,122}; do
+    for TEAMSIZE in {1,2,4,5,8}; do
+      for THREADRANGE in {32,41,1000}; do
+      OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
+      done
+    done
+done
+
+# Tier 4, 8
+for CODE in {200,210,211,212,220,221,222}; do
+    for TEAMSIZE in {1,2,4,5,8}; do
+      for THREADRANGE in {32,41,1000}; do
+      OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
+      done
+    done
+done
+
+done # end SCHEDULE
+
+fi # end host
+
+
+# Cuda tests
+SUFFIX=cuda
+# TEAMRANGE=10000, TEAMSIZE=8 too large
+# TEAMRANGE=10000, TEAMSIZE=8, THREADRANGE=1000 too large
+if [ -e $EXECUTABLE.$SUFFIX ]; then
+echo "Cuda"
+
+for SCHEDULE in {1,2}; do
+
+# Reset defaults
+TEAMRANGE=1000
+THREADRANGE=1
+VECTORRANGE=32
+TEAMSIZE=1
+VECTORSIZE=1
+
+# Tier 1 and 2, 5 and 6
+for CODE in {300,400,500}; do
+    for TEAMSIZE in {1,2,4,5,8}; do
+    ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
+    done
+done
+
+# Tier 3, 7
+for CODE in {100,110,111,112,120,121,122}; do
+    for TEAMSIZE in {1,2,4,5,8}; do
+      for THREADRANGE in {32,41,1000}; do
+      ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
+      done
+    done
+done
+
+# Tier 4, 8
+for CODE in {200,210,211,212,220,221,222}; do
+    for TEAMSIZE in {1,2,4,5,8}; do
+      for THREADRANGE in {32,41,1000}; do
+      ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
+      done
+    done
+done
+
+done # end SCHEDULE
+
+fi #end cuda
diff --git a/packages/kokkos/bin/hpcbind b/packages/kokkos/bin/hpcbind
new file mode 100755
index 0000000000000000000000000000000000000000..92f9f81ac90374dcaf4e0d77c230d2f12713f649
--- /dev/null
+++ b/packages/kokkos/bin/hpcbind
@@ -0,0 +1,614 @@
+#!/usr/bin/env bash
+
+################################################################################
+# Check if hwloc commands exist
+################################################################################
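+# Each 'type' check below ANDs the negated exit status into HPCBIND_HAS_HWLOC,
+# so it remains 1 only if every required hwloc utility is found.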
+declare -i HPCBIND_HAS_HWLOC=1
+type hwloc-bind >/dev/null 2>&1
+HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
+
+type hwloc-distrib >/dev/null 2>&1
+HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
+
+type hwloc-ls >/dev/null 2>&1
+HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
+
+type hwloc-calc >/dev/null 2>&1
+HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
+
+type hwloc-ps >/dev/null 2>&1
+HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
+
+if [[ ${HPCBIND_HAS_HWLOC} -eq 0 ]]; then
+  echo "hwloc not found, no process binding will occur"
+fi
+
+# Get parent cpuset
+HPCBIND_HWLOC_PARENT_CPUSET=""
+if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
+  HPCBIND_HWLOC_VERSION="$(hwloc-ls --version | cut -d ' ' -f 2)"
+  MY_PID="$BASHPID"
+  HPCBIND_HWLOC_PARENT_CPUSET="$(hwloc-ps -a --cpuset | grep ${MY_PID} | cut -f 2)"
+fi
+
+################################################################################
+# Check if nvidia-smi exist
+################################################################################
+declare -i HPCBIND_HAS_NVIDIA=0
+type nvidia-smi >/dev/null 2>&1
+HPCBIND_HAS_NVIDIA=$((!$?))
+
+
+################################################################################
+# Get visible gpu
+################################################################################
+declare -i NUM_GPUS=0
+HPCBIND_VISIBLE_GPUS=""
+if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
+  NUM_GPUS=$(nvidia-smi -L | wc -l);
+  HPCBIND_HAS_NVIDIA=$((!$?))
+  if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
+    GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )"
+    HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}}
+  fi
+fi
+
+declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0))
+
+
+################################################################################
+# Get queue id
+# supports mpich, openmpi, mvapich2, slurm, aprun, and bsub
+################################################################################
+HPCBIND_QUEUE_NAME=""
+declare -i HPCBIND_QUEUE_RANK=0
+declare -i HPCBIND_QUEUE_SIZE=0
+declare -i HPCBIND_QUEUE_MAPPING=0
+
+if [[ ! -z "${PMI_RANK}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="mpich"
+  HPCBIND_QUEUE_RANK=${PMI_RANK}
+  HPCBIND_QUEUE_SIZE=${PMI_SIZE}
+elif [[ ! -z "${OMPI_COMM_WORLD_RANK}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="openmpi"
+  HPCBIND_QUEUE_RANK=${OMPI_COMM_WORLD_RANK}
+  HPCBIND_QUEUE_SIZE=${OMPI_COMM_WORLD_SIZE}
+elif [[ ! -z "${MV2_COMM_WORLD_RANK}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="mvapich2"
+  HPCBIND_QUEUE_RANK=${MV2_COMM_WORLD_RANK}
+  HPCBIND_QUEUE_SIZE=${MV2_COMM_WORLD_SIZE}
+elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="slurm"
+  HPCBIND_QUEUE_RANK=${SLURM_PROCID}
+  HPCBIND_QUEUE_SIZE=${SLURM_NPROCS}
+elif [[ ! -z "${ALPS_APP_PE}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="aprun"
+  HPCBIND_QUEUE_RANK=${ALPS_APP_PE}
+elif [[ ! -z "${LBS_JOBINDEX}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="bsub"
+  HPCBIND_QUEUE_RANK=${LBS_JOBINDEX}
+fi
+
+################################################################################
+# Show help
+################################################################################
+function show_help {
+  local cmd=$(basename "$0")
+  echo "Usage: ${cmd} <options> -- command ..."
+  echo "  Set the process mask, OMP environment variables and CUDA environment"
+  echo "  variables to sane values if possible. Uses hwloc and nvidia-smi if"
+  echo "  available.  Will preserve the current process binding, so it is safe"
+  echo "  to use with a queuing system or mpiexec."
+  echo ""
+  echo "Options:"
+  echo "  --no-hwloc-bind       Disable binding"
+  echo "  --proc-bind=<LOC>     Set the initial process mask for the script"
+  echo "                        LOC can be any valid location argument for"
+  echo "                        hwloc-calc  Default: all"
+  echo "  --whole-system        ${cmd} will ignore the its parent process binding"
+  echo "  --distribute=N        Distribute the current cpuset into N partitions"
+  echo "  --distribute-partition=I"
+  echo "                        Use the i'th partition (zero based)"
+  echo "  --visible-gpus=<L>    Comma separated list of gpu ids"
+  echo "                        Default: CUDA_VISIBLE_DEVICES or all gpus in"
+  echo "                        sequential order"
+  echo "  --ignore-queue        Ignore queue job id when choosing visible GPU and partition"
+  echo "  --no-gpu-mapping      Do not set CUDA_VISIBLE_DEVICES"
+  echo "  --openmp=M.m          Set env variables for the given OpenMP version"
+  echo "                        Default: 4.0"
+  echo "  --openmp-ratio=N/D    Ratio of the cpuset to use for OpenMP"
+  echo "                        Default: 1"
+  echo "  --openmp-places=<Op>  Op=threads|cores|sockets. Default: threads"
+  echo "  --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
+  echo "  --force-openmp-num-threads=N"
+  echo "                        Override logic for selecting OMP_NUM_THREADS"
+  echo "  --force-openmp-proc-bind=<OP>"
+  echo "                        Override logic for selecting OMP_PROC_BIND"
+  echo "  --no-openmp-nested    Set OMP_NESTED to false"
+  echo "  --output-prefix=<P>   Save the output to files of the form"
+  echo "                        P.hpcbind.N, P.stdout.N and P.stderr.N where P is "
+  echo "                        the prefix and N is the rank (no spaces)"
+  echo "  --output-mode=<Op>    How console output should be handled."
+  echo "                        Options are all, rank0, and none.  Default: rank0" 
+  echo "  --lstopo              Show bindings in lstopo"
+  echo "  -v|--verbose          Print bindings and relevant environment variables"
+  echo "  -h|--help             Show this message"
+  echo ""
+  echo "Sample Usage:"
+  echo ""
+  echo "  Split the current process cpuset into 4 and use the 3rd partition"
+  echo "    ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..."
+  echo ""
+  echo "  Launch 16 jobs over 4 nodes with 4 jobs per node using only the even pus"
+  echo "  and save the output to rank specific files"
+  echo "    mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\"
+  echo "      --distribute=4 -v --output-prefix=output  -- command ..."
+  echo ""
+  echo "  Bind the process to all even cores"
+  echo "    ${cmd} --proc-bind=core:even -v -- command ..."
+  echo ""
+  echo "  Bind the the even cores of socket 0 and the odd cores of socket 1"
+  echo "    ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ..."
+  echo ""
+  echo "  Skip GPU 0 when mapping visible devices"
+  echo "    ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
+  echo ""
+  echo "  Display the current bindings"
+  echo "    ${cmd} --proc-bind=numa:0 -- command"
+  echo ""
+  echo "  Display the current bindings using lstopo"
+  echo "    ${cmd} --proc-bind=numa:0.core:odd --lstopo"
+  echo ""
+}
+
+
+################################################################################
+# Parse command line arguments
+################################################################################
+# Show help if no command line arguments given
+if [[ "$#" -eq 0 ]]; then
+  show_help
+  exit 0
+fi
+
+declare -a UNKNOWN_ARGS=()
+declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
+declare -i HPCBIND_DISTRIBUTE=1
+declare -i HPCBIND_PARTITION=-1
+HPCBIND_PROC_BIND="all"
+HPCBIND_OPENMP_VERSION=4.0
+declare -i HPCBIND_OPENMP_RATIO_NUMERATOR=1
+declare -i HPCBIND_OPENMP_RATIO_DENOMINATOR=1
+HPCBIND_OPENMP_PLACES=${OMP_PLACES:-threads}
+declare -i HPCBIND_OPENMP_PROC_BIND=1
+HPCBIND_OPENMP_FORCE_NUM_THREADS=""
+HPCBIND_OPENMP_FORCE_PROC_BIND=""
+declare -i HPCBIND_OPENMP_NESTED=1
+declare -i HPCBIND_VERBOSE=0
+
+declare -i HPCBIND_LSTOPO=0
+
+HPCBIND_OUTPUT_PREFIX=""
+HPCBIND_OUTPUT_MODE="rank0"
+
+declare -i HPCBIND_HAS_COMMAND=0
+
+for i in "$@"; do
+  case "$i" in
+    --no-hwloc-bind)
+      HPCBIND_ENABLE_HWLOC_BIND=0
+      shift
+      ;;
+    --proc-bind=*)
+      HPCBIND_PROC_BIND="${i#*=}"
+      shift
+      ;;
+    --whole-system)
+      HPCBIND_HWLOC_PARENT_CPUSET=""
+      shift
+      ;;
+    # number of partitions to create
+    --distribute=*)
+      HPCBIND_DISTRIBUTE="${i#*=}"
+      if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then
+        HPCBIND_DISTRIBUTE=1
+      fi
+      shift
+      ;;
+    # which partition to use
+    --distribute-partition=*)
+      HPCBIND_PARTITION="${i#*=}"
+      shift
+      ;;
+    --visible-gpus=*)
+      HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ')
+      shift
+      ;;
+    --ignore-queue)
+      HPCBIND_QUEUE_MAPPING=0
+      shift
+      ;;
+    --no-gpu-mapping)
+      HPCBIND_ENABLE_GPU_MAPPING=0
+      shift
+      ;;
+    --openmp=*)
+      HPCBIND_OPENMP_VERSION="${i#*=}"
+      shift
+      ;;
+    --openmp-ratio=*)
+      IFS=/ read HPCBIND_OPENMP_RATIO_NUMERATOR HPCBIND_OPENMP_RATIO_DENOMINATOR <<< "${i#*=}"
+      if [[ ${HPCBIND_OPENMP_RATIO_NUMERATOR} -le 0 ]]; then
+        HPCBIND_OPENMP_RATIO_NUMERATOR=1
+      fi
+      if [[ ${HPCBIND_OPENMP_RATIO_DENOMINATOR} -le 0 ]]; then
+        HPCBIND_OPENMP_RATIO_DENOMINATOR=1
+      fi
+      if [[ ${HPCBIND_OPENMP_RATIO_NUMERATOR} -gt ${HPCBIND_OPENMP_RATIO_DENOMINATOR} ]]; then
+        HPCBIND_OPENMP_RATIO_NUMERATOR=1
+        HPCBIND_OPENMP_RATIO_DENOMINATOR=1
+      fi
+      shift
+      ;;
+    --openmp-places=*)
+      HPCBIND_OPENMP_PLACES="${i#*=}"
+      shift
+      ;;
+    --no-openmp-proc-bind)
+      HPCBIND_OPENMP_PROC_BIND=0
+      shift
+      ;;
+    --force-openmp-proc-bind=*)
+      HPCBIND_OPENMP_FORCE_PROC_BIND="${i#*=}"
+      shift
+      ;;
+    --force-openmp-num-threads=*)
+      HPCBIND_OPENMP_FORCE_NUM_THREADS="${i#*=}"
+      shift
+      ;;
+    --no-openmp-nested)
+      HPCBIND_OPENMP_NESTED=0
+      shift
+      ;;
+    --output-prefix=*)
+      HPCBIND_OUTPUT_PREFIX="${i#*=}"
+      shift
+      ;;
+    --output-mode=*)
+      HPCBIND_OUTPUT_MODE="${i#*=}"
+      #convert to lower case
+      HPCBIND_OUTPUT_MODE="${HPCBIND_OUTPUT_MODE,,}"
+      shift
+      ;;
+    --lstopo)
+      HPCBIND_VERBOSE=1
+      HPCBIND_LSTOPO=1
+      shift
+      ;;
+    -v|--verbose)
+      HPCBIND_VERBOSE=1
+      shift
+      ;;
+    -h|--help)
+      show_help
+      exit 0
+      ;;
+    # ignore remaining arguments
+    --)
+      HPCBIND_HAS_COMMAND=1
+      shift
+      break
+      ;;
+    # unknown option
+    *)
+      UNKNOWN_ARGS+=("$i")
+      shift
+      ;;
+  esac
+done
+
+################################################################################
+# Check output mode
+################################################################################
+declare -i HPCBIND_TEE=0
+
+if [[ "${HPCBIND_OUTPUT_MODE}" == "none" ]]; then
+  HPCBIND_TEE=0
+elif [[ "${HPCBIND_OUTPUT_MODE}" == "all" ]]; then
+  HPCBIND_TEE=1
+elif [[ ${HPCBIND_QUEUE_RANK} -eq 0 ]]; then
+  #default to rank0 printing to screen
+  HPCBIND_TEE=1
+fi
+
+
+if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then
+  HPCBIND_LOG=/dev/null
+  HPCBIND_ERR=/dev/null
+  HPCBIND_OUT=/dev/null
+else
+  if [[ ${HPCBIND_QUEUE_SIZE} -gt 0 ]]; then
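+    # Zero-pad the rank to the width of the queue size so per-rank output files sort consistently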
+    HPCBIND_STR_QUEUE_SIZE="${HPCBIND_QUEUE_SIZE}"
+    HPCBIND_STR_QUEUE_RANK=$(printf %0*d ${#HPCBIND_STR_QUEUE_SIZE} ${HPCBIND_QUEUE_RANK})
+
+    HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_STR_QUEUE_RANK}"
+    HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_STR_QUEUE_RANK}"
+    HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_STR_QUEUE_RANK}"
+  else
+    HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_QUEUE_RANK}"
+    HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_QUEUE_RANK}"
+    HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_QUEUE_RANK}"
+  fi
+  > ${HPCBIND_LOG}
+fi
+
+
+################################################################################
+# Check unknown arguments
+################################################################################
+if [[ ${#UNKNOWN_ARGS[*]} -gt 0 ]]; then
+  echo "HPCBIND Unknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a ${HPCBIND_LOG})
+  exit 1
+fi
+
+################################################################################
+# Check that visible gpus are valid
+################################################################################
+HPCBIND_VISIBLE_GPUS=(${HPCBIND_VISIBLE_GPUS})
+if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
+  for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do
+    if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
+      ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
+      echo "HPCBIND Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]} (setting to 0)" > >(tee -a ${HPCBIND_LOG})
+      HPCBIND_VISIBLE_GPUS[$i]=0;
+    fi
+  done
+  NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
+fi
+
+
+################################################################################
+# Choose the correct partition
+################################################################################
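+# Default the partition to the queue rank when none was given, then wrap with
+# modulo so the partition index is always smaller than HPCBIND_DISTRIBUTE.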
+if [[ ${HPCBIND_PARTITION} -lt 0 && ${HPCBIND_QUEUE_MAPPING} -eq 1 ]]; then
+  HPCBIND_PARTITION=${HPCBIND_QUEUE_RANK}
+elif [[ ${HPCBIND_PARTITION} -lt 0 ]]; then
+  HPCBIND_PARTITION=0
+fi
+
+if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
+  HPCBIND_PARTITION=$((HPCBIND_PARTITION % HPCBIND_DISTRIBUTE))
+fi
+
+################################################################################
+# Find cpuset and num threads
+################################################################################
+HPCBIND_HWLOC_CPUSET=""
+declare -i HPCBIND_NUM_PUS=0
+
+if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
+  if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
+    BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND[*]})
+  else
+    BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND[*]})
+  fi
+
+  if [[ ${HPCBIND_DISTRIBUTE} -gt 1 ]]; then
+    CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
+    HPCBIND_HWLOC_CPUSET="${CPUSETS[${HPCBIND_PARTITION}]}"
+  else
+    HPCBIND_HWLOC_CPUSET="${BINDING}"
+  fi
+  HPCBIND_NUM_PUS=$(hwloc-calc -q -N pu ${HPCBIND_HWLOC_CPUSET} )
+  if [ $? -ne 0 ]; then
+    HPCBIND_NUM_PUS=1
+  fi
+  HPCBIND_NUM_CORES=$(hwloc-calc -q -N core ${HPCBIND_HWLOC_CPUSET} )
+  if [ $? -ne 0 ]; then
+    HPCBIND_NUM_CORES=1
+  fi
+  HPCBIND_NUM_NUMAS=$(hwloc-calc -q -N numa ${HPCBIND_HWLOC_CPUSET} )
+  if [ $? -ne 0 ]; then
+    HPCBIND_NUM_NUMAS=1
+  fi
+  HPCBIND_NUM_SOCKETS=$(hwloc-calc -q -N socket ${HPCBIND_HWLOC_CPUSET} )
+  if [ $? -ne 0 ]; then
+    HPCBIND_NUM_SOCKETS=1
+  fi
+else
+  HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor)
+  HPCBIND_NUM_CORES=${HPCBIND_NUM_PUS}
+  HPCBIND_NUM_NUMAS=1
+  HPCBIND_NUM_SOCKETS=1
+fi
+
+
+if [[ ${HPCBIND_OPENMP_FORCE_NUM_THREADS} != "" ]]; then
+  HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_OPENMP_FORCE_NUM_THREADS}
+else
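+  # Scale the number of available PUs by the requested ratio, then clamp the result to [1, HPCBIND_NUM_PUS]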
+  declare -i HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_NUM_PUS * HPCBIND_OPENMP_RATIO_NUMERATOR / HPCBIND_OPENMP_RATIO_DENOMINATOR))
+
+  if [[ ${HPCBIND_OPENMP_NUM_THREADS} -lt 1 ]]; then
+    HPCBIND_OPENMP_NUM_THREADS=1
+  elif [[ ${HPCBIND_OPENMP_NUM_THREADS} -gt ${HPCBIND_NUM_PUS} ]]; then
+    HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_NUM_PUS}
+  fi
+fi
+
+################################################################################
+# Set OpenMP environment variables
+################################################################################
+
+# set OMP_NUM_THREADS
+if [[ ${HPCBIND_OPENMP_NESTED} -eq 1 ]]; then
+  export OMP_NUM_THREADS="${HPCBIND_OPENMP_NUM_THREADS},1"
+else
+  export OMP_NUM_THREADS=${HPCBIND_OPENMP_NUM_THREADS}
+fi
+
+# set OMP_PROC_BIND and OMP_PLACES
+if [[ ${HPCBIND_OPENMP_PROC_BIND} -eq 1 ]]; then
+  if [[ "${HPCBIND_OPENMP_FORCE_PROC_BIND}" == "" ]]; then
+    #default proc bind logic
+    if [[ "${HPCBIND_OPENMP_VERSION}" == "4.0" || "${HPCBIND_OPENMP_VERSION}" > "4.0" ]]; then
+      export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
+      if [[ ${HPCBIND_OPENMP_NESTED} -eq 1 ]]; then
+        export OMP_PROC_BIND="spread,spread"
+      else
+        export OMP_PROC_BIND="spread"
+      fi
+    else
+      export OMP_PROC_BIND="true"
+      unset OMP_PLACES
+    fi
+  else
+    #force proc bind
+    export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
+    export OMP_PROC_BIND="${HPCBIND_OPENMP_FORCE_PROC_BIND}"
+  fi
+else
+  # no openmp proc bind
+  unset OMP_PLACES
+  unset OMP_PROC_BIND
+fi
+
+# set up hot teams (intel specific)
+if [[ ${HPCBIND_OPENMP_NESTED} -eq 1 ]]; then
+  export OMP_NESTED="true"
+  export OMP_MAX_ACTIVE_LEVELS=2
+  export KMP_HOT_TEAMS=1
+  export KMP_HOT_TEAMS_MAX_LEVEL=2
+else
+  export OMP_NESTED="false"
+fi
+
+################################################################################
+# Set CUDA environment variables
+################################################################################
+
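+# Pick a single GPU for this process: round-robin over the visible GPUs by
+# partition when no queue rank is available, otherwise by the global task id
+# (queue rank * number of partitions + partition).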
+if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
+  if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then
+    declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
+    export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
+  else
+    declare -i MY_TASK_ID=$((HPCBIND_QUEUE_RANK * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
+    declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
+    export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
+  fi
+fi
+
+################################################################################
+# Set hpcbind environment variables
+################################################################################
+export HPCBIND_HWLOC_VERSION=${HPCBIND_HWLOC_VERSION}
+export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC}
+export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA}
+export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS}
+export HPCBIND_NUM_CORES=${HPCBIND_NUM_CORES}
+export HPCBIND_NUM_NUMAS=${HPCBIND_NUM_NUMAS}
+export HPCBIND_NUM_SOCKETS=${HPCBIND_NUM_SOCKETS}
+export HPCBIND_HWLOC_CPUSET="${HPCBIND_HWLOC_CPUSET}"
+export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE}
+export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION}
+export HPCBIND_OPENMP_RATIO="${HPCBIND_OPENMP_RATIO_NUMERATOR}/${HPCBIND_OPENMP_RATIO_DENOMINATOR}"
+if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
+  export HPCBIND_HWLOC_PARENT_CPUSET="all"
+else
+  export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}"
+fi
+export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}"
+export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
+export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
+export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}"
+if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then
+  export HPCBIND_QUEUE_RANK=${HPCBIND_QUEUE_RANK}
+  export HPCBIND_QUEUE_SIZE=${HPCBIND_QUEUE_SIZE}
+  export HPCBIND_QUEUE_NAME="${HPCBIND_QUEUE_NAME}"
+  export HPCBIND_QUEUE_MAPPING=${HPCBIND_QUEUE_MAPPING}
+fi
+
+
+################################################################################
+# Print verbose
+################################################################################
+
+TMP_ENV=$(env | sort)
+if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then
+  echo "[HOST]" >> ${HPCBIND_LOG}
+  hostname -s >> ${HPCBIND_LOG}
+  echo "[HPCBIND]" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG}
+  echo "[CUDA]" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG}
+  echo "[OPENMP]" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG}
+  echo "[GOMP] (gcc, g++, and gfortran)" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^GOMP_" >> ${HPCBIND_LOG}
+  echo "[KMP] (icc, icpc, and ifort)" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^KMP_" >> ${HPCBIND_LOG}
+  echo "[XLSMPOPTS] (xlc, xlc++, and xlf)" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^XLSMPOPTS" >> ${HPCBIND_LOG}
+
+  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
+    echo "[BINDINGS]" >> ${HPCBIND_LOG}
+    hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" >> ${HPCBIND_LOG}
+  else
+    echo "Unable to show bindings, hwloc not available." >> ${HPCBIND_LOG}
+  fi
+else
+  echo "[HOST]" > >(tee -a ${HPCBIND_LOG})
+  hostname -s > >(tee -a ${HPCBIND_LOG})
+  echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG})
+  echo "[CUDA]" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG})
+  echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG})
+  echo "[GOMP] (gcc, g++, and gfortran)" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^GOMP_" > >(tee -a ${HPCBIND_LOG})
+  echo "[KMP] (icc, icpc, and ifort)" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^KMP_" > >(tee -a ${HPCBIND_LOG})
+  echo "[XLSMPOPTS] (xlc, xlc++, and xlf)" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^XLSMPOPTS" > >(tee -a ${HPCBIND_LOG})
+
+  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
+    echo "[BINDINGS]" > >(tee -a ${HPCBIND_LOG})
+    hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --no-io --no-bridges > >(tee -a ${HPCBIND_LOG})
+  else
+    echo "Unable to show bindings, hwloc not available." > >(tee -a ${HPCBIND_LOG})
+  fi
+fi
+
+################################################################################
+# Run command
+################################################################################
+
+# must be the last executed command so that the return value is correct
+if [[ ${HPCBIND_LSTOPO} -eq 1 && ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
+  hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- lstopo --pid 0
+elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then
+  # clear output files
+  > ${HPCBIND_ERR}
+  > ${HPCBIND_OUT}
+  if [[ ${HPCBIND_TEE} -eq 0 ]]; then
+    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
+      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
+    else
+      eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
+    fi
+  else
+    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
+      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
+    else
+      eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
+    fi
+  fi
+fi
diff --git a/packages/kokkos/bin/nvcc_wrapper b/packages/kokkos/bin/nvcc_wrapper
new file mode 100755
index 0000000000000000000000000000000000000000..d339da4fcdfa8029af85073adc068a8686253b8c
--- /dev/null
+++ b/packages/kokkos/bin/nvcc_wrapper
@@ -0,0 +1,340 @@
+#!/bin/bash
+#
+# This shell script (nvcc_wrapper) wraps both the host compiler and
+# NVCC, if you are building legacy C or C++ code with CUDA enabled.
+# The script remedies some differences between the interface of NVCC
+# and that of the host compiler, in particular for linking.
+# It also means that a legacy code doesn't need separate .cu files;
+# it can just use .cpp files.
+#
+# Default settings: change those according to your machine.  For
+# example, you may have two different wrappers with either icpc
+# or g++ as their back-end compiler.  The defaults can be overwritten
+# by using the usual arguments (e.g., -arch=sm_30 -ccbin icpc).
+
+default_arch="sm_35"
+#default_arch="sm_50"
+
+#
+# The default C++ compiler.
+#
+host_compiler=${NVCC_WRAPPER_DEFAULT_COMPILER:-"g++"}
+#host_compiler="icpc"
+#host_compiler="/usr/local/gcc/4.8.3/bin/g++"
+#host_compiler="/usr/local/gcc/4.9.1/bin/g++"
+
+#
+# Internal variables
+#
+
+# C++ files
+cpp_files=""
+
+# Host compiler arguments
+xcompiler_args=""
+
+# Cuda (NVCC) only arguments
+cuda_args=""
+
+# Arguments for both NVCC and Host compiler
+shared_args=""
+
+# Argument -c
+compile_arg=""
+
+# Argument -o <obj>
+output_arg=""
+
+# Linker arguments
+xlinker_args=""
+
+# Object files passable to NVCC
+object_files=""
+
+# Link objects for the host linker only
+object_files_xlinker=""
+
+# Shared libraries with version numbers are not handled correctly by NVCC
+shared_versioned_libraries_host=""
+shared_versioned_libraries=""
+
+# Did the user set the architecture?
+arch_set=0
+
+# Did the user override the host compiler?
+ccbin_set=0
+
+#Error code of compilation
+error_code=0
+
+# Do a dry run without actually compiling
+dry_run=0
+
+# Skip NVCC compilation and use host compiler directly
+host_only=0
+host_only_args=""
+
+# Enable workaround for CUDA 6.5 for pragma ident 
+replace_pragma_ident=0
+
+# Mark first host compiler argument
+first_xcompiler_arg=1
+
+temp_dir=${TMPDIR:-/tmp}
+
+# Check if we have an optimization argument already
+optimization_applied=0
+
+# Check if we have -std=c++X  or --std=c++X already
+stdcxx_applied=0
+
+# Run nvcc a second time to generate dependencies if needed
+depfile_separate=0
+depfile_output_arg=""
+depfile_target_arg=""
+
+#echo "Arguments: $# $@"
+
+while [ $# -gt 0 ]
+do
+  case $1 in
+  #show the executed command
+  --show|--nvcc-wrapper-show)
+    dry_run=1
+    ;;
+  #run host compilation only
+  --host-only)
+    host_only=1
+    ;;
+  #replace '#pragma ident' with '#ident'; this is needed to compile OpenMPI due to a configure script bug and the non-standardized behaviour of pragma with macros
+  --replace-pragma-ident)
+    replace_pragma_ident=1
+    ;;
+  #handle source files to be compiled as cuda files
+  *.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
+    cpp_files="$cpp_files $1"
+    ;;
+   # Ensure we only have one optimization flag because NVCC doesn't allow multiple
+  -O*)
+    if [ $optimization_applied -eq 1 ]; then
+       echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the first is used because nvcc can only accept a single optimization setting."
+    else
+       shared_args="$shared_args $1"
+       optimization_applied=1
+    fi
+    ;;
+  #Handle shared args (valid for both nvcc and the host compiler)
+  -D*|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
+    shared_args="$shared_args $1"
+    ;;
+  #Handle compilation argument
+  -c)
+    compile_arg="$1"
+    ;;
+  #Handle output argument
+  -o)
+    output_arg="$output_arg $1 $2"
+    shift
+    ;;
+  # Handle depfile arguments.  We map them to a separate call to nvcc.
+  -MD|-MMD)
+    depfile_separate=1
+    host_only_args="$host_only_args $1"
+    ;;
+  -MF)
+    depfile_output_arg="-o $2"
+    host_only_args="$host_only_args $1 $2"
+    shift
+    ;;
+  -MT)
+    depfile_target_arg="$1 $2"
+    host_only_args="$host_only_args $1 $2"
+    shift
+    ;;
+  #Handle known nvcc args
+  -gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
+    cuda_args="$cuda_args $1"
+    ;;
+  #Handle more known nvcc args
+  --expt-extended-lambda|--expt-relaxed-constexpr)
+    cuda_args="$cuda_args $1"
+    ;;
+  #Handle known nvcc args that have an argument
+  -rdc|-maxrregcount|--default-stream)
+    cuda_args="$cuda_args $1 $2"
+    shift
+    ;;
+  #Handle c++11
+  --std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z)
+    if [ $stdcxx_applied -eq 1 ]; then
+       echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting"
+    else
+       shared_args="$shared_args $1"
+       stdcxx_applied=1
+    fi
+    ;;
+
+  #strip off -std=c++98 due to nvcc warnings; Tribits will place both -std=c++11 and -std=c++98
+  -std=c++98|--std=c++98)
+    ;;
+  #strip off -pedantic because it produces endless warnings about #LINE added by the preprocessor
+  -pedantic|-Wpedantic|-ansi)
+    ;;
+  #strip off -Woverloaded-virtual to avoid "cc1: warning: command line option ‘-Woverloaded-virtual’ is valid for C++/ObjC++ but not for C"
+  -Woverloaded-virtual)
+    ;;
+  #strip -Xcompiler because we add it
+  -Xcompiler)
+    if [ $first_xcompiler_arg -eq 1 ]; then
+      xcompiler_args="$2"
+      first_xcompiler_arg=0
+    else
+      xcompiler_args="$xcompiler_args,$2"
+    fi
+    shift
+    ;;
+  #strip of "-x cu" because we add that
+  -x)
+    if [[ $2 != "cu" ]]; then
+      if [ $first_xcompiler_arg -eq 1 ]; then
+        xcompiler_args="-x,$2"
+        first_xcompiler_arg=0
+      else
+        xcompiler_args="$xcompiler_args,-x,$2"
+      fi
+    fi
+    shift
+    ;;
+  #Handle -ccbin (if it's not set we can set it to a default value)
+  -ccbin)
+    cuda_args="$cuda_args $1 $2"
+    ccbin_set=1
+    host_compiler=$2
+    shift
+    ;;
+  #Handle -arch argument (if it's not set, use a default)
+  -arch*)
+    cuda_args="$cuda_args $1"
+    arch_set=1
+    ;;
+  #Handle -Xcudafe argument
+  -Xcudafe)
+    cuda_args="$cuda_args -Xcudafe $2"
+    shift
+    ;;
+  #Handle args that should be sent to the linker
+  -Wl*)
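+    # ${1:4:${#1}} drops the leading "-Wl," so the remaining linker flags can be forwarded via -Xlinker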
+    xlinker_args="$xlinker_args -Xlinker ${1:4:${#1}}"
+    host_linker_args="$host_linker_args ${1:4:${#1}}"
+    ;;
+  #Handle object files: -x cu applies to all input files, so give them to linker, except if only linking
+  *.a|*.so|*.o|*.obj)
+    object_files="$object_files $1"
+    object_files_xlinker="$object_files_xlinker -Xlinker $1"
+    ;;
+  #Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
+  @*|*.dylib)
+    object_files="$object_files -Xlinker $1"
+    object_files_xlinker="$object_files_xlinker -Xlinker $1"
+    ;;
+  #Handle shared libraries with *.so.* names, which nvcc can't handle.
+  *.so.*)
+    shared_versioned_libraries_host="$shared_versioned_libraries_host $1"
+    shared_versioned_libraries="$shared_versioned_libraries -Xlinker $1"
+  ;;
+  #All other args are sent to the host compiler
+  *)
+    if [ $first_xcompiler_arg -eq 1 ]; then
+      xcompiler_args=$1
+      first_xcompiler_arg=0
+    else 
+      xcompiler_args="$xcompiler_args,$1"
+    fi
+    ;;
+  esac
+
+  shift
+done
+
+#Add default host compiler if necessary
+if [ $ccbin_set -ne 1 ]; then
+  cuda_args="$cuda_args -ccbin $host_compiler"
+fi
+
+#Add architecture command
+if [ $arch_set -ne 1 ]; then
+  cuda_args="$cuda_args -arch=$default_arch"
+fi
+
+#Compose compilation command
+nvcc_command="nvcc $cuda_args $shared_args $xlinker_args $shared_versioned_libraries"
+if [ $first_xcompiler_arg -eq 0 ]; then
+  nvcc_command="$nvcc_command -Xcompiler $xcompiler_args"
+fi
+
+#Compose host only command
+host_command="$host_compiler $shared_args $host_only_args $compile_arg $output_arg $xcompiler_args $host_linker_args $shared_versioned_libraries_host"
+
+#nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING'
+if [ $replace_pragma_ident -eq 1 ]; then
+  cpp_files2=""
+  for file in $cpp_files
+  do
+    var=`grep pragma ${file} | grep ident | grep "#"`
+    if [ "${#var}" -gt 0 ]
+    then
+      sed 's/#[\ \t]*pragma[\ \t]*ident/#ident/g' $file > $temp_dir/nvcc_wrapper_tmp_$file
+      cpp_files2="$cpp_files2 $temp_dir/nvcc_wrapper_tmp_$file"
+    else
+      cpp_files2="$cpp_files2 $file"
+    fi
+  done
+  cpp_files=$cpp_files2
+  #echo $cpp_files
+fi
+
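+# With "-x cu" every positional input would be treated as a CUDA source, so
+# object files are handed to the linker via -Xlinker when sources are compiled.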
+if [ "$cpp_files" ]; then
+  nvcc_command="$nvcc_command $object_files_xlinker -x cu $cpp_files"
+else
+  nvcc_command="$nvcc_command $object_files"
+fi
+
+if [ "$cpp_files" ]; then
+  host_command="$host_command $object_files $cpp_files"
+else
+  host_command="$host_command $object_files"
+fi
+
+if [ $depfile_separate -eq 1 ]; then
+  # run nvcc a second time to generate dependencies (without compiling)
+  nvcc_depfile_command="$nvcc_command -M $depfile_target_arg $depfile_output_arg"
+else
+  nvcc_depfile_command=""
+fi
+
+nvcc_command="$nvcc_command $compile_arg $output_arg"
+
+#Print command for dryrun
+if [ $dry_run -eq 1 ]; then
+  if [ $host_only -eq 1 ]; then
+    echo $host_command
+  elif [ -n "$nvcc_depfile_command" ]; then
+    echo $nvcc_command "&&" $nvcc_depfile_command
+  else
+    echo $nvcc_command
+  fi
+  exit 0
+fi
+
+#Run compilation command
+if [ $host_only -eq 1 ]; then
+  $host_command
+elif [ -n "$nvcc_depfile_command" ]; then
+  $nvcc_command && $nvcc_depfile_command
+else
+  $nvcc_command
+fi
+error_code=$?
+
+#Report error code
+exit $error_code
diff --git a/packages/kokkos/bin/runtest b/packages/kokkos/bin/runtest
new file mode 100755
index 0000000000000000000000000000000000000000..92411fe5badf5398b3e2cee325161f225d98f33a
--- /dev/null
+++ b/packages/kokkos/bin/runtest
@@ -0,0 +1,165 @@
+#!/usr/bin/env bash
+
+function get_path() {
+  cd "$(dirname "$0")"
+  cd ..
+  echo "$(pwd -P)"
+}
+
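+# Resolve the repository root as the parent of the bin/ directory containing this script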
+KOKKOS_PATH="$(get_path "$0")"
+
+function show_help() {
+  local cmd=$(basename "$0")
+  echo "Usage: ${cmd} <options> "
+  echo "  Build and run the tests"
+  echo ""
+  echo "Options:"
+  echo "  -j=N|--make-j=N        Build the tests in parallel"
+  echo "  -c|--clean             Clean build and regenerate make files"
+  echo "  --clean-on-pass        Clean build when runtest passes"
+  echo "  --output-prefix=<pre>  Prefix of log files  Default: runtest"
+  echo "  --build-only           Only build the tests"
+  echo "  -v|--verbose           Tee STDOUT and STDERR to screen and files"
+  echo "  -h|--help              Show this message"
+  echo ""
+  ${KOKKOS_PATH}/generate_makefile.bash --help
+  return 0
+}
+
+
+declare -a GENERATE_ARGS=()
+declare -i VERBOSE=0
+declare -i CLEAN=0
+declare -i CLEAN_ON_PASS=0
+declare -i BUILD_ONLY=0
+OUTPUT="runtest"
+
+declare -i MAKE_J=${HPCBIND_NUM_PUS:-1}
+
+for i in $@; do
+  case $i in
+    -j=*|--make-j=*)
+      MAKE_J=${i#*=}
+      shift
+      ;;
+    -c|--clean)
+      CLEAN=1
+      shift
+      ;;
+    --clean-on-pass)
+      CLEAN_ON_PASS=1
+      shift
+      ;;
+    --output-prefix=*)
+      OUTPUT=${i#*=}
+      shift
+      ;;
+    --build-only)
+      BUILD_ONLY=1
+      shift
+      ;;
+    -v|--verbose)
+      VERBOSE=1
+      shift
+      ;;
+    -h|--help)
+      show_help
+      exit 0
+      ;;
+    *)
+      GENERATE_ARGS+=("$i")
+      shift
+      ;;
+  esac
+done
+
+if [[ "$(pwd -P)" == ${KOKKOS_PATH} ]]; then
+  echo "Cannot call $0 from root repository path ${KOKKOS_PATH}"
+  exit 1
+fi
+
+# Some makefile dependencies are incorrect, so clean needs to force
+# a new call to generate_makefile.bash
+if [[ ${CLEAN} -eq 1 ]]; then
+  START=${SECONDS}
+  echo "Cleaning"
+  /bin/rm -rf algorithms containers core example install Makefile >/dev/null 2>&1
+  END=${SECONDS}
+  echo "    $((END-START)) seconds"
+  if [[ ${VERBOSE} -eq 1 ]]; then
+    echo ""
+    echo ""
+  fi
+fi
+
+declare -i START=${SECONDS}
+echo "Generating Makefile"
+echo "    ${KOKKOS_PATH}/generate_makefile.bash --kokkos-path=${KOKKOS_PATH} ${GENERATE_ARGS[@]}"
+
+if [[ ${VERBOSE} -eq 0 ]]; then
+  "${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > ${OUTPUT}.out 2> >(tee ${OUTPUT}.err >&2)
+else
+  "${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > >(tee ${OUTPUT}.out) 2> >(tee ${OUTPUT}.err >&2)
+fi
+declare -i RESULT=$?
+declare -i END=${SECONDS}
+if [[ ${RESULT} -eq 0 ]]; then
+  echo "    PASS:  $((END-START)) seconds"
+  if [[ ${VERBOSE} -eq 1 ]]; then
+    echo ""
+    echo ""
+  fi
+else
+  cat ${OUTPUT}.out | grep "FAIL"
+  cat ${OUTPUT}.err | grep "FAIL"
+  echo "    FAIL:  $((END-START)) seconds"
+  exit 1
+fi
+
+START=${SECONDS}
+echo "Building"
+if [[ ${VERBOSE} -eq 0 ]]; then
+  make --keep-going -j ${MAKE_J} build-test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
+else
+  make --keep-going -j ${MAKE_J} build-test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
+fi
+RESULT=$?
+END=${SECONDS}
+if [[ ${RESULT} -eq 0 ]]; then
+  echo "    PASS:  $((END-START)) seconds"
+  if [[ ${VERBOSE} -eq 1 ]]; then
+    echo ""
+    echo ""
+  fi
+else
+  cat ${OUTPUT}.out | grep -E "[[:space:]]error:[[:space:]]"
+  cat ${OUTPUT}.err | grep -E "[[:space:]]error:[[:space:]]"
+  echo "    FAIL:  $((END-START)) seconds"
+  exit 1
+fi
+
+if [[ ${BUILD_ONLY} -eq 0 ]]; then
+  START=${SECONDS}
+  echo "Testing"
+  if [[ ${VERBOSE} -eq 0 ]]; then
+    make --keep-going test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
+  else
+    make --keep-going test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
+  fi
+  RESULT=$?
+  END=${SECONDS}
+  if [[ ${RESULT} -eq 0 ]]; then
+    echo "    PASS:  $((END-START)) seconds"
+    if [[ ${CLEAN_ON_PASS} -eq 1 ]]; then
+      make clean
+    fi
+  else
+    cat ${OUTPUT}.out | grep "FAIL"
+    cat ${OUTPUT}.err | grep "FAIL"
+    echo "    FAIL:  $((END-START)) seconds"
+    exit 1
+  fi
+fi
+
+exit ${RESULT}
+
diff --git a/packages/kokkos/cmake/Dependencies.cmake b/packages/kokkos/cmake/Dependencies.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..8c51eab4d78b68f9c01e64f63352a22cf8f2086d
--- /dev/null
+++ b/packages/kokkos/cmake/Dependencies.cmake
@@ -0,0 +1,10 @@
+TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
+  SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS
+    #SubPackageName       Directory         Class    Req/Opt
+    #
+    # New Kokkos subpackages:
+    Core                  core              PS       REQUIRED
+    Containers            containers        PS       OPTIONAL
+    Algorithms            algorithms        PS       OPTIONAL
+    Example               example           EX       OPTIONAL
+  )
diff --git a/packages/kokkos/cmake/KokkosConfig.cmake.in b/packages/kokkos/cmake/KokkosConfig.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..fc099a494ce25af9068e113a688904e06458fcbe
--- /dev/null
+++ b/packages/kokkos/cmake/KokkosConfig.cmake.in
@@ -0,0 +1,18 @@
+# - Config file for the Kokkos package
+# It defines the following variables
+#  Kokkos_INCLUDE_DIRS - include directories for Kokkos
+#  Kokkos_LIBRARIES    - libraries to link against
+
+# Compute paths
+GET_FILENAME_COMPONENT(Kokkos_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+SET(Kokkos_INCLUDE_DIRS "@CONF_INCLUDE_DIRS@")
+
+# Our library dependencies (contains definitions for IMPORTED targets)
+IF(NOT TARGET kokkos AND NOT Kokkos_BINARY_DIR)
+  INCLUDE("${Kokkos_CMAKE_DIR}/KokkosTargets.cmake")
+ENDIF()
+
+# These are IMPORTED targets created by KokkosTargets.cmake
+SET(Kokkos_LIBRARY_DIRS @INSTALL_LIB_DIR@)
+SET(Kokkos_LIBRARIES @Kokkos_LIBRARIES_NAMES@)
+SET(Kokkos_TPL_LIBRARIES @KOKKOS_LIBS@)
diff --git a/packages/kokkos/cmake/Makefile.generate_cmake_settings b/packages/kokkos/cmake/Makefile.generate_cmake_settings
new file mode 100644
index 0000000000000000000000000000000000000000..da076b23db2697f4abb874cd5c154c08e99ad6cb
--- /dev/null
+++ b/packages/kokkos/cmake/Makefile.generate_cmake_settings
@@ -0,0 +1,8 @@
+ifndef KOKKOS_PATH
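+  # When KOKKOS_PATH is not set by the caller, derive it from this makefile's location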
+  MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+  KOKKOS_PATH = $(subst Makefile,,$(MAKEFILE_PATH))..
+endif
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+include $(KOKKOS_PATH)/core/src/Makefile.generate_header_lists
+include $(KOKKOS_PATH)/core/src/Makefile.generate_build_files
diff --git a/packages/kokkos/cmake/Modules/FindHWLOC.cmake b/packages/kokkos/cmake/Modules/FindHWLOC.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..60df8084d80815f79f0215c17b40f80db4c791c0
--- /dev/null
+++ b/packages/kokkos/cmake/Modules/FindHWLOC.cmake
@@ -0,0 +1,20 @@
+#.rst:
+# FindHWLOC
+# ----------
+#
+# Try to find HWLOC, based on KOKKOS_HWLOC_DIR
+#
+# The following variables are defined:
+#
+#   HWLOC_FOUND - System has HWLOC
+#   HWLOC_INCLUDE_DIR - HWLOC include directory
+#   HWLOC_LIBRARIES - Libraries needed to use HWLOC
+
+find_path(HWLOC_INCLUDE_DIR hwloc.h PATHS "${KOKKOS_HWLOC_DIR}/include")
+find_library(HWLOC_LIBRARIES hwloc PATHS "${KOKKOS_HWLOC_DIR}/lib")
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(HWLOC DEFAULT_MSG
+                                  HWLOC_INCLUDE_DIR HWLOC_LIBRARIES)
+
+mark_as_advanced(HWLOC_INCLUDE_DIR HWLOC_LIBRARIES)
diff --git a/packages/kokkos/cmake/Modules/FindMemkind.cmake b/packages/kokkos/cmake/Modules/FindMemkind.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..245fb44c19a2ff71a389c5f139cb3f6ac38f924e
--- /dev/null
+++ b/packages/kokkos/cmake/Modules/FindMemkind.cmake
@@ -0,0 +1,20 @@
+#.rst:
+# FindMemkind
+# ----------
+#
+# Try to find Memkind.
+#
+# The following variables are defined:
+#
+#   MEMKIND_FOUND - System has Memkind
+#   MEMKIND_INCLUDE_DIR - Memkind include directory
+#   MEMKIND_LIBRARIES - Libraries needed to use Memkind
+
+find_path(MEMKIND_INCLUDE_DIR memkind.h)
+find_library(MEMKIND_LIBRARIES memkind)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Memkind DEFAULT_MSG
+  MEMKIND_INCLUDE_DIR MEMKIND_LIBRARIES)
+
+mark_as_advanced(MEMKIND_INCLUDE_DIR MEMKIND_LIBRARIES)
diff --git a/packages/kokkos/cmake/Modules/FindQthreads.cmake b/packages/kokkos/cmake/Modules/FindQthreads.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..a254b0e996d23e01897f5f186a316c285f64e9ee
--- /dev/null
+++ b/packages/kokkos/cmake/Modules/FindQthreads.cmake
@@ -0,0 +1,20 @@
+#.rst:
+# FindQthreads
+# ----------
+#
+# Try to find Qthreads.
+#
+# The following variables are defined:
+#
+#   QTHREADS_FOUND - System has Qthreads
+#   QTHREADS_INCLUDE_DIR - Qthreads include directory
+#   QTHREADS_LIBRARIES - Libraries needed to use Qthreads
+
+find_path(QTHREADS_INCLUDE_DIR qthread.h)
+find_library(QTHREADS_LIBRARIES qthread)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Qthreads DEFAULT_MSG
+                                  QTHREADS_INCLUDE_DIR QTHREADS_LIBRARIES)
+
+mark_as_advanced(QTHREADS_INCLUDE_DIR QTHREADS_LIBRARIES)
diff --git a/packages/kokkos/cmake/deps/CUDA.cmake b/packages/kokkos/cmake/deps/CUDA.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..801c20067b9195db5ba5e6cd6fdd62a426e6e294
--- /dev/null
+++ b/packages/kokkos/cmake/deps/CUDA.cmake
@@ -0,0 +1,79 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+# Check for CUDA support
+
+SET(_CUDA_FAILURE OFF)
+
+# Have CMake find CUDA
+IF(NOT _CUDA_FAILURE)
+  FIND_PACKAGE(CUDA 3.2)
+  IF (NOT CUDA_FOUND)
+    SET(_CUDA_FAILURE ON)
+  ENDIF()
+ENDIF()
+
+IF(NOT _CUDA_FAILURE)
+  # if we haven't hit a failure
+  macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target)
+    TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY)
+  endmacro()
+  GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS)
+  GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE})
+  GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY})
+  TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
+ELSE()
+  SET(TPL_ENABLE_CUDA OFF)
+ENDIF()
diff --git a/packages/kokkos/cmake/deps/CUSPARSE.cmake b/packages/kokkos/cmake/deps/CUSPARSE.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..6f26d857c09acf7bb24c2c5449a54f5d507deae8
--- /dev/null
+++ b/packages/kokkos/cmake/deps/CUSPARSE.cmake
@@ -0,0 +1,64 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+#include(${TRIBITS_DEPS_DIR}/CUDA.cmake)
+
+#IF (TPL_ENABLE_CUDA)
+#  GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
+#  GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
+#  GLOBAL_SET(TPL_CUSPARSE_LIBRARIES    ${CUDA_cusparse_LIBRARY})
+#  TRIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
+#ENDIF()
+
diff --git a/packages/kokkos/cmake/deps/HWLOC.cmake b/packages/kokkos/cmake/deps/HWLOC.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..275abd3a5d4ecfb3ce3b207f978959f6f9019061
--- /dev/null
+++ b/packages/kokkos/cmake/deps/HWLOC.cmake
@@ -0,0 +1,70 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+
+#-----------------------------------------------------------------------------
+#  Hardware locality detection and control library.
+#
+#  Acquisition information:
+#    Date checked:  November 2011
+#    Checked by:    H. Carter Edwards <hcedwar AT sandia.gov>
+#    Source:        http://www.open-mpi.org/projects/hwloc/
+#    Version:       1.3
+#
+
+TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC
+  REQUIRED_HEADERS hwloc.h
+  REQUIRED_LIBS_NAMES "hwloc"
+  )
diff --git a/packages/kokkos/cmake/deps/Pthread.cmake b/packages/kokkos/cmake/deps/Pthread.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..46d0a939cad0e6c5479cb20da1d37ba5ca509b8c
--- /dev/null
+++ b/packages/kokkos/cmake/deps/Pthread.cmake
@@ -0,0 +1,83 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+
+SET(USE_THREADS FALSE)
+
+IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES)
+  # Use CMake's Thread finder since it is a bit smarter in determining
+  # whether pthreads is already built into the compiler and doesn't need
+  # a library to link.
+  FIND_PACKAGE(Threads)
+  # If Threads found a copy of pthreads, check whether it is one of the cases
+  # the TriBITS TPL system cannot handle (no extra library needed, or just -pthread).
+  IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
+    IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread")
+      SET(USE_THREADS TRUE)
+    ENDIF()
+  ENDIF()
+ENDIF()
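+# For example (typical values, not guaranteed): with GCC on Linux,
+# CMAKE_THREAD_LIBS_INIT usually comes back as "-pthread", so USE_THREADS
+# becomes TRUE and no separate pthread library needs to be located below.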
+
+IF(USE_THREADS)
+  SET(TPL_Pthread_INCLUDE_DIRS "")
+  SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
+  SET(TPL_Pthread_LIBRARY_DIRS "")
+  TRIBITS_CREATE_IMPORTED_TPL_LIBRARY(Pthread)
+ELSE()
+  TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread
+    REQUIRED_HEADERS pthread.h
+    REQUIRED_LIBS_NAMES pthread
+      )
+ENDIF()
diff --git a/packages/kokkos/cmake/deps/QTHREADS.cmake b/packages/kokkos/cmake/deps/QTHREADS.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..c312f2590bcd29197a0cf3fbd5e0b484579a09c2
--- /dev/null
+++ b/packages/kokkos/cmake/deps/QTHREADS.cmake
@@ -0,0 +1,69 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+
+#-----------------------------------------------------------------------------
+#  Qthreads lightweight, locality-aware threading library.
+#
+#  Acquisition information:
+#    Date checked:  July 2014
+#    Checked by:    H. Carter Edwards <hcedwar AT sandia.gov>
+#    Source:        https://code.google.com/p/qthreads
+#
+
+TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREADS
+  REQUIRED_HEADERS qthread.h
+  REQUIRED_LIBS_NAMES "qthread"
+  )
diff --git a/packages/kokkos/cmake/kokkos_build.cmake b/packages/kokkos/cmake/kokkos_build.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..76d0655adb76753d09df3bbfd88ff6d98d7da013
--- /dev/null
+++ b/packages/kokkos/cmake/kokkos_build.cmake
@@ -0,0 +1,229 @@
+############################ Detect if submodule ###############################
+#
+# With thanks to StackOverflow:  
+#      http://stackoverflow.com/questions/25199677/how-to-detect-if-current-scope-has-a-parent-in-cmake
+#
+get_directory_property(HAS_PARENT PARENT_DIRECTORY)
+if(HAS_PARENT)
+  message(STATUS "Submodule build")
+  SET(KOKKOS_HEADER_DIR "include/kokkos")
+else()
+  message(STATUS "Standalone build")
+  SET(KOKKOS_HEADER_DIR "include")
+endif()
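+# Illustrative sketch (not executed here): a parent project typically triggers
+# the "submodule" path above with something like
+#
+#   add_subdirectory(packages/kokkos)
+#   include_directories(${Kokkos_INCLUDE_DIRS_RET})
+#
+# using the Kokkos_INCLUDE_DIRS_RET variable exported further down, while a
+# standalone build configures this directory directly with cmake.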
+
+################################ Handle the actual build #######################
+
+SET(INSTALL_LIB_DIR lib CACHE PATH "Installation directory for libraries")
+SET(INSTALL_BIN_DIR bin CACHE PATH "Installation directory for executables")
+SET(INSTALL_INCLUDE_DIR ${KOKKOS_HEADER_DIR} CACHE PATH
+  "Installation directory for header files")
+IF(WIN32 AND NOT CYGWIN)
+  SET(DEF_INSTALL_CMAKE_DIR CMake)
+ELSE()
+  SET(DEF_INSTALL_CMAKE_DIR lib/CMake/Kokkos)
+ENDIF()
+
+SET(INSTALL_CMAKE_DIR ${DEF_INSTALL_CMAKE_DIR} CACHE PATH
+    "Installation directory for CMake files")
+
+# Make relative paths absolute (needed later on)
+FOREACH(p LIB BIN INCLUDE CMAKE)
+  SET(var INSTALL_${p}_DIR)
+  IF(NOT IS_ABSOLUTE "${${var}}")
+    SET(${var} "${CMAKE_INSTALL_PREFIX}/${${var}}")
+  ENDIF()
+ENDFOREACH()
+
+# set up include-directories
+SET (Kokkos_INCLUDE_DIRS
+    ${Kokkos_SOURCE_DIR}/core/src
+    ${Kokkos_SOURCE_DIR}/containers/src
+    ${Kokkos_SOURCE_DIR}/algorithms/src
+    ${Kokkos_BINARY_DIR}  # to find KokkosCore_config.h
+    ${KOKKOS_INCLUDE_DIRS}
+)
+
+# pass include dirs back to parent scope
+if(HAS_PARENT)
+  SET(Kokkos_INCLUDE_DIRS_RET ${Kokkos_INCLUDE_DIRS} PARENT_SCOPE)
+else()
+  SET(Kokkos_INCLUDE_DIRS_RET ${Kokkos_INCLUDE_DIRS})
+endif()
+
+INCLUDE_DIRECTORIES(${Kokkos_INCLUDE_DIRS})
+
+IF(KOKKOS_SEPARATE_LIBS)
+  # Sources come from makefile-generated kokkos_generated_settings.cmake file
+  # Separate libs need to separate the sources
+  set_kokkos_srcs(KOKKOS_SRC ${KOKKOS_SRC})
+
+  # kokkoscore
+  ADD_LIBRARY(
+    kokkoscore
+    ${KOKKOS_CORE_SRCS}
+  )
+
+  target_compile_options(
+    kokkoscore
+    PUBLIC $<$<COMPILE_LANGUAGE:CXX>:${KOKKOS_CXX_FLAGS}>
+  )
+
+  target_include_directories(
+    kokkoscore
+    PUBLIC
+    ${KOKKOS_TPL_INCLUDE_DIRS}
+  )
+
+  foreach(lib IN LISTS KOKKOS_TPL_LIBRARY_NAMES)
+    find_library(LIB_${lib} ${lib} PATHS ${KOKKOS_TPL_LIBRARY_DIRS})
+    target_link_libraries(kokkoscore PUBLIC ${LIB_${lib}})
+  endforeach()
+
+  target_link_libraries(kokkoscore PUBLIC "${KOKKOS_LINK_FLAGS}")
+
+  # Install the kokkoscore library
+  INSTALL (TARGETS kokkoscore
+           EXPORT KokkosTargets
+           ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
+  )
+
+  # kokkoscontainers
+  if (DEFINED KOKKOS_CONTAINERS_SRCS)
+    ADD_LIBRARY(
+      kokkoscontainers
+      ${KOKKOS_CONTAINERS_SRCS}
+    )
+  endif()
+
+  TARGET_LINK_LIBRARIES(
+    kokkoscontainers
+    kokkoscore
+  )
+
+  # Install the kokkoscontainers library
+  INSTALL (TARGETS kokkoscontainers
+           EXPORT KokkosTargets
+           ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin)
+
+  # kokkosalgorithms - Build as interface library since no source files.
+  ADD_LIBRARY(
+    kokkosalgorithms
+    INTERFACE
+  )
+
+  target_include_directories(
+    kokkosalgorithms
+    INTERFACE ${Kokkos_SOURCE_DIR}/algorithms/src
+  )
+
+  TARGET_LINK_LIBRARIES(
+    kokkosalgorithms
+    INTERFACE kokkoscore
+  )
+
+  # Install the kokkosalgorithms library
+  INSTALL (TARGETS kokkosalgorithms
+           ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin)
+
+  SET (Kokkos_LIBRARIES_NAMES kokkoscore kokkoscontainers kokkosalgorithms)
+
+ELSE()
+  # kokkos
+  ADD_LIBRARY(
+    kokkos
+    ${KOKKOS_CORE_SRCS}
+    ${KOKKOS_CONTAINERS_SRCS}
+  )
+
+  target_compile_options(
+    kokkos
+    PUBLIC $<$<COMPILE_LANGUAGE:CXX>:${KOKKOS_CXX_FLAGS}>
+  )
+
+  target_include_directories(
+    kokkos
+    PUBLIC
+    ${KOKKOS_TPL_INCLUDE_DIRS}
+  )
+
+  foreach(lib IN LISTS KOKKOS_TPL_LIBRARY_NAMES)
+    find_library(LIB_${lib} ${lib} PATHS ${KOKKOS_TPL_LIBRARY_DIRS})
+    target_link_libraries(kokkos PUBLIC ${LIB_${lib}})
+  endforeach()
+
+  target_link_libraries(kokkos PUBLIC "${KOKKOS_LINK_FLAGS}")
+
+  # Install the kokkos library
+  INSTALL (TARGETS kokkos
+           EXPORT KokkosTargets
+           ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin)
+
+
+  SET (Kokkos_LIBRARIES_NAMES kokkos)
+
+endif()  # KOKKOS_SEPARATE_LIBS
+
+# Install the kokkos headers
+INSTALL (DIRECTORY
+         ${Kokkos_SOURCE_DIR}/core/src/
+         DESTINATION ${KOKKOS_HEADER_DIR}
+         FILES_MATCHING PATTERN "*.hpp"
+)
+INSTALL (DIRECTORY
+         ${Kokkos_SOURCE_DIR}/containers/src/
+         DESTINATION ${KOKKOS_HEADER_DIR}
+         FILES_MATCHING PATTERN "*.hpp"
+)
+INSTALL (DIRECTORY
+         ${Kokkos_SOURCE_DIR}/algorithms/src/
+         DESTINATION ${KOKKOS_HEADER_DIR}
+         FILES_MATCHING PATTERN "*.hpp"
+)
+
+INSTALL (FILES
+         ${Kokkos_BINARY_DIR}/KokkosCore_config.h
+         DESTINATION ${KOKKOS_HEADER_DIR}
+)
+
+# Add all targets to the build-tree export set
+export(TARGETS ${Kokkos_LIBRARIES_NAMES}
+  FILE "${Kokkos_BINARY_DIR}/KokkosTargets.cmake")
+
+# Export the package for use from the build-tree
+# (this registers the build-tree with a global CMake-registry)
+export(PACKAGE Kokkos)
+
+# Create the KokkosConfig.cmake files for the build and install trees
+file(RELATIVE_PATH REL_INCLUDE_DIR "${INSTALL_CMAKE_DIR}"
+   "${INSTALL_INCLUDE_DIR}")
+# ... for the build tree
+set(CONF_INCLUDE_DIRS "${Kokkos_SOURCE_DIR}" "${Kokkos_BINARY_DIR}")
+configure_file(${Kokkos_SOURCE_DIR}/cmake/KokkosConfig.cmake.in
+  "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" @ONLY)
+# ... for the install tree
+set(CONF_INCLUDE_DIRS "\${Kokkos_CMAKE_DIR}/${REL_INCLUDE_DIR}")
+configure_file(${Kokkos_SOURCE_DIR}/cmake/KokkosConfig.cmake.in
+  "${Kokkos_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/KokkosConfig.cmake" @ONLY)
+
+# Install the KokkosConfig.cmake file
+install(FILES
+  "${Kokkos_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/KokkosConfig.cmake"
+  DESTINATION "${INSTALL_CMAKE_DIR}")
+
+#This seems not to do anything?
+#message(STATUS "KokkosTargets: " ${KokkosTargets})
+# Install the export set for use with the install-tree
+INSTALL(EXPORT KokkosTargets DESTINATION
+       "${INSTALL_CMAKE_DIR}")
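+
+# Illustrative sketch (hypothetical downstream usage): after installation a
+# consumer would typically locate this export set via
+#
+#   find_package(Kokkos REQUIRED)
+#   target_link_libraries(myapp ${Kokkos_LIBRARIES_NAMES})
+#
+# where the exact variables made available depend on KokkosConfig.cmake.in
+# (not shown here), so treat the names above as placeholders.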
diff --git a/packages/kokkos/cmake/kokkos_functions.cmake b/packages/kokkos/cmake/kokkos_functions.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..c0c62ccb6a6ce7989a5acfa122bcfbf4d5a34028
--- /dev/null
+++ b/packages/kokkos/cmake/kokkos_functions.cmake
@@ -0,0 +1,345 @@
+################################### FUNCTIONS ##################################
+# List of functions
+#   set_kokkos_cxx_compiler
+#   set_kokkos_cxx_standard
+#   set_kokkos_srcs
+
+#-------------------------------------------------------------------------------
+# function(set_kokkos_cxx_compiler)
+# Sets the following compiler variables that are analogous to the CMAKE_*
+# versions.  We add the ability to detect NVCC (really nvcc_wrapper).
+#   KOKKOS_CXX_COMPILER
+#   KOKKOS_CXX_COMPILER_ID
+#   KOKKOS_CXX_COMPILER_VERSION
+#
+# Inputs:
+#   KOKKOS_ENABLE_CUDA
+#   CMAKE_CXX_COMPILER
+#   CMAKE_CXX_COMPILER_ID
+#   CMAKE_CXX_COMPILER_VERSION
+#
+# Also verifies the compiler version meets the minimum required by Kokkos.
+function(set_kokkos_cxx_compiler)
+  # Since CMake doesn't recognize the nvcc compiler until 3.8, we use our own
+  # version of the CMake variables and detect nvcc ourselves.  Initially set to
+  # the CMake variable values.
+  set(INTERNAL_CXX_COMPILER ${CMAKE_CXX_COMPILER})
+  set(INTERNAL_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID})
+  set(INTERNAL_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION})
+
+  # Check if the compiler is nvcc (which really means nvcc_wrapper).
+  execute_process(COMMAND ${INTERNAL_CXX_COMPILER} --version
+                  COMMAND grep nvcc
+                  COMMAND wc -l
+                  OUTPUT_VARIABLE INTERNAL_HAVE_COMPILER_NVCC
+                  OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+  string(REGEX REPLACE "^ +" ""
+         INTERNAL_HAVE_COMPILER_NVCC ${INTERNAL_HAVE_COMPILER_NVCC})
+
+  if(INTERNAL_HAVE_COMPILER_NVCC)
+    # Set the compiler id to nvcc.  We use the value used by CMake 3.8.
+    set(INTERNAL_CXX_COMPILER_ID NVIDIA)
+
+    # Set nvcc's compiler version.
+    execute_process(COMMAND ${INTERNAL_CXX_COMPILER} --version
+                    COMMAND grep release
+                    OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+    string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+$"
+           INTERNAL_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION})
+  endif()
+
+  # Enforce the minimum compilers supported by Kokkos.
+  set(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos.  Required compiler versions:")
+  set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    Clang      3.5.2 or higher")
+  set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    GCC        4.8.4 or higher")
+  set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    Intel     15.0.2 or higher")
+  set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    NVCC      7.0.28 or higher")
+  set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    PGI         17.1 or higher\n")
+
+  if(INTERNAL_CXX_COMPILER_ID STREQUAL Clang)
+    if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 3.5.2)
+      message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
+    endif()
+  elseif(INTERNAL_CXX_COMPILER_ID STREQUAL GNU)
+    if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 4.8.4)
+      message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
+    endif()
+  elseif(INTERNAL_CXX_COMPILER_ID STREQUAL Intel)
+    if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 15.0.2)
+      message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
+    endif()
+  elseif(INTERNAL_CXX_COMPILER_ID STREQUAL NVIDIA)
+    if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 7.0.28)
+      message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
+    endif()
+  elseif(INTERNAL_CXX_COMPILER_ID STREQUAL PGI)
+    if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 17.1)
+      message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
+    endif()
+  endif()
+
+  # Enforce that extensions are turned off for nvcc_wrapper.
+  if(INTERNAL_CXX_COMPILER_ID STREQUAL NVIDIA)
+    if(DEFINED CMAKE_CXX_EXTENSIONS AND CMAKE_CXX_EXTENSIONS STREQUAL ON)
+      message(FATAL_ERROR "NVCC doesn't support C++ extensions.  Set CMAKE_CXX_EXTENSIONS to OFF in your CMakeLists.txt.")
+    endif()
+  endif()
+
+  if(KOKKOS_ENABLE_CUDA)
+    # Enforce that the compiler can compile CUDA code.
+    if(INTERNAL_CXX_COMPILER_ID STREQUAL Clang)
+      if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 4.0.0)
+        message(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.")
+      endif()
+    elseif(NOT INTERNAL_CXX_COMPILER_ID STREQUAL NVIDIA)
+      message(FATAL_ERROR "Invalid compiler for CUDA.  The compiler must be nvcc_wrapper or Clang.")
+    endif()
+  endif()
+
+  set(KOKKOS_CXX_COMPILER ${INTERNAL_CXX_COMPILER} PARENT_SCOPE)
+  set(KOKKOS_CXX_COMPILER_ID ${INTERNAL_CXX_COMPILER_ID} PARENT_SCOPE)
+  set(KOKKOS_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION} PARENT_SCOPE)
+endfunction()
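+# Illustrative sketch: this is expected to be called once, early in the
+# configure, e.g.
+#
+#   set_kokkos_cxx_compiler()
+#   message(STATUS "Kokkos compiler: ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION}")
+#
+# For CUDA builds the compiler is usually passed on the command line as
+#   cmake -DCMAKE_CXX_COMPILER=${KOKKOS_PATH}/bin/nvcc_wrapper ...
+# (nvcc_wrapper is the wrapper script shipped in kokkos/bin).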
+
+#-------------------------------------------------------------------------------
+# function(set_kokkos_cxx_standard)
+#  Transitively enforces that the appropriate CXX standard compile flags (C++11
+#  or above) are added to targets that use the Kokkos library.  Compile features
+#  are used if possible.  Otherwise, the appropriate flags are added to
+#  KOKKOS_CXX_FLAGS.  Values set by the user to CMAKE_CXX_STANDARD and
+#  CMAKE_CXX_EXTENSIONS are honored.
+#
+# Outputs:
+#   KOKKOS_CXX11_FEATURES
+#   KOKKOS_CXX_FLAGS
+#
+# Inputs:
+#  KOKKOS_CXX_COMPILER
+#  KOKKOS_CXX_COMPILER_ID
+#  KOKKOS_CXX_COMPILER_VERSION
+#
+function(set_kokkos_cxx_standard)
+  # The following table lists the versions of CMake that supports CXX_STANDARD
+  # and the CXX compile features for different compilers.  The versions are
+  # based on CMake documentation, looking at CMake code, and verifying by
+  # testing with specific CMake versions.
+  #
+  #   COMPILER                      CXX_STANDARD     Compile Features
+  #   ---------------------------------------------------------------
+  #   Clang                             3.1                3.1
+  #   GNU                               3.1                3.2
+  #   AppleClang                        3.2                3.2
+  #   Intel                             3.6                3.6
+  #   Cray                              No                 No
+  #   PGI                               No                 No
+  #   XL                                No                 No
+  #
+  # For compiling CUDA code using nvcc_wrapper, we will use the host compiler's
+  # flags for turning on C++11.  Since for compiler ID and versioning purposes
+  # CMake recognizes the host compiler when calling nvcc_wrapper, this just
+  # works.  Both NVCC and nvcc_wrapper only recognize '-std=c++11' which means
+  # that we can only use host compilers for CUDA builds that use those flags.
+  # It also means that extensions (gnu++11) can't be turned on for CUDA builds.
+
+  # Check if we can use compile features.
+  if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
+    if(CMAKE_CXX_COMPILER_ID STREQUAL Clang)
+      if(NOT CMAKE_VERSION VERSION_LESS 3.1)
+        set(INTERNAL_USE_COMPILE_FEATURES ON)
+      endif()
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang OR CMAKE_CXX_COMPILER_ID STREQUAL GNU)
+      if(NOT CMAKE_VERSION VERSION_LESS 3.2)
+        set(INTERNAL_USE_COMPILE_FEATURES ON)
+      endif()
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel)
+      if(NOT CMAKE_VERSION VERSION_LESS 3.6)
+        set(INTERNAL_USE_COMPILE_FEATURES ON)
+      endif()
+    endif()
+  endif()
+
+  if(INTERNAL_USE_COMPILE_FEATURES)
+    # Use the compile features aspect of CMake to transitively cause C++ flags
+    # to populate to user code.
+
+    # I'm using a hack by requiring features that I know force the lowest version
+    # of the compilers we want to support.  Clang 3.3 and later support all of
+    # the C++11 standard.  With CMake 3.8 and higher, we could switch to using
+    # cxx_std_11.
+    set(KOKKOS_CXX11_FEATURES
+        cxx_nonstatic_member_init # Forces GCC 4.7 or later and Intel 14.0 or later.
+        PARENT_SCOPE
+       )
+  else()
+    # CXX compile features are not yet implemented for this combination of
+    # compiler and version of CMake.
+
+    if(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang)
+      # Versions of CMAKE before 3.2 don't support CXX_STANDARD or C++ compile
+      # features for the AppleClang compiler.  Set compiler flags transitively
+      # here such that they trickle down to a call to target_compile_options().
+
+      # The following two blocks of code were copied from
+      # /Modules/Compiler/AppleClang-CXX.cmake from CMake 3.7.2 and then
+      # modified.
+      if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0)
+        set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "-std=c++11")
+        set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11")
+      endif()
+
+      if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.1)
+        set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "-std=c++14")
+        set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14")
+      elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1)
+        # AppleClang 5.0 knows this flag, but does not set a __cplusplus macro
+        # greater than 201103L.
+        set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y")
+        set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y")
+      endif()
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel)
+      # Versions of CMAKE before 3.6 don't support CXX_STANDARD or C++ compile
+      # features for the Intel compiler.  Set compiler flags transitively here
+      # such that they trickle down to a call to target_compile_options().
+
+      # The following three blocks of code were copied from
+      # /Modules/Compiler/Intel-CXX.cmake from CMake 3.7.2 and then modified.
+      if("x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC")
+        set(_std -Qstd)
+        set(_ext c++)
+      else()
+        set(_std -std)
+        set(_ext gnu++)
+      endif()
+
+      if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.2)
+        set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "${_std}=c++14")
+        # TODO: There is no gnu++14 value supported; figure out what to do.
+        set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "${_std}=c++14")
+      elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0)
+        set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "${_std}=c++1y")
+        # TODO: There is no gnu++14 value supported; figure out what to do.
+        set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "${_std}=c++1y")
+      endif()
+
+      if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13.0)
+        set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "${_std}=c++11")
+        set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "${_std}=${_ext}11")
+      elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.1)
+        set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "${_std}=c++0x")
+        set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "${_std}=${_ext}0x")
+      endif()
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL Cray)
+      # CMAKE doesn't support CXX_STANDARD or C++ compile features for the Cray
+      # compiler.  Set compiler options transitively here such that they trickle
+      # down to a call to target_compile_options().
+      set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "-hstd=c++11")
+      set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "-hstd=c++11")
+      set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "-hstd=c++11")
+      set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "-hstd=c++11")
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL PGI)
+      # CMAKE doesn't support CXX_STANDARD or C++ compile features for the PGI
+      # compiler.  Set compiler options transitively here such that they trickle
+      # down to a call to target_compile_options().
+      set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "--c++11")
+      set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "--c++11")
+      set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "--c++11")
+      set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "--c++11")
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL XL)
+      # CMAKE doesn't support CXX_STANDARD or C++ compile features for the XL
+      # compiler.  Set compiler options transitively here such that they trickle
+      # down to a call to target_compile_options().
+      set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "-std=c++11")
+      set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "-std=c++11")
+      set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "-std=c++11")
+      set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "-std=c++11")
+    else()
+      # Assume GNU.  CMAKE_CXX_STANDARD is handled correctly by CMake 3.1 and
+      # above for this compiler.  If the user explicitly requests a C++
+      # standard, CMake takes care of it.  If not, transitively require C++11.
+      if(NOT CMAKE_CXX_STANDARD)
+        set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION ${CMAKE_CXX11_STANDARD_COMPILE_OPTION})
+        set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION ${CMAKE_CXX11_EXTENSION_COMPILE_OPTION})
+      endif()
+    endif()
+
+    # Set the C++ standard info for Kokkos respecting user set values for
+    # CMAKE_CXX_STANDARD and CMAKE_CXX_EXTENSIONS.
+    # Only use cxx extension if explicitly requested
+    if(CMAKE_CXX_STANDARD EQUAL 14)
+      if(DEFINED CMAKE_CXX_EXTENSIONS AND CMAKE_CXX_EXTENSIONS STREQUAL ON)
+        set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX14_EXTENSION_COMPILE_OPTION})
+      else()
+        set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX14_STANDARD_COMPILE_OPTION})
+      endif()
+    elseif(CMAKE_CXX_STANDARD EQUAL 11)
+      if(DEFINED CMAKE_CXX_EXTENSIONS AND CMAKE_CXX_EXTENSIONS STREQUAL ON)
+        set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX11_EXTENSION_COMPILE_OPTION})
+      else()
+        set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX11_STANDARD_COMPILE_OPTION})
+      endif()
+    else()
+      # The user didn't explicitly request a standard, transitively require
+      # C++11 respecting CMAKE_CXX_EXTENSIONS.
+      if(DEFINED CMAKE_CXX_EXTENSIONS AND CMAKE_CXX_EXTENSIONS STREQUAL ON)
+        set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX11_EXTENSION_COMPILE_OPTION})
+      else()
+        set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX11_STANDARD_COMPILE_OPTION})
+      endif()
+    endif()
+
+    set(KOKKOS_CXX_FLAGS ${INTERNAL_CXX_FLAGS} PARENT_SCOPE)
+  endif()
+endfunction()
+
+
+#-------------------------------------------------------------------------------
+# function(set_kokkos_srcs)
+# Takes the list of Kokkos sources (e.g., KOKKOS_SRC, generated by
+# Makefile.kokkos and written into kokkos_generated_settings.cmake) and sorts
+# the files into the subpackages / separate libraries, i.e. core and
+# containers (algorithms is header-only).
+#
+# Inputs:
+#   KOKKOS_SRC
+# 
+# Outputs:
+#   KOKKOS_CORE_SRCS
+#   KOKKOS_CONTAINERS_SRCS
+#
+function(set_kokkos_srcs)
+  set(opts ) # no-value args
+  set(oneValArgs )
+  set(multValArgs KOKKOS_SRC) # e.g., lists
+  cmake_parse_arguments(IN "${opts}" "${oneValArgs}" "${multValArgs}" ${ARGN})
+
+  foreach(sfile ${IN_KOKKOS_SRC})
+     string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" stripfile "${sfile}")
+     string(REPLACE "/" ";" striplist "${stripfile}")
+     list(GET striplist 0 firstdir)
+     if(${firstdir} STREQUAL "core")
+       list(APPEND KOKKOS_CORE_SRCS ${sfile})
+     else()
+       list(APPEND KOKKOS_CONTAINERS_SRCS ${sfile})
+     endif()
+  endforeach()
+  set(KOKKOS_CORE_SRCS ${KOKKOS_CORE_SRCS} PARENT_SCOPE)
+  set(KOKKOS_CONTAINERS_SRCS ${KOKKOS_CONTAINERS_SRCS} PARENT_SCOPE)
+  return()
+endfunction()
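+# Illustrative usage (this mirrors the call in kokkos_build.cmake): given the
+# KOKKOS_SRC list from kokkos_generated_settings.cmake,
+#
+#   set_kokkos_srcs(KOKKOS_SRC ${KOKKOS_SRC})
+#
+# populates KOKKOS_CORE_SRCS and KOKKOS_CONTAINERS_SRCS in the caller's scope.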
+
+# Setting a default value if it is not already set
+macro(set_kokkos_default_default VARIABLE DEFAULT)
+  IF( "${KOKKOS_INTERNAL_ENABLE_${VARIABLE}_DEFAULT}" STREQUAL "" )
+    IF( "${KOKKOS_ENABLE_${VARIABLE}}" STREQUAL "" )
+      set(KOKKOS_INTERNAL_ENABLE_${VARIABLE}_DEFAULT ${DEFAULT})
+  #    MESSAGE(WARNING "Set: KOKKOS_INTERNAL_ENABLE_${VARIABLE}_DEFAULT to ${KOKKOS_INTERNAL_ENABLE_${VARIABLE}_DEFAULT}")
+    ELSE()
+      set(KOKKOS_INTERNAL_ENABLE_${VARIABLE}_DEFAULT ${KOKKOS_ENABLE_${VARIABLE}})
+   #   MESSAGE(WARNING "Set: KOKKOS_INTERNAL_ENABLE_${VARIABLE}_DEFAULT to ${KOKKOS_INTERNAL_ENABLE_${VARIABLE}_DEFAULT}")
+    ENDIF()
+  ENDIF()
+  UNSET(KOKKOS_ENABLE_${VARIABLE} CACHE)
+endmacro()
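+# Illustrative sketch: set_kokkos_default_default(OPENMP OFF) leaves
+# KOKKOS_INTERNAL_ENABLE_OPENMP_DEFAULT at OFF unless the user already set
+# KOKKOS_ENABLE_OPENMP, in which case that value becomes the default; the
+# KOKKOS_ENABLE_OPENMP cache entry is then cleared so kokkos_options.cmake can
+# re-declare it against this default.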
diff --git a/packages/kokkos/cmake/kokkos_options.cmake b/packages/kokkos/cmake/kokkos_options.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..25eb8e86ced2e549eefa1d1c0788ffc41654cbd8
--- /dev/null
+++ b/packages/kokkos/cmake/kokkos_options.cmake
@@ -0,0 +1,367 @@
+########################## NOTES ###############################################
+#  List the options for configuring Kokkos when using the CMake build.
+#  These options then get mapped onto the KOKKOS_SETTINGS environment variable
+#  by kokkos_settings.cmake.  This file is kept separate so that other packages
+#  (e.g., TriBITS) can override these variables.
+
+########################## AVAILABLE OPTIONS ###################################
+# Use lists for documentation, verification, and programming convenience
+
+# All CMake options of the type KOKKOS_ENABLE_*
+set(KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST)
+list(APPEND KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST
+     Serial
+     OpenMP
+     Pthread
+     Qthread
+     Cuda
+     ROCm
+     HWLOC
+     MEMKIND
+     LIBRT
+     Cuda_Lambda
+     Cuda_Relocatable_Device_Code
+     Cuda_UVM
+     Cuda_LDG_Intrinsic
+     Debug
+     Debug_DualView_Modify_Check
+     Debug_Bounds_Check
+     Compiler_Warnings
+     Profiling
+     Profiling_Load_Print
+     Aggressive_Vectorization
+     Deprecated_Code
+     )
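+
+# Illustrative sketch (hypothetical command line): each entry above corresponds
+# to a CamelCase Kokkos_ENABLE_<Option> and an upper-case KOKKOS_ENABLE_<OPTION>
+# cache variable, so a typical configure might look like
+#
+#   cmake -DKOKKOS_ENABLE_OPENMP=ON -DKOKKOS_ENABLE_PROFILING=OFF \
+#         -DKOKKOS_ARCH=SNB <path-to-kokkos>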
+
+#-------------------------------------------------------------------------------
+#------------------------------- Recognize CamelCase Options ---------------------------
+#-------------------------------------------------------------------------------
+
+foreach(opt ${KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST})
+  string(TOUPPER ${opt} OPT )
+  IF(DEFINED Kokkos_ENABLE_${opt})
+    IF(DEFINED KOKKOS_ENABLE_${OPT})
+      IF(NOT ("${KOKKOS_ENABLE_${OPT}}" STREQUAL "${Kokkos_ENABLE_${opt}}"))
+        IF(DEFINED KOKKOS_ENABLE_${OPT}_INTERNAL)
+          MESSAGE(WARNING  "Defined both Kokkos_ENABLE_${opt}=[${Kokkos_ENABLE_${opt}}] and KOKKOS_ENABLE_${OPT}=[${KOKKOS_ENABLE_${OPT}}] and they differ! Could be caused by old CMakeCache Variable. Run CMake again and warning should disappear. If not you are truly setting both variables.")
+          IF(NOT ("${Kokkos_ENABLE_${opt}}" STREQUAL "${KOKKOS_ENABLE_${OPT}_INTERNAL}"))
+            UNSET(KOKKOS_ENABLE_${OPT} CACHE)
+            SET(KOKKOS_ENABLE_${OPT} ${Kokkos_ENABLE_${opt}})
+            MESSAGE(WARNING "SET BOTH VARIABLES KOKKOS_ENABLE_${OPT}: ${KOKKOS_ENABLE_${OPT}}")
+          ELSE()
+            SET(Kokkos_ENABLE_${opt} ${KOKKOS_ENABLE_${OPT}})
+          ENDIF()
+        ELSE()
+          MESSAGE(FATAL_ERROR "Defined both Kokkos_ENABLE_${opt}=[${Kokkos_ENABLE_${opt}}] and KOKKOS_ENABLE_${OPT}=[${KOKKOS_ENABLE_${OPT}}] and they differ!")
+        ENDIF()
+      ENDIF()
+    ELSE()
+      SET(KOKKOS_INTERNAL_ENABLE_${OPT}_DEFAULT ${Kokkos_ENABLE_${opt}})
+    ENDIF()
+  ENDIF()
+endforeach()
+
+IF(DEFINED Kokkos_Arch)
+  IF(DEFINED KOKKOS_ARCH)
+    IF(NOT (${KOKKOS_ARCH} STREQUAL "${Kokkos_Arch}"))
+      MESSAGE(FATAL_ERROR "Defined both Kokkos_Arch and KOKKOS_ARCH and they differ!")
+    ENDIF()
+  ELSE()
+    SET(KOKKOS_ARCH ${Kokkos_Arch})
+  ENDIF()
+ENDIF()
+  
+#-------------------------------------------------------------------------------
+# List of possible host architectures.
+#-------------------------------------------------------------------------------
+set(KOKKOS_ARCH_LIST)
+list(APPEND KOKKOS_ARCH_LIST
+     None            # No architecture optimization
+     AMDAVX          # (HOST) AMD chip
+     ARMv80          # (HOST) ARMv8.0 Compatible CPU
+     ARMv81          # (HOST) ARMv8.1 Compatible CPU
+     ARMv8-ThunderX  # (HOST) ARMv8 Cavium ThunderX CPU
+     WSM             # (HOST) Intel Westmere CPU
+     SNB             # (HOST) Intel Sandy/Ivy Bridge CPUs
+     HSW             # (HOST) Intel Haswell CPUs
+     BDW             # (HOST) Intel Broadwell Xeon E-class CPUs
+     SKX             # (HOST) Intel Sky Lake Xeon E-class HPC CPUs (AVX512)
+     KNC             # (HOST) Intel Knights Corner Xeon Phi
+     KNL             # (HOST) Intel Knights Landing Xeon Phi
+     BGQ             # (HOST) IBM Blue Gene Q
+     Power7          # (HOST) IBM POWER7 CPUs
+     Power8          # (HOST) IBM POWER8 CPUs
+     Power9          # (HOST) IBM POWER9 CPUs
+     Kepler          # (GPU) NVIDIA Kepler default (generation CC 3.5)
+     Kepler30        # (GPU) NVIDIA Kepler generation CC 3.0
+     Kepler32        # (GPU) NVIDIA Kepler generation CC 3.2
+     Kepler35        # (GPU) NVIDIA Kepler generation CC 3.5
+     Kepler37        # (GPU) NVIDIA Kepler generation CC 3.7
+     Maxwell         # (GPU) NVIDIA Maxwell default (generation CC 5.0)
+     Maxwell50       # (GPU) NVIDIA Maxwell generation CC 5.0
+     Maxwell52       # (GPU) NVIDIA Maxwell generation CC 5.2
+     Maxwell53       # (GPU) NVIDIA Maxwell generation CC 5.3
+     Pascal60        # (GPU) NVIDIA Pascal generation CC 6.0
+     Pascal61        # (GPU) NVIDIA Pascal generation CC 6.1
+    )
+
+# List of possible device architectures.
+# The case and spelling here need to match Makefile.kokkos
+set(KOKKOS_DEVICES_LIST)
+# Options: Cuda,ROCm,OpenMP,Pthread,Qthreads,Serial
+list(APPEND KOKKOS_DEVICES_LIST
+    Cuda          # NVIDIA GPU -- see below
+    OpenMP        # OpenMP
+    Pthread       # pthread
+    Qthreads      # qthreads
+    Serial        # serial
+    ROCm          # AMD GPU (ROCm)
+    )
+
+# List of possible TPLs for Kokkos
+# From Makefile.kokkos: Options: hwloc,librt,experimental_memkind
+set(KOKKOS_USE_TPLS_LIST)
+list(APPEND KOKKOS_USE_TPLS_LIST
+    HWLOC          # hwloc
+    LIBRT          # librt
+    MEMKIND        # experimental_memkind
+    )
+# Map of cmake variables to Makefile variables
+set(KOKKOS_INTERNAL_HWLOC hwloc)
+set(KOKKOS_INTERNAL_LIBRT librt)
+set(KOKKOS_INTERNAL_MEMKIND experimental_memkind)
+
+# List of possible Advanced options
+set(KOKKOS_OPTIONS_LIST)
+list(APPEND KOKKOS_OPTIONS_LIST
+       AGGRESSIVE_VECTORIZATION    
+       DISABLE_PROFILING          
+       DISABLE_DUALVIEW_MODIFY_CHECK
+       ENABLE_PROFILE_LOAD_PRINT   
+    )
+# Map of cmake variables to Makefile variables
+set(KOKKOS_INTERNAL_LDG_INTRINSIC use_ldg)
+set(KOKKOS_INTERNAL_UVM force_uvm)
+set(KOKKOS_INTERNAL_RELOCATABLE_DEVICE_CODE rdc)
+
+
+#-------------------------------------------------------------------------------
+# List of possible Options for CUDA
+#-------------------------------------------------------------------------------
+# From Makefile.kokkos: Options: use_ldg,force_uvm,rdc
+set(KOKKOS_CUDA_OPTIONS_LIST)
+list(APPEND KOKKOS_CUDA_OPTIONS_LIST
+    LDG_INTRINSIC              # use_ldg
+    UVM                        # force_uvm
+    RELOCATABLE_DEVICE_CODE    # rdc
+    LAMBDA                     # enable_lambda
+    )
+    
+# Map of cmake variables to Makefile variables
+set(KOKKOS_INTERNAL_LDG_INTRINSIC use_ldg)
+set(KOKKOS_INTERNAL_UVM force_uvm)
+set(KOKKOS_INTERNAL_RELOCATABLE_DEVICE_CODE rdc)
+set(KOKKOS_INTERNAL_LAMBDA enable_lambda)
+
+
+#-------------------------------------------------------------------------------
+#------------------------------- Create doc strings ----------------------------
+#-------------------------------------------------------------------------------
+
+set(tmpr "\n       ")
+string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_ARCH_DOCSTR "${KOKKOS_ARCH_LIST}")
+# This would be useful, but we use Foo_ENABLE mechanisms
+#string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_DEVICES_DOCSTR "${KOKKOS_DEVICES_LIST}")
+#string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_USE_TPLS_DOCSTR "${KOKKOS_USE_TPLS_LIST}")
+#string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_CUDA_OPTIONS_DOCSTR "${KOKKOS_CUDA_OPTIONS_LIST}")
+
+#-------------------------------------------------------------------------------
+#------------------------------- GENERAL OPTIONS -------------------------------
+#-------------------------------------------------------------------------------
+
+# Setting this variable to a value other than "None" can improve host
+# performance by turning on architecture specific code.
+# NOT_SET is used to determine if the option is passed in.  It is reset to
+# default "None" down below.
+set(KOKKOS_ARCH "NOT_SET" CACHE STRING 
+      "Optimize for specific host architecture. Options are: ${KOKKOS_INTERNAL_ARCH_DOCSTR}")
+
+# Whether to build separate libraries or not
+set(KOKKOS_SEPARATE_LIBS OFF CACHE BOOL "OFF = kokkos.  ON = kokkoscore, kokkoscontainers, and kokkosalgorithms.")
+
+# Qthreads options.
+set(KOKKOS_QTHREADS_DIR "" CACHE PATH "Location of Qthreads library.")
+
+
+#-------------------------------------------------------------------------------
+#------------------------------- KOKKOS_DEVICES --------------------------------
+#-------------------------------------------------------------------------------
+# Figure out default settings
+IF(Trilinos_ENABLE_Kokkos)             
+  set_kokkos_default_default(SERIAL ON)
+  set_kokkos_default_default(PTHREAD OFF)
+  IF(TPL_ENABLE_QTHREAD)
+    set_kokkos_default_default(QTHREADS ${TPL_ENABLE_QTHREAD})
+  ELSE()
+    set_kokkos_default_default(QTHREADS OFF)
+  ENDIF()
+  IF(Trilinos_ENABLE_OpenMP)
+    set_kokkos_default_default(OPENMP ${Trilinos_ENABLE_OpenMP})
+  ELSE()
+    set_kokkos_default_default(OPENMP OFF)
+  ENDIF()
+  IF(TPL_ENABLE_CUDA)
+    set_kokkos_default_default(CUDA ${TPL_ENABLE_CUDA})
+  ELSE()
+    set_kokkos_default_default(CUDA OFF)
+  ENDIF()
+  set_kokkos_default_default(ROCM OFF)
+ELSE()
+  set_kokkos_default_default(SERIAL ON)
+  set_kokkos_default_default(OPENMP OFF)
+  set_kokkos_default_default(PTHREAD OFF)
+  set_kokkos_default_default(QTHREADS OFF)
+  set_kokkos_default_default(CUDA OFF)
+  set_kokkos_default_default(ROCM OFF)
+ENDIF()
+
+# Set which Kokkos backend to use.
+# These are the actual options that define the settings.
+set(KOKKOS_ENABLE_SERIAL ${KOKKOS_INTERNAL_ENABLE_SERIAL_DEFAULT} CACHE BOOL "Whether to enable the Kokkos::Serial device.  This device executes \"parallel\" kernels sequentially on a single CPU thread.  It is enabled by default.  If you disable this device, please enable at least one other CPU device, such as Kokkos::OpenMP or Kokkos::Threads.")
+set(KOKKOS_ENABLE_OPENMP ${KOKKOS_INTERNAL_ENABLE_OPENMP_DEFAULT} CACHE BOOL "Enable OpenMP support in Kokkos." FORCE)
+set(KOKKOS_ENABLE_PTHREAD ${KOKKOS_INTERNAL_ENABLE_PTHREAD_DEFAULT} CACHE BOOL "Enable Pthread support in Kokkos.")
+set(KOKKOS_ENABLE_QTHREADS ${KOKKOS_INTERNAL_ENABLE_QTHREADS_DEFAULT} CACHE BOOL "Enable Qthreads support in Kokkos.")
+set(KOKKOS_ENABLE_CUDA ${KOKKOS_INTERNAL_ENABLE_CUDA_DEFAULT} CACHE BOOL "Enable CUDA support in Kokkos.")
+set(KOKKOS_ENABLE_ROCM ${KOKKOS_INTERNAL_ENABLE_ROCM_DEFAULT} CACHE BOOL "Enable ROCm support in Kokkos.")
+
+
+
+#-------------------------------------------------------------------------------
+#------------------------------- KOKKOS DEBUG and PROFILING --------------------
+#-------------------------------------------------------------------------------
+
+# Debug related options enable compiler warnings
+
+set_kokkos_default_default(DEBUG OFF)
+set(KOKKOS_ENABLE_DEBUG ${KOKKOS_INTERNAL_ENABLE_DEBUG_DEFAULT} CACHE BOOL "Enable Kokkos Debug.")
+
+# From Makefile.kokkos: Advanced Options: 
+#compiler_warnings, aggressive_vectorization, disable_profiling, disable_dualview_modify_check, enable_profile_load_print
+set_kokkos_default_default(COMPILER_WARNINGS OFF)
+set(KOKKOS_ENABLE_COMPILER_WARNINGS ${KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS_DEFAULT} CACHE BOOL "Enable compiler warnings.")
+
+set_kokkos_default_default(DEBUG_DUALVIEW_MODIFY_CHECK OFF)
+set(KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK ${KOKKOS_INTERNAL_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK_DEFAULT} CACHE BOOL "Enable dualview modify check.")
+
+# Enable aggressive vectorization.
+set_kokkos_default_default(AGGRESSIVE_VECTORIZATION OFF)
+set(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ${KOKKOS_INTERNAL_ENABLE_AGGRESSIVE_VECTORIZATION_DEFAULT} CACHE BOOL "Enable aggressive vectorization.")
+
+# Enable profiling.
+set_kokkos_default_default(PROFILING ON)
+set(KOKKOS_ENABLE_PROFILING ${KOKKOS_INTERNAL_ENABLE_PROFILING_DEFAULT} CACHE BOOL "Enable profiling.")
+
+set_kokkos_default_default(PROFILING_LOAD_PRINT OFF)
+set(KOKKOS_ENABLE_PROFILING_LOAD_PRINT ${KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT_DEFAULT} CACHE BOOL "Enable profile load print.")
+
+set_kokkos_default_default(DEPRECATED_CODE ON)
+set(KOKKOS_ENABLE_DEPRECATED_CODE ${KOKKOS_INTERNAL_ENABLE_DEPRECATED_CODE_DEFAULT} CACHE BOOL "Enable deprecated code.")
+
+
+#-------------------------------------------------------------------------------
+#------------------------------- KOKKOS_USE_TPLS -------------------------------
+#-------------------------------------------------------------------------------
+# Enable hwloc library.
+# Figure out default:
+IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HWLOC)
+  set_kokkos_default_default(HWLOC ON)
+ELSE()
+  set_kokkos_default_default(HWLOC OFF)
+ENDIF()
+set(KOKKOS_ENABLE_HWLOC ${KOKKOS_INTERNAL_ENABLE_HWLOC_DEFAULT} CACHE BOOL "Enable hwloc for better process placement.")
+set(KOKKOS_HWLOC_DIR "" CACHE PATH "Location of hwloc library. (kokkos tpl)")
+
+# Enable memkind library.
+set_kokkos_default_default(MEMKIND OFF)
+set(KOKKOS_ENABLE_MEMKIND ${KOKKOS_INTERNAL_ENABLE_MEMKIND_DEFAULT} CACHE BOOL "Enable memkind. (kokkos tpl)")
+set(KOKKOS_MEMKIND_DIR "" CACHE PATH "Location of memkind library. (kokkos tpl)")
+
+# Enable rt library.
+IF(Trilinos_ENABLE_Kokkos)
+  IF(DEFINED TPL_ENABLE_LIBRT)
+    set_kokkos_default_default(LIBRT ${TPL_ENABLE_LIBRT})
+  ELSE()
+    set_kokkos_default_default(LIBRT OFF)
+  ENDIF()
+ELSE()
+  set_kokkos_default_default(LIBRT ON)
+ENDIF()
+set(KOKKOS_ENABLE_LIBRT ${KOKKOS_INTERNAL_ENABLE_LIBRT_DEFAULT} CACHE BOOL "Enable librt for more precise timer.  (kokkos tpl)")
+
+
+#-------------------------------------------------------------------------------
+#------------------------------- KOKKOS_CUDA_OPTIONS ---------------------------
+#-------------------------------------------------------------------------------
+
+# CUDA options.
+# Set Defaults
+set_kokkos_default_default(CUDA_LDG_INTRINSIC OFF)
+set_kokkos_default_default(CUDA_UVM OFF)
+set_kokkos_default_default(CUDA_RELOCATABLE_DEVICE_CODE OFF)
+IF(Trilinos_ENABLE_Kokkos)
+  IF(KOKKOS_ENABLE_CUDA)
+    find_package(CUDA)
+  ENDIF()
+  IF (DEFINED CUDA_VERSION)
+    IF (CUDA_VERSION VERSION_GREATER "7.0")
+      set_kokkos_default_default(CUDA_LAMBDA ON)
+    ELSE()
+      set_kokkos_default_default(CUDA_LAMBDA OFF)
+    ENDIF()
+  ENDIF()
+ELSE()
+  set_kokkos_default_default(CUDA_LAMBDA OFF)
+ENDIF()
+
+# Set actual options
+set(KOKKOS_CUDA_DIR "" CACHE PATH "Location of CUDA library.  Defaults to where nvcc is installed.")
+set(KOKKOS_ENABLE_CUDA_LDG_INTRINSIC ${KOKKOS_INTERNAL_ENABLE_CUDA_LDG_INTRINSIC_DEFAULT} CACHE BOOL "Enable CUDA LDG. (cuda option)") 
+set(KOKKOS_ENABLE_CUDA_UVM ${KOKKOS_INTERNAL_ENABLE_CUDA_UVM_DEFAULT} CACHE BOOL "Enable CUDA unified virtual memory.")
+set(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ${KOKKOS_INTERNAL_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE_DEFAULT} CACHE BOOL "Enable relocatable device code for CUDA. (cuda option)")
+set(KOKKOS_ENABLE_CUDA_LAMBDA ${KOKKOS_INTERNAL_ENABLE_CUDA_LAMBDA_DEFAULT} CACHE BOOL "Enable lambdas for CUDA. (cuda option)")
+
+
+#-------------------------------------------------------------------------------
+#----------------------- HOST ARCH AND LEGACY TRIBITS --------------------------
+#-------------------------------------------------------------------------------
+
+# This defines the previous legacy TriBITS builds. 
+set(KOKKOS_LEGACY_TRIBITS False)
+IF ("${KOKKOS_ARCH}" STREQUAL "NOT_SET")
+  set(KOKKOS_ARCH "None")
+  IF(KOKKOS_HAS_TRILINOS)
+    set(KOKKOS_LEGACY_TRIBITS True)
+  ENDIF()
+ENDIF()
+IF (KOKKOS_HAS_TRILINOS)
+  IF (KOKKOS_LEGACY_TRIBITS)
+    message(STATUS "Using the legacy tribits build because KOKKOS_ARCH not set")
+  ELSE()
+    message(STATUS "NOT using the legacy tribits build because KOKKOS_ARCH *is* set")
+  ENDIF()
+ENDIF()
+
+#-------------------------------------------------------------------------------
+#----------------------- Set CamelCase Options if they are not yet set ---------
+#-------------------------------------------------------------------------------
+
+foreach(opt ${KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST})
+  string(TOUPPER ${opt} OPT )
+  UNSET(KOKKOS_ENABLE_${OPT}_INTERNAL CACHE)
+  SET(KOKKOS_ENABLE_${OPT}_INTERNAL ${KOKKOS_ENABLE_${OPT}} CACHE BOOL INTERNAL)
+  IF(DEFINED KOKKOS_ENABLE_${OPT})
+    UNSET(Kokkos_ENABLE_${opt} CACHE)
+    SET(Kokkos_ENABLE_${opt} ${KOKKOS_ENABLE_${OPT}} CACHE BOOL "CamelCase Compatibility setting for KOKKOS_ENABLE_${OPT}")
+  ENDIF()
+endforeach()
+
diff --git a/packages/kokkos/cmake/kokkos_settings.cmake b/packages/kokkos/cmake/kokkos_settings.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..579fab0c954c230d7690cda72a854d332d598be1
--- /dev/null
+++ b/packages/kokkos/cmake/kokkos_settings.cmake
@@ -0,0 +1,236 @@
+########################## NOTES ###############################################
+# This file's goal is to take the CMake options declared in kokkos_options.cmake
+# (but possibly set from elsewhere, see
+#   trilinos/cmake/ProjectCompilerPostConfig.cmake)
+# and, using CMake idioms, map them onto the KOKKOS_SETTINGS variable that gets
+# passed to the kokkos makefile configuration:
+#  make -f ${CMAKE_SOURCE_DIR}/core/src/Makefile ${KOKKOS_SETTINGS} build-makefile-cmake-kokkos
+# that generates KokkosCore_config.h and kokkos_generated_settings.cmake
+# To understand how to form KOKKOS_SETTINGS, see
+#     <KOKKOS_PATH>/Makefile.kokkos
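+#
+# Illustrative sketch (assumed values, not generated in this file): for a build
+# with OpenMP and hwloc enabled on a Sandy Bridge host, the pieces assembled
+# below would yield roughly
+#   KOKKOS_DEVICES=OpenMP KOKKOS_ARCH=SNB KOKKOS_USE_TPLS=hwloc
+# inside KOKKOS_SETTINGS, which is then handed to Makefile.kokkos.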
+
+#-------------------------------------------------------------------------------
+#------------------------------- GENERAL OPTIONS -------------------------------
+#-------------------------------------------------------------------------------
+
+# Ensure that KOKKOS_ARCH is in the ARCH_LIST
+if (KOKKOS_ARCH MATCHES ",")
+  message("-- Detected a comma in: KOKKOS_ARCH=${KOKKOS_ARCH}")
+  message("-- Although we prefer KOKKOS_ARCH to be semicolon-delimited, we do allow")
+  message("-- comma-delimited values for compatibility with scripts (see github.com/trilinos/Trilinos/issues/2330)")
+  string(REPLACE "," ";" KOKKOS_ARCH "${KOKKOS_ARCH}")
+  message("-- Commas were changed to semicolons, now KOKKOS_ARCH=${KOKKOS_ARCH}")
+endif()
+foreach(arch ${KOKKOS_ARCH})
+  list(FIND KOKKOS_ARCH_LIST ${arch} indx)
+  if (indx EQUAL -1)
+    message(FATAL_ERROR "${arch} is not an accepted value for KOKKOS_ARCH."
+      "  Please pick from these choices: ${KOKKOS_INTERNAL_ARCH_DOCSTR}")
+  endif ()
+endforeach()
+
+# KOKKOS_SETTINGS uses KOKKOS_ARCH
+string(REPLACE ";" "," KOKKOS_GMAKE_ARCH "${KOKKOS_ARCH}")
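+# For example (hypothetical value): KOKKOS_ARCH="SNB;Volta70" becomes
+# KOKKOS_GMAKE_ARCH="SNB,Volta70", the comma-delimited form the GNU makefile expects.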
+
+# From Makefile.kokkos: Options: yes,no
+if(${KOKKOS_ENABLE_DEBUG})
+  set(KOKKOS_GMAKE_DEBUG yes)
+else()
+  set(KOKKOS_GMAKE_DEBUG no)
+endif()
+
+#------------------------------- KOKKOS_DEVICES --------------------------------
+# Can have multiple devices 
+set(KOKKOS_DEVICESl)
+foreach(devopt ${KOKKOS_DEVICES_LIST})
+  string(TOUPPER ${devopt} devoptuc)
+  if (${KOKKOS_ENABLE_${devoptuc}}) 
+    list(APPEND KOKKOS_DEVICESl ${devopt})
+  endif ()
+endforeach()
+# List needs to be comma-delimited
+string(REPLACE ";" "," KOKKOS_GMAKE_DEVICES "${KOKKOS_DEVICESl}")
+
+#------------------------------- KOKKOS_OPTIONS --------------------------------
+# From Makefile.kokkos, supported options:
+#   compiler_warnings, aggressive_vectorization, disable_profiling,
+#   disable_deprecated_code, disable_dualview_modify_check, enable_profile_load_print
+
+set(KOKKOS_OPTIONSl)
+if(${KOKKOS_ENABLE_COMPILER_WARNINGS})
+      list(APPEND KOKKOS_OPTIONSl compiler_warnings)
+endif()
+if(${KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION})
+      list(APPEND KOKKOS_OPTIONSl aggressive_vectorization)
+endif()
+if(NOT ${KOKKOS_ENABLE_PROFILING})
+      list(APPEND KOKKOS_OPTIONSl disable_profiling)
+endif()
+if(NOT ${KOKKOS_ENABLE_DEPRECATED_CODE})
+      list(APPEND KOKKOS_OPTIONSl disable_deprecated_code)
+endif()
+if(NOT ${KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK})
+      list(APPEND KOKKOS_OPTIONSl disable_dualview_modify_check)
+endif()
+if(${KOKKOS_ENABLE_PROFILING_LOAD_PRINT})
+      list(APPEND KOKKOS_OPTIONSl enable_profile_load_print)
+endif()
+# List needs to be comma-delimited
+string(REPLACE ";" "," KOKKOS_GMAKE_OPTIONS "${KOKKOS_OPTIONSl}")
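+# For example, a hypothetical configuration with compiler warnings on and
+# profiling off would contribute compiler_warnings and disable_profiling to this
+# comma-delimited list; the other toggles above contribute analogously.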
+
+
+#------------------------------- KOKKOS_USE_TPLS -------------------------------
+# Construct the Makefile options
+set(KOKKOS_USE_TPLSl)
+foreach(tplopt ${KOKKOS_USE_TPLS_LIST})
+  if (${KOKKOS_ENABLE_${tplopt}}) 
+    list(APPEND KOKKOS_USE_TPLSl ${KOKKOS_INTERNAL_${tplopt}})
+  endif ()
+endforeach()
+# List needs to be comma-delimited
+string(REPLACE ";" "," KOKKOS_GMAKE_USE_TPLS "${KOKKOS_USE_TPLSl}")
+
+
+#------------------------------- KOKKOS_CUDA_OPTIONS ---------------------------
+# Construct the Makefile options
+set(KOKKOS_CUDA_OPTIONSl)
+foreach(cudaopt ${KOKKOS_CUDA_OPTIONS_LIST})
+  if (${KOKKOS_ENABLE_CUDA_${cudaopt}})
+    list(APPEND KOKKOS_CUDA_OPTIONSl ${KOKKOS_INTERNAL_${cudaopt}})
+  endif ()
+endforeach()
+# List needs to be comma-delimited
+string(REPLACE ";" "," KOKKOS_GMAKE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONSl}")
+
+#------------------------------- PATH VARIABLES --------------------------------
+#  We want the makefile to use the same executables specified here, which means
+#  modifying PATH so the $(shell ...) commands in the makefile see the right
+#  executables.  Also, the Makefiles use the FOO_PATH naming scheme for -I/-L construction.
+#TODO:  Makefile.kokkos allows this to be overwritten? ROCM_HCC_PATH
+
+set(KOKKOS_INTERNAL_PATHS)
+set(addpathl)
+foreach(kvar IN LISTS KOKKOS_USE_TPLS_LIST ITEMS CUDA QTHREADS)
+  if(${KOKKOS_ENABLE_${kvar}})
+    if(DEFINED KOKKOS_${kvar}_DIR)
+      set(KOKKOS_INTERNAL_PATHS ${KOKKOS_INTERNAL_PATHS} "${kvar}_PATH=${KOKKOS_${kvar}_DIR}")
+      if(IS_DIRECTORY ${KOKKOS_${kvar}_DIR}/bin)
+        list(APPEND addpathl ${KOKKOS_${kvar}_DIR}/bin)
+      endif()
+    endif()
+  endif()
+endforeach()
+# The PATH environment variable is colon-delimited
+string(REPLACE ";" ":" KOKKOS_INTERNAL_ADDTOPATH "${addpathl}")
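+# For example (hypothetical path): if CUDA is enabled and KOKKOS_CUDA_DIR=/opt/cuda,
+# this yields CUDA_PATH=/opt/cuda in KOKKOS_INTERNAL_PATHS and prepends
+# /opt/cuda/bin to the PATH handed to the makefile below.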
+
+
+######################### SET KOKKOS_SETTINGS ##################################
+# Set the KOKKOS_SETTINGS String -- this is the primary communication with the
+# makefile configuration.  See Makefile.kokkos
+
+set(KOKKOS_SETTINGS KOKKOS_SRC_PATH=${KOKKOS_SRC_PATH})
+set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} KOKKOS_PATH=${KOKKOS_PATH})
+set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} KOKKOS_INSTALL_PATH=${CMAKE_INSTALL_PREFIX})
+
+# Form of KOKKOS_foo=$KOKKOS_foo
+foreach(kvar ARCH;DEVICES;DEBUG;OPTIONS;CUDA_OPTIONS;USE_TPLS)
+  if(DEFINED KOKKOS_GMAKE_${kvar})
+    if (NOT "${KOKKOS_GMAKE_${kvar}}" STREQUAL "")
+      set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} KOKKOS_${kvar}=${KOKKOS_GMAKE_${kvar}})
+    endif()
+  endif()
+endforeach()
+
+# Form of VAR=VAL
+#TODO:  Makefile supports MPICH_CXX, OMPI_CXX as well
+foreach(ovar CXX;CXXFLAGS;LDFLAGS)
+  if(DEFINED ${ovar})
+    if (NOT "${${ovar}}" STREQUAL "")
+      set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} ${ovar}=${${ovar}})
+    endif()
+  endif()
+endforeach()
+
+# Finally, do the paths
+if (NOT "${KOKKOS_INTERNAL_PATHS}" STREQUAL "")
+  set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} ${KOKKOS_INTERNAL_PATHS})
+endif()
+if (NOT "${KOKKOS_INTERNAL_ADDTOPATH}" STREQUAL "")
+  set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} "PATH=\"${KOKKOS_INTERNAL_ADDTOPATH}:$ENV{PATH}\"")
+endif()
+
+# Final form that gets passed to make
+set(KOKKOS_SETTINGS env ${KOKKOS_SETTINGS})
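+# Illustrative example only (hypothetical values; actual contents depend on the
+# configuration above):
+#   env KOKKOS_SRC_PATH=... KOKKOS_PATH=... KOKKOS_INSTALL_PATH=...
+#       KOKKOS_ARCH=SNB,Volta70 KOKKOS_DEVICES=OpenMP,Cuda KOKKOS_DEBUG=no
+#       CXX=<path-to>/nvcc_wrapper PATH="/opt/cuda/bin:..."
+# This list is prefixed to the make invocation described in the NOTES above.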
+
+
+############################ PRINT CONFIGURE STATUS ############################
+
+if(KOKKOS_CMAKE_VERBOSE)
+  message(STATUS "")
+  message(STATUS "****************** Kokkos Settings ******************")
+  message(STATUS "Execution Spaces")
+
+  if(KOKKOS_ENABLE_CUDA)
+    message(STATUS "  Device Parallel: Cuda")
+  else()
+    message(STATUS "  Device Parallel: None")
+  endif()
+
+  if(KOKKOS_ENABLE_OPENMP)
+    message(STATUS "    Host Parallel: OpenMP")
+  elseif(KOKKOS_ENABLE_PTHREAD)
+    message(STATUS "    Host Parallel: Pthread")
+  elseif(KOKKOS_ENABLE_QTHREADS)
+    message(STATUS "    Host Parallel: Qthreads")
+  else()
+    message(STATUS "    Host Parallel: None")
+  endif()
+
+  if(KOKKOS_ENABLE_SERIAL)
+    message(STATUS "      Host Serial: Serial")
+  else()
+    message(STATUS "      Host Serial: None")
+  endif()
+
+  message(STATUS "")
+  message(STATUS "Architectures:")
+  message(STATUS "    ${KOKKOS_GMAKE_ARCH}")
+
+  message(STATUS "")
+  message(STATUS "Enabled options")
+
+  if(KOKKOS_SEPARATE_LIBS)
+    message(STATUS "  KOKKOS_SEPARATE_LIBS")
+  endif()
+
+  foreach(opt IN LISTS KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST)
+    string(TOUPPER ${opt} OPT)
+    if (KOKKOS_ENABLE_${OPT})
+      message(STATUS "  KOKKOS_ENABLE_${OPT}")
+    endif()
+  endforeach()
+
+  if(KOKKOS_ENABLE_CUDA)
+    if(KOKKOS_CUDA_DIR)
+      message(STATUS "  KOKKOS_CUDA_DIR: ${KOKKOS_CUDA_DIR}")
+    endif()
+  endif()
+
+  if(KOKKOS_QTHREADS_DIR)
+    message(STATUS "  KOKKOS_QTHREADS_DIR: ${KOKKOS_QTHREADS_DIR}")
+  endif()
+
+  if(KOKKOS_HWLOC_DIR)
+    message(STATUS "  KOKKOS_HWLOC_DIR: ${KOKKOS_HWLOC_DIR}")
+  endif()
+
+  if(KOKKOS_MEMKIND_DIR)
+    message(STATUS "  KOKKOS_MEMKIND_DIR: ${KOKKOS_MEMKIND_DIR}")
+  endif()
+
+  message(STATUS "")
+  message(STATUS "Final kokkos settings variable:")
+  message(STATUS "  ${KOKKOS_SETTINGS}")
+
+  message(STATUS "*****************************************************")
+  message(STATUS "")
+endif()
diff --git a/packages/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake b/packages/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..aad1e2bad7629f3f43ca91135752253a20ac9523
--- /dev/null
+++ b/packages/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake
@@ -0,0 +1,75 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+# Check for CUDA support
+
+IF (NOT TPL_ENABLE_CUDA OR CUDA_VERSION VERSION_LESS "4.1")
+  MESSAGE(FATAL_ERROR "\nCUSPARSE: did not find acceptable version of CUDA libraries (4.1 or greater)")
+ELSE()
+  IF(CMAKE_VERSION VERSION_LESS "2.8.8")
+    # FindCUDA before CMake 2.8.8 does not find the cusparse library; therefore, we must find it ourselves.
+    find_library(CUDA_cusparse_LIBRARY
+      cusparse
+      HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib
+      )
+    IF(CUDA_cusparse_LIBRARY STREQUAL "CUDA_cusparse_LIBRARY-NOTFOUND") 
+      MESSAGE(FATAL_ERROR "\nCUSPARSE: could not find cusparse library.")
+    ENDIF()
+  ENDIF(CMAKE_VERSION VERSION_LESS "2.8.8")
+  GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
+  GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
+  GLOBAL_SET(TPL_CUSPARSE_LIBRARIES    ${CUDA_cusparse_LIBRARY})
+ENDIF()
+
diff --git a/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake b/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..715b3e9bde59379c632fbec7926b425e6189e74d
--- /dev/null
+++ b/packages/kokkos/cmake/tpls/FindTPLHWLOC.cmake
@@ -0,0 +1,71 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+
+#-----------------------------------------------------------------------------
+#  Hardware locality detection and control library.
+#
+#  Acquisition information:
+#    Date checked:  November 2011
+#    Checked by:    H. Carter Edwards <hcedwar AT sandia.gov>
+#    Source:        http://www.open-mpi.org/projects/hwloc/
+#    Version:       1.3
+#
+
+TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC
+  REQUIRED_HEADERS hwloc.h
+  REQUIRED_LIBS_NAMES "hwloc"
+  )
+
diff --git a/packages/kokkos/cmake/tpls/FindTPLPthread.cmake b/packages/kokkos/cmake/tpls/FindTPLPthread.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..fc401d7543357f18d225a33efe0cf3bb489170d7
--- /dev/null
+++ b/packages/kokkos/cmake/tpls/FindTPLPthread.cmake
@@ -0,0 +1,82 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+
+SET(USE_THREADS FALSE)
+
+IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES)
+  # Use CMake's Thread finder since it is a bit smarter in determining
+  # whether pthreads is already built into the compiler and doesn't need
+  # a library to link.
+  FIND_PACKAGE(Threads)
+  #If Threads found a copy of pthreads make sure it is one of the cases the tribits
+  #tpl system cannot handle.
+  IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
+    IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread")
+      SET(USE_THREADS TRUE)
+    ENDIF()
+  ENDIF()
+ENDIF()
+
+IF(USE_THREADS)
+  SET(TPL_Pthread_INCLUDE_DIRS "")
+  SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
+  SET(TPL_Pthread_LIBRARY_DIRS "")
+ELSE()
+  TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread
+    REQUIRED_HEADERS pthread.h
+    REQUIRED_LIBS_NAMES pthread
+      )
+ENDIF()
diff --git a/packages/kokkos/cmake/tpls/FindTPLQTHREADS.cmake b/packages/kokkos/cmake/tpls/FindTPLQTHREADS.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..c312f2590bcd29197a0cf3fbd5e0b484579a09c2
--- /dev/null
+++ b/packages/kokkos/cmake/tpls/FindTPLQTHREADS.cmake
@@ -0,0 +1,69 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+
+#-----------------------------------------------------------------------------
+#  Qthreads lightweight threading / tasking library.
+#
+#  Acquisition information:
+#    Date checked:  July 2014
+#    Checked by:    H. Carter Edwards <hcedwar AT sandia.gov>
+#    Source:        https://code.google.com/p/qthreads
+#
+
+TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREADS
+  REQUIRED_HEADERS qthread.h
+  REQUIRED_LIBS_NAMES "qthread"
+  )
diff --git a/packages/kokkos/cmake/tribits.cmake b/packages/kokkos/cmake/tribits.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..1b5a7b2adb4572cf3b454af49ce7db13332fc7ce
--- /dev/null
+++ b/packages/kokkos/cmake/tribits.cmake
@@ -0,0 +1,521 @@
+INCLUDE(CMakeParseArguments)
+INCLUDE(CTest)
+
+cmake_policy(SET CMP0054 NEW)
+
+MESSAGE(STATUS "The project name is: ${PROJECT_NAME}")
+
+IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_OpenMP)
+  SET(${PROJECT_NAME}_ENABLE_OpenMP OFF)
+ENDIF()
+
+IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_DEBUG)
+  SET(${PROJECT_NAME}_ENABLE_DEBUG OFF)
+ENDIF()
+
+IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_CXX11)
+  SET(${PROJECT_NAME}_ENABLE_CXX11 ON)
+ENDIF()
+
+IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_TESTS)
+  SET(${PROJECT_NAME}_ENABLE_TESTS OFF)
+ENDIF()
+
+IF(NOT DEFINED TPL_ENABLE_Pthread)
+  SET(TPL_ENABLE_Pthread OFF)
+ENDIF()
+
+FUNCTION(ASSERT_DEFINED VARS)
+  FOREACH(VAR ${VARS})
+    IF(NOT DEFINED ${VAR})
+      MESSAGE(SEND_ERROR "Error, the variable ${VAR} is not defined!")
+    ENDIF()
+  ENDFOREACH()
+ENDFUNCTION()
+
+MACRO(GLOBAL_SET VARNAME)
+  SET(${VARNAME} ${ARGN} CACHE INTERNAL "")
+ENDMACRO()
+
+MACRO(PREPEND_GLOBAL_SET VARNAME)
+  ASSERT_DEFINED(${VARNAME})
+  GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}})
+ENDMACRO()
+
+# Used by TRIBITS_ADD_LIBRARY below.
+FUNCTION(REMOVE_GLOBAL_DUPLICATES VARNAME)
+  ASSERT_DEFINED(${VARNAME})
+  IF (${VARNAME})
+    SET(TMP ${${VARNAME}})
+    LIST(REMOVE_DUPLICATES TMP)
+    GLOBAL_SET(${VARNAME} ${TMP})
+  ENDIF()
+ENDFUNCTION()
+
+#MACRO(TRIBITS_ADD_OPTION_AND_DEFINE  USER_OPTION_NAME  MACRO_DEFINE_NAME DOCSTRING  DEFAULT_VALUE)
+#  MESSAGE(STATUS "TRIBITS_ADD_OPTION_AND_DEFINE: '${USER_OPTION_NAME}' '${MACRO_DEFINE_NAME}' '${DEFAULT_VALUE}'")
+#  SET( ${USER_OPTION_NAME} "${DEFAULT_VALUE}" CACHE BOOL "${DOCSTRING}" )
+#  IF(NOT ${MACRO_DEFINE_NAME} STREQUAL "")
+#    IF(${USER_OPTION_NAME})
+#      GLOBAL_SET(${MACRO_DEFINE_NAME} ON)
+#    ELSE()
+#      GLOBAL_SET(${MACRO_DEFINE_NAME} OFF)
+#    ENDIF()
+#  ENDIF()
+#ENDMACRO()
+
+FUNCTION(TRIBITS_CONFIGURE_FILE  PACKAGE_NAME_CONFIG_FILE)
+
+  # Configure the file
+  CONFIGURE_FILE(
+    ${PACKAGE_SOURCE_DIR}/cmake/${PACKAGE_NAME_CONFIG_FILE}.in
+    ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME_CONFIG_FILE}
+    )
+
+ENDFUNCTION()
+
+#MACRO(TRIBITS_ADD_DEBUG_OPTION)
+#  TRIBITS_ADD_OPTION_AND_DEFINE(
+#    ${PROJECT_NAME}_ENABLE_DEBUG
+#    HAVE_${PROJECT_NAME_UC}_DEBUG
+#    "Enable a host of runtime debug checking."
+#    OFF
+#    )
+#ENDMACRO()
+
+
+MACRO(TRIBITS_ADD_TEST_DIRECTORIES)
+  IF(${${PROJECT_NAME}_ENABLE_TESTS})
+    FOREACH(TEST_DIR ${ARGN})
+      ADD_SUBDIRECTORY(${TEST_DIR})
+    ENDFOREACH()
+  ENDIF()
+ENDMACRO()
+
+MACRO(TRIBITS_ADD_EXAMPLE_DIRECTORIES)
+  IF(${PACKAGE_NAME}_ENABLE_EXAMPLES OR ${PARENT_PACKAGE_NAME}_ENABLE_EXAMPLES)
+    FOREACH(EXAMPLE_DIR ${ARGN})
+      ADD_SUBDIRECTORY(${EXAMPLE_DIR})
+    ENDFOREACH()
+  ENDIF()
+ENDMACRO()
+
+
+function(INCLUDE_DIRECTORIES)
+  cmake_parse_arguments(INCLUDE_DIRECTORIES "REQUIRED_DURING_INSTALLATION_TESTING" "" "" ${ARGN})
+  _INCLUDE_DIRECTORIES(${INCLUDE_DIRECTORIES_UNPARSED_ARGUMENTS})
+endfunction()
+
+
+MACRO(TARGET_TRANSFER_PROPERTY TARGET_NAME PROP_IN PROP_OUT)
+  SET(PROP_VALUES)
+  FOREACH(TARGET_X ${ARGN})
+    LIST(APPEND PROP_VALUES "$<TARGET_PROPERTY:${TARGET_X},${PROP_IN}>")
+  ENDFOREACH()
+  SET_TARGET_PROPERTIES(${TARGET_NAME} PROPERTIES ${PROP_OUT} "${PROP_VALUES}")
+ENDMACRO()
+
+MACRO(ADD_INTERFACE_LIBRARY LIB_NAME)
+  FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "")
+  ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp)
+  SET_TARGET_PROPERTIES(${LIB_NAME} PROPERTIES INTERFACE TRUE)
+ENDMACRO()
+
+# Older versions of CMake do not make include directories transitive
+MACRO(TARGET_LINK_AND_INCLUDE_LIBRARIES TARGET_NAME)
+  TARGET_LINK_LIBRARIES(${TARGET_NAME} LINK_PUBLIC ${ARGN})
+  FOREACH(DEP_LIB ${ARGN})
+    TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INTERFACE_INCLUDE_DIRECTORIES>)
+    TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INCLUDE_DIRECTORIES>)
+  ENDFOREACH()
+ENDMACRO()
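+# Usage sketch (hypothetical targets): TARGET_LINK_AND_INCLUDE_LIBRARIES(mylib deplib)
+# links mylib against deplib and also copies deplib's include directories onto mylib,
+# emulating the transitive usage requirements that newer CMake provides natively.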
+
+FUNCTION(TRIBITS_ADD_LIBRARY LIBRARY_NAME)
+
+  SET(options STATIC SHARED TESTONLY NO_INSTALL_LIB_OR_HEADERS CUDALIBRARY)
+  SET(oneValueArgs)
+  SET(multiValueArgs HEADERS HEADERS_INSTALL_SUBDIR NOINSTALLHEADERS SOURCES DEPLIBS IMPORTEDLIBS DEFINES ADDED_LIB_TARGET_NAME_OUT)
+
+  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  IF(PARSE_HEADERS)
+    LIST(REMOVE_DUPLICATES PARSE_HEADERS)
+  ENDIF()
+  IF(PARSE_SOURCES)
+    LIST(REMOVE_DUPLICATES PARSE_SOURCES)
+  ENDIF()
+
+  # Local variable to hold all of the libraries that will be directly linked
+  # to this library.
+  SET(LINK_LIBS ${${PACKAGE_NAME}_DEPS})
+
+  # Add dependent libraries passed directly in
+
+  IF (PARSE_IMPORTEDLIBS)
+    LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
+  ENDIF()
+
+  IF (PARSE_DEPLIBS)
+    LIST(APPEND LINK_LIBS ${PARSE_DEPLIBS})
+  ENDIF()
+
+  # Add the library and all the dependencies
+
+  IF (PARSE_DEFINES)
+    ADD_DEFINITIONS(${PARSE_DEFINES})
+  ENDIF()
+
+  IF (PARSE_STATIC)
+    SET(STATIC_KEYWORD "STATIC")
+  ELSE()
+    SET(STATIC_KEYWORD)
+  ENDIF()
+
+  IF (PARSE_SHARED)
+    SET(SHARED_KEYWORD "SHARED")
+  ELSE()
+    SET(SHARED_KEYWORD)
+  ENDIF()
+
+  IF (PARSE_TESTONLY)
+    SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
+  ELSE()
+    SET(EXCLUDE_FROM_ALL_KEYWORD)
+  ENDIF()
+  IF (NOT PARSE_CUDALIBRARY)
+    ADD_LIBRARY(
+      ${LIBRARY_NAME}
+      ${STATIC_KEYWORD}
+      ${SHARED_KEYWORD}
+      ${EXCLUDE_FROM_ALL_KEYWORD}
+      ${PARSE_HEADERS}
+      ${PARSE_NOINSTALLHEADERS}
+      ${PARSE_SOURCES}
+      )
+  ELSE()
+    CUDA_ADD_LIBRARY(
+      ${LIBRARY_NAME}
+      ${PARSE_HEADERS}
+      ${PARSE_NOINSTALLHEADERS}
+      ${PARSE_SOURCES}
+      )
+  ENDIF()
+
+  TARGET_LINK_AND_INCLUDE_LIBRARIES(${LIBRARY_NAME} ${LINK_LIBS})
+
+  IF (NOT PARSE_TESTONLY OR PARSE_NO_INSTALL_LIB_OR_HEADERS)
+
+    INSTALL(
+      TARGETS ${LIBRARY_NAME}
+      EXPORT ${PROJECT_NAME}
+      RUNTIME DESTINATION bin
+      LIBRARY DESTINATION lib
+      ARCHIVE DESTINATION lib
+      COMPONENT ${PACKAGE_NAME}
+      )
+
+    INSTALL(
+      FILES  ${PARSE_HEADERS}
+      EXPORT ${PROJECT_NAME}
+      DESTINATION include
+      COMPONENT ${PACKAGE_NAME}
+      )
+
+    INSTALL(
+      DIRECTORY  ${PARSE_HEADERS_INSTALL_SUBDIR}
+      EXPORT ${PROJECT_NAME}
+      DESTINATION include
+      COMPONENT ${PACKAGE_NAME}
+      )
+
+  ENDIF()
+
+  IF (NOT PARSE_TESTONLY)
+    PREPEND_GLOBAL_SET(${PACKAGE_NAME}_LIBS ${LIBRARY_NAME})
+    REMOVE_GLOBAL_DUPLICATES(${PACKAGE_NAME}_LIBS)
+  ENDIF()
+
+ENDFUNCTION()
+
+FUNCTION(TRIBITS_ADD_EXECUTABLE EXE_NAME)
+
+  SET(options NOEXEPREFIX NOEXESUFFIX ADD_DIR_TO_NAME INSTALLABLE TESTONLY)
+  SET(oneValueArgs ADDED_EXE_TARGET_NAME_OUT)
+  SET(multiValueArgs SOURCES CATEGORIES HOST XHOST HOSTTYPE XHOSTTYPE DIRECTORY TESTONLYLIBS IMPORTEDLIBS DEPLIBS COMM LINKER_LANGUAGE TARGET_DEFINES DEFINES)
+
+  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  IF (PARSE_TARGET_DEFINES)
+    TARGET_COMPILE_DEFINITIONS(${EXE_NAME} PUBLIC ${PARSE_TARGET_DEFINES})
+  ENDIF()
+
+  SET(LINK_LIBS PACKAGE_${PACKAGE_NAME})
+
+  IF (PARSE_TESTONLYLIBS)
+    LIST(APPEND LINK_LIBS ${PARSE_TESTONLYLIBS})
+  ENDIF()
+
+  IF (PARSE_IMPORTEDLIBS)
+    LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
+  ENDIF()
+
+  SET (EXE_SOURCES)
+  IF(PARSE_DIRECTORY)
+    FOREACH( SOURCE_FILE ${PARSE_SOURCES} )
+      IF(IS_ABSOLUTE ${SOURCE_FILE})
+        SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE})
+      ELSE()
+        SET (EXE_SOURCES ${EXE_SOURCES} ${PARSE_DIRECTORY}/${SOURCE_FILE})
+      ENDIF()
+    ENDFOREACH( )
+  ELSE()
+    FOREACH( SOURCE_FILE ${PARSE_SOURCES} )
+      SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE})
+    ENDFOREACH( )
+  ENDIF()
+
+  SET(EXE_BINARY_NAME ${EXE_NAME})
+  IF(DEFINED PACKAGE_NAME AND NOT PARSE_NOEXEPREFIX)
+    SET(EXE_BINARY_NAME ${PACKAGE_NAME}_${EXE_BINARY_NAME})
+  ENDIF()
+
+  # IF (PARSE_TESTONLY)
+  #   SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
+  # ELSE()
+  #   SET(EXCLUDE_FROM_ALL_KEYWORD)
+  # ENDIF()
+  ADD_EXECUTABLE(${EXE_BINARY_NAME} ${EXCLUDE_FROM_ALL_KEYWORD} ${EXE_SOURCES})
+
+  TARGET_LINK_AND_INCLUDE_LIBRARIES(${EXE_BINARY_NAME} ${LINK_LIBS})
+
+  IF(PARSE_ADDED_EXE_TARGET_NAME_OUT)
+    SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${EXE_BINARY_NAME} PARENT_SCOPE)
+  ENDIF()
+
+  IF(PARSE_INSTALLABLE)
+    INSTALL(
+      TARGETS ${EXE_BINARY_NAME}
+      EXPORT ${PROJECT_NAME}
+        DESTINATION bin
+    )
+  ENDIF()
+ENDFUNCTION()
+
+ADD_CUSTOM_TARGET(check COMMAND ${CMAKE_CTEST_COMMAND} -VV -C ${CMAKE_CFG_INTDIR})
+
+FUNCTION(TRIBITS_ADD_TEST)
+ENDFUNCTION()
+FUNCTION(TRIBITS_TPL_TENTATIVELY_ENABLE)
+ENDFUNCTION()
+
+FUNCTION(TRIBITS_ADD_EXECUTABLE_AND_TEST EXE_NAME)
+
+  SET(options STANDARD_PASS_OUTPUT WILL_FAIL)
+  SET(oneValueArgs PASS_REGULAR_EXPRESSION FAIL_REGULAR_EXPRESSION ENVIRONMENT TIMEOUT CATEGORIES ADDED_TESTS_NAMES_OUT ADDED_EXE_TARGET_NAME_OUT)
+  SET(multiValueArgs)
+
+  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  TRIBITS_ADD_EXECUTABLE(${EXE_NAME} TESTONLY ADDED_EXE_TARGET_NAME_OUT TEST_NAME ${PARSE_UNPARSED_ARGUMENTS})
+
+  IF(WIN32)
+    ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${TEST_NAME}${CMAKE_EXECUTABLE_SUFFIX})
+  ELSE()
+    ADD_TEST(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
+  ENDIF()
+  ADD_DEPENDENCIES(check ${TEST_NAME})
+
+  IF(PARSE_FAIL_REGULAR_EXPRESSION)
+    SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${PARSE_FAIL_REGULAR_EXPRESSION})
+  ENDIF()
+
+  IF(PARSE_PASS_REGULAR_EXPRESSION)
+    SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${PARSE_PASS_REGULAR_EXPRESSION})
+  ENDIF()
+
+  IF(PARSE_WILL_FAIL)
+    SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${PARSE_WILL_FAIL})
+  ENDIF()
+
+  IF(PARSE_ADDED_TESTS_NAMES_OUT)
+    SET(${PARSE_ADDED_TESTS_NAMES_OUT} ${TEST_NAME} PARENT_SCOPE)
+  ENDIF()
+
+  IF(PARSE_ADDED_EXE_TARGET_NAME_OUT)
+    SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${TEST_NAME} PARENT_SCOPE)
+  ENDIF()
+
+ENDFUNCTION()
+
+MACRO(TIBITS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME)
+  ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME})
+  TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES})
+  TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS})
+ENDMACRO()
+
+FUNCTION(TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME)
+
+  SET(options MUST_FIND_ALL_LIBS MUST_FIND_ALL_HEADERS NO_PRINT_ENABLE_SUCCESS_FAIL)
+  SET(oneValueArgs)
+  SET(multiValueArgs REQUIRED_HEADERS REQUIRED_LIBS_NAMES)
+
+  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE)
+  IF (PARSE_REQUIRED_LIBS_NAMES)
+    FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES})
+    IF(NOT TPL_${TPL_NAME}_LIBRARIES)
+      SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
+    ENDIF()
+  ENDIF()
+  IF (PARSE_REQUIRED_HEADERS)
+    FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS})
+    IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS)
+      SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
+    ENDIF()
+  ENDIF()
+
+
+  IF (_${TPL_NAME}_ENABLE_SUCCESS)
+    TIBITS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME})
+  ENDIF()
+
+ENDFUNCTION()
+
+#MACRO(TRIBITS_PROCESS_TPL_DEP_FILE TPL_FILE)
+#  GET_FILENAME_COMPONENT(TPL_NAME ${TPL_FILE} NAME_WE)
+#  INCLUDE("${TPL_FILE}")
+#  IF(TARGET TPL_LIB_${TPL_NAME})
+#    MESSAGE(STATUS "Found tpl library: ${TPL_NAME}")
+#    SET(TPL_ENABLE_${TPL_NAME} TRUE)
+#  ELSE()
+#    MESSAGE(STATUS "Tpl library not found: ${TPL_NAME}")
+#    SET(TPL_ENABLE_${TPL_NAME} FALSE)
+#  ENDIF()
+#ENDMACRO()
+
+MACRO(PREPEND_TARGET_SET VARNAME TARGET_NAME TYPE)
+  IF(TYPE STREQUAL "REQUIRED")
+    SET(REQUIRED TRUE)
+  ELSE()
+    SET(REQUIRED FALSE)
+  ENDIF()
+  IF(TARGET ${TARGET_NAME})
+    PREPEND_GLOBAL_SET(${VARNAME} ${TARGET_NAME})
+  ELSE()
+    IF(REQUIRED)
+      MESSAGE(FATAL_ERROR "Missing dependency ${TARGET_NAME}")
+    ENDIF()
+  ENDIF()
+ENDMACRO()
+
+MACRO(TRIBITS_APPEND_PACKAGE_DEPS DEP_LIST TYPE)
+  FOREACH(DEP ${ARGN})
+    PREPEND_GLOBAL_SET(${DEP_LIST} PACKAGE_${DEP})
+  ENDFOREACH()
+ENDMACRO()
+
+MACRO(TRIBITS_APPEND_TPLS_DEPS DEP_LIST TYPE)
+  FOREACH(DEP ${ARGN})
+    PREPEND_TARGET_SET(${DEP_LIST} TPL_LIB_${DEP} ${TYPE})
+  ENDFOREACH()
+ENDMACRO()
+
+MACRO(TRIBITS_ENABLE_TPLS)
+  FOREACH(TPL ${ARGN})
+    IF(TARGET ${TPL})
+      GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} TRUE)
+    ELSE()
+      GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} FALSE)
+    ENDIF()
+  ENDFOREACH()
+ENDMACRO()
+
+MACRO(TRIBITS_PACKAGE_DEFINE_DEPENDENCIES)
+
+  SET(options)
+  SET(oneValueArgs)
+  SET(multiValueArgs 
+    LIB_REQUIRED_PACKAGES
+    LIB_OPTIONAL_PACKAGES
+    TEST_REQUIRED_PACKAGES
+    TEST_OPTIONAL_PACKAGES
+    LIB_REQUIRED_TPLS
+    LIB_OPTIONAL_TPLS
+    TEST_REQUIRED_TPLS
+    TEST_OPTIONAL_TPLS
+    REGRESSION_EMAIL_LIST
+    SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS
+  )
+  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  GLOBAL_SET(${PACKAGE_NAME}_DEPS "")
+  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_PACKAGES})
+  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_PACKAGES})
+  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_TPLS})
+  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_TPLS})
+
+  GLOBAL_SET(${PACKAGE_NAME}_TEST_DEPS "")
+  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_PACKAGES})
+  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_PACKAGES})
+  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_TPLS})
+  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_TPLS})
+
+  TRIBITS_ENABLE_TPLS(${PARSE_LIB_REQUIRED_TPLS} ${PARSE_LIB_OPTIONAL_TPLS} ${PARSE_TEST_REQUIRED_TPLS} ${PARSE_TEST_OPTIONAL_TPLS})
+
+ENDMACRO()
+
+MACRO(TRIBITS_SUBPACKAGE NAME)
+  SET(PACKAGE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+  SET(PARENT_PACKAGE_NAME ${PACKAGE_NAME})
+  SET(PACKAGE_NAME ${PACKAGE_NAME}${NAME})
+  STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
+  SET(${PACKAGE_NAME}_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+
+  ADD_INTERFACE_LIBRARY(PACKAGE_${PACKAGE_NAME})
+
+  GLOBAL_SET(${PACKAGE_NAME}_LIBS "")
+
+  INCLUDE(${PACKAGE_SOURCE_DIR}/cmake/Dependencies.cmake)
+
+ENDMACRO(TRIBITS_SUBPACKAGE)
+
+MACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
+  TARGET_LINK_AND_INCLUDE_LIBRARIES(PACKAGE_${PACKAGE_NAME} ${${PACKAGE_NAME}_LIBS})
+ENDMACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
+
+MACRO(TRIBITS_PACKAGE_DECL NAME)
+
+  SET(PACKAGE_NAME ${NAME})
+  SET(${PACKAGE_NAME}_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+  STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
+
+  #SET(TRIBITS_DEPS_DIR "${CMAKE_SOURCE_DIR}/cmake/deps")
+  #FILE(GLOB TPLS_FILES "${TRIBITS_DEPS_DIR}/*.cmake")
+  #FOREACH(TPL_FILE ${TPLS_FILES})
+  #  TRIBITS_PROCESS_TPL_DEP_FILE(${TPL_FILE})
+  #ENDFOREACH()
+
+ENDMACRO()
+
+
+MACRO(TRIBITS_PROCESS_SUBPACKAGES)
+  FILE(GLOB SUBPACKAGES RELATIVE ${CMAKE_SOURCE_DIR} */cmake/Dependencies.cmake)
+  FOREACH(SUBPACKAGE ${SUBPACKAGES})
+    GET_FILENAME_COMPONENT(SUBPACKAGE_CMAKE ${SUBPACKAGE} DIRECTORY)
+    GET_FILENAME_COMPONENT(SUBPACKAGE_DIR ${SUBPACKAGE_CMAKE} DIRECTORY)
+    ADD_SUBDIRECTORY(${CMAKE_BINARY_DIR}/../${SUBPACKAGE_DIR})
+  ENDFOREACH()
+ENDMACRO(TRIBITS_PROCESS_SUBPACKAGES)
+
+MACRO(TRIBITS_PACKAGE_DEF)
+ENDMACRO(TRIBITS_PACKAGE_DEF)
+
+MACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)
+ENDMACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)
+
+MACRO(TRIBITS_EXCLUDE_FILES)
+ENDMACRO(TRIBITS_EXCLUDE_FILES)
+
+MACRO(TRIBITS_PACKAGE_POSTPROCESS)
+ENDMACRO(TRIBITS_PACKAGE_POSTPROCESS)
+
diff --git a/packages/kokkos/config/test_all_sandia b/packages/kokkos/config/test_all_sandia
new file mode 100755
index 0000000000000000000000000000000000000000..28b4a64b10819538c7f5fc672ada210671343cf4
--- /dev/null
+++ b/packages/kokkos/config/test_all_sandia
@@ -0,0 +1,777 @@
+#!/bin/bash -e
+
+#
+# Global config
+#
+
+set -o pipefail
+
+# Determine current machine.
+
+MACHINE=""
+HOSTNAME=$(hostname)
+PROCESSOR=`uname -p`
+
+if [[ "$HOSTNAME" =~ (white|ride).* ]]; then
+  MACHINE=white
+  module load git
+fi
+
+if [[ "$HOSTNAME" =~ .*bowman.* ]]; then
+  MACHINE=bowman
+  module load git
+fi
+
+if [[ "$HOSTNAME" =~ n.* ]]; then # Warning: very generic name
+  if [[ "$PROCESSOR" = "aarch64" ]]; then
+    MACHINE=sullivan
+    module load git
+  fi
+fi
+
+if [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name
+  if [[ "$MACHINE" = "" ]]; then
+    MACHINE=shepard
+    module load git
+  fi
+fi
+
+if [[ "$HOSTNAME" =~ apollo ]]; then
+  MACHINE=apollo
+  module load git
+fi
+
+if [[ "$HOSTNAME" =~ sullivan ]]; then
+  MACHINE=sullivan
+  module load git
+fi
+
+if [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then
+  if [[ "$MACHINE" = "" ]]; then
+    MACHINE=sems
+    module load sems-git
+  fi  
+fi
+
+if [[ "$MACHINE" = "" ]]; then
+  echo "Unrecognized machine" >&2
+  exit 1
+fi
+
+echo "Running on machine: $MACHINE"
+
+GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
+IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
+ARM_GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
+INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
+CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial"
+CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial"
+CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial"
+
+GCC_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized"
+IBM_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
+CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
+INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
+#CUDA_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
+CUDA_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized"
+PGI_WARNING_FLAGS=""
+
+# Default. Machine specific can override.
+DEBUG=False
+ARGS=""
+CUSTOM_BUILD_LIST=""
+QTHREADS_PATH=""
+DRYRUN=False
+BUILD_ONLY=False
+declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
+TEST_SCRIPT=False
+SKIP_HWLOC=False
+SPOT_CHECK=False
+
+PRINT_HELP=False
+OPT_FLAG=""
+CXX_FLAGS_EXTRA=""
+LD_FLAGS_EXTRA=""
+KOKKOS_OPTIONS=""
+
+#
+# Handle arguments.
+#
+
+while [[ $# > 0 ]]
+do
+  key="$1"
+
+  case $key in
+    --kokkos-path*)
+      KOKKOS_PATH="${key#*=}"
+      ;;
+    --qthreads-path*)
+      QTHREADS_PATH="${key#*=}"
+      ;;
+    --build-list*)
+      CUSTOM_BUILD_LIST="${key#*=}"
+      ;;
+    --debug*)
+      DEBUG=True
+      ;;
+    --build-only*)
+      BUILD_ONLY=True
+      ;;
+    --test-script*)
+      TEST_SCRIPT=True
+      ;;
+    --skip-hwloc*)
+      SKIP_HWLOC=True
+      ;;
+    --num*)
+      NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
+      ;;
+    --dry-run*)
+      DRYRUN=True
+      ;;
+    --spot-check*)
+      SPOT_CHECK=True
+      ;;
+    --arch*)
+      ARCH_FLAG="--arch=${key#*=}"
+      ;;
+    --opt-flag*)
+      OPT_FLAG="${key#*=}"
+      ;;
+    --with-cuda-options*)
+      KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}"
+      ;;
+    --cxxflags-extra*)
+      CXX_FLAGS_EXTRA="${key#*=}"
+      ;;
+    --ldflags-extra*)
+      LD_FLAGS_EXTRA="${key#*=}"
+      ;;
+    --help*)
+      PRINT_HELP=True
+      ;;
+    *)
+      # args, just append
+      ARGS="$ARGS $1"
+      ;;
+  esac
+
+  shift
+done
+
+SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
+
+# Set kokkos path.
+if [ -z "$KOKKOS_PATH" ]; then
+  KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT
+else
+  # Ensure KOKKOS_PATH is abs path.
+  KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
+fi
+
+UNCOMMITTED=`cd ${KOKKOS_PATH}; git status --porcelain 2>/dev/null`
+if ! [ -z "$UNCOMMITTED" ]; then
+  echo "WARNING!! THE FOLLOWING CHANGES ARE UNCOMMITTED!! :"
+  echo "$UNCOMMITTED"
+  echo ""
+fi
+
+GITSTATUS=`cd ${KOKKOS_PATH}; git log -n 1 --format=oneline`
+echo "Repository Status: " ${GITSTATUS}
+echo ""
+echo ""
+
+#
+# Machine specific config.
+#
+
+if [ "$MACHINE" = "sems" ]; then
+  source /projects/sems/modulefiles/utils/sems-modules-init.sh
+
+  BASE_MODULE_LIST="sems-env,kokkos-env,kokkos-hwloc/1.10.1/base,sems-<COMPILER_NAME>/<COMPILER_VERSION>"
+  CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
+  CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
+
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG=""
+  fi
+
+  if [ "$SPOT_CHECK" = "True" ]; then
+    # Format: (compiler module-list build-list exe-name warning-flag)
+    COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS"
+               "gcc/6.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
+               "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
+               "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
+               "cuda/8.0.44 $CUDA8_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+    )
+  else
+    # Format: (compiler module-list build-list exe-name warning-flag)
+    COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+               "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+               "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+               "clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+               "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+    )
+  fi
+elif [ "$MACHINE" = "white" ]; then
+  source /etc/profile.d/modules.sh
+  SKIP_HWLOC=True
+  export SLURM_TASKS_PER_NODE=32
+
+  BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
+  IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
+  CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/5.4.0"
+  CUDA_MODULE_LIST2="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/6.3.0,ibm/xl/13.1.6"
+
+  # Don't do pthread on white.
+  GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
+
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "ibm/13.1.6 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
+             "cuda/8.0.44 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+             "cuda/9.0.103 $CUDA_MODULE_LIST2 $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+  )
+
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=Power8,Kepler37"
+  fi
+
+  NUM_JOBS_TO_RUN_IN_PARALLEL=2
+
+elif [ "$MACHINE" = "bowman" ]; then
+  source /etc/profile.d/modules.sh
+  SKIP_HWLOC=True
+  export SLURM_TASKS_PER_NODE=32
+
+  BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
+
+  OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
+
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("intel/16.4.258 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+             "intel/17.2.174 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+             "intel/18.0.128 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+  )
+
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=KNL"
+  fi
+
+  NUM_JOBS_TO_RUN_IN_PARALLEL=2
+
+elif [ "$MACHINE" = "sullivan" ]; then
+  source /etc/profile.d/modules.sh
+  SKIP_HWLOC=True
+  export SLURM_TASKS_PER_NODE=96
+
+  BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
+
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("gcc/6.1.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS")
+
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=ARMv8-ThunderX"
+  fi
+
+  NUM_JOBS_TO_RUN_IN_PARALLEL=2
+
+elif [ "$MACHINE" = "shepard" ]; then
+  source /etc/profile.d/modules.sh
+  SKIP_HWLOC=True
+  export SLURM_TASKS_PER_NODE=32
+
+  BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
+  BASE_MODULE_LIST_INTEL="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
+
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("intel/17.4.196 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+             "intel/18.0.128 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+             "pgi/17.10.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS"
+  )
+
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=HSW"
+  fi
+  NUM_JOBS_TO_RUN_IN_PARALLEL=2
+
+elif [ "$MACHINE" = "apollo" ]; then
+  source /projects/sems/modulefiles/utils/sems-modules-init.sh
+  module use /home/projects/modulefiles/local/x86-64
+  module load kokkos-env
+
+  module load sems-git
+  module load sems-tex
+  module load sems-cmake/3.5.2
+  module load sems-gdb
+
+  SKIP_HWLOC=True
+
+  BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base"
+  CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
+  CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
+
+  CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/9.0.69"
+  NVCC_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0"
+
+  BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP"
+  BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread"
+  BUILD_LIST_CLANG="Serial,Pthread,OpenMP"
+
+  if [ "$SPOT_CHECK" = "True" ]; then
+    # Format: (compiler module-list build-list exe-name warning-flag)
+    COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS"
+               "gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
+               "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
+               "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
+               "clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
+               "cuda/9.1 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+    )
+  else
+    # Format: (compiler module-list build-list exe-name warning-flag)
+    COMPILERS=("cuda/9.1 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "clang/6.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
+               "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
+               "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+               "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+    )
+  fi
+
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=SNB,Volta70"
+  fi
+
+  NUM_JOBS_TO_RUN_IN_PARALLEL=2
+
+else
+  echo "Unhandled machine $MACHINE" >&2
+  exit 1
+fi
+
+export OMP_NUM_THREADS=4
+
+declare -i NUM_RESULTS_TO_KEEP=7
+
+RESULT_ROOT_PREFIX=TestAll
+
+if [ "$PRINT_HELP" = "True" ]; then
+  echo "test_all_sandia <ARGS> <OPTIONS>:"
+  echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
+  echo "    Defaults to root repo containing this script"
+  echo "--debug: Run tests in debug. Defaults to False"
+  echo "--test-script: Test this script, not Kokkos"
+  echo "--skip-hwloc: Do not do hwloc tests"
+  echo "--num=N: Number of jobs to run in parallel"
+  echo "--spot-check: Minimal test set to issue pull request"
+  echo "--dry-run: Just print what would be executed"
+  echo "--build-only: Just do builds, don't run anything"
+  echo "--opt-flag=FLAG: Optimization flag (default: -O3)"
+  echo "--cxxflags-extra=FLAGS: Extra flags to be added to CXX_FLAGS"
+  echo "--ldflags-extra=FLAGS: Extra flags to be added to LD_FLAGS"
+  echo "--arch=ARCHITECTURE: overwrite architecture flags"
+  echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS"
+  echo "--build-list=BUILD,BUILD,BUILD..."
+  echo "    Provide a comma-separated list of builds instead of running all builds"
+  echo "    Valid items:"
+  echo "      OpenMP, Pthread, Qthreads, Serial, OpenMP_Serial, Pthread_Serial"
+  echo "      Qthreads_Serial, Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
+  echo ""
+
+  echo "ARGS: list of expressions matching compilers to test"
+  echo "  supported compilers sems"
+  for COMPILER_DATA in "${COMPILERS[@]}"; do
+    ARR=($COMPILER_DATA)
+    COMPILER=${ARR[0]}
+    echo "    $COMPILER"
+  done
+  echo ""
+
+  echo "Examples:"
+  echo "  Run all tests"
+  echo "  % test_all_sandia"
+  echo ""
+  echo "  Run all gcc tests"
+  echo "  % test_all_sandia gcc"
+  echo ""
+  echo "  Run all gcc/4.8.4 and all intel tests"
+  echo "  % test_all_sandia gcc/4.8.4 intel"
+  echo ""
+  echo "  Run all tests in debug"
+  echo "  % test_all_sandia --debug"
+  echo ""
+  echo "  Run gcc/4.8.4 and only do OpenMP and OpenMP_Serial builds"
+  echo "  % test_all_sandia gcc/4.8.4 --build-list=OpenMP,OpenMP_Serial"
+  echo ""
+  echo "If you want to kill the tests, do:"
+  echo "  hit ctrl-z"
+  echo "  % kill -9 %1"
+  echo
+  exit 0
+fi
+
+# Set build type.
+if [ "$DEBUG" = "True" ]; then
+  BUILD_TYPE=debug
+else
+  BUILD_TYPE=release
+fi
+
+# If no args provided, do all compilers.
+if [ -z "$ARGS" ]; then
+  ARGS='?'
+fi
+
+# Process args to figure out which compilers to test.
+COMPILERS_TO_TEST=""
+
+for ARG in $ARGS; do
+  for COMPILER_DATA in "${COMPILERS[@]}"; do
+    ARR=($COMPILER_DATA)
+    COMPILER=${ARR[0]}
+
+    if [[ "$COMPILER" = $ARG* ]]; then
+      if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then
+        COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER"
+      else
+        echo "Tried to add $COMPILER twice"
+      fi
+    fi
+  done
+done
+
+# Check if Qthreads build requested.
+HAVE_QTHREADS_BUILD="False"
+if [ -n "$CUSTOM_BUILD_LIST" ]; then
+  if [[ "$CUSTOM_BUILD_LIST" = *Qthreads* ]]; then
+    HAVE_QTHREADS_BUILD="True"
+  fi
+else
+  for COMPILER_DATA in "${COMPILERS[@]}"; do
+    ARR=($COMPILER_DATA)
+    BUILD_LIST=${ARR[2]}
+    if [[ "$BUILD_LIST" = *Qthreads* ]]; then
+      HAVE_QTHREADS_BUILD="True"
+    fi
+  done
+fi
+
+# Ensure Qthreads path is set if Qthreads build is requested.
+if [ "$HAVE_QTHREADS_BUILD" = "True" ]; then
+  if [ -z "$QTHREADS_PATH" ]; then
+    echo "Need to supply Qthreads path (--qthreads-path) when testing Qthreads backend." >&2
+    exit 1
+  else
+    # Strip trailing slashes from path.
+    QTHREADS_PATH=$(echo $QTHREADS_PATH | sed 's/\/*$//')
+  fi
+fi
+
+#
+# Functions.
+#
+
+# get_compiler_name <COMPILER>
+get_compiler_name() {
+  echo $1 | cut -d/ -f1
+}
+
+# get_compiler_version <COMPILER>
+get_compiler_version() {
+  echo $1 | cut -d/ -f2
+}
+
+# Do not call directly.
+get_compiler_data() {
+  local compiler=$1
+  local item=$2
+  local compiler_name=$(get_compiler_name $compiler)
+  local compiler_vers=$(get_compiler_version $compiler)
+
+  local compiler_data
+  for compiler_data in "${COMPILERS[@]}" ; do
+    local arr=($compiler_data)
+
+    if [ "$compiler" = "${arr[0]}" ]; then
+      echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g"
+      return 0
+    fi
+  done
+
+  # Not found.
+  echo "Unreconized compiler $compiler" >&2
+  exit 1
+}
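+
+# Usage sketch: for the hypothetical COMPILERS entry
+#   "gcc/5.3.0 <module-list> <build-list> g++ <warning-flags>"
+# `get_compiler_data gcc/5.3.0 3` prints "g++"; commas in the selected field are
+# turned into spaces and any <COMPILER_NAME>/<COMPILER_VERSION> placeholders are
+# replaced with "gcc" and "5.3.0".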
+
+#
+# For all getters, usage: <GETTER> <COMPILER>
+#
+
+get_compiler_modules() {
+  get_compiler_data $1 1
+}
+
+get_compiler_build_list() {
+  get_compiler_data $1 2
+}
+
+get_compiler_exe_name() {
+  get_compiler_data $1 3
+}
+
+get_compiler_warning_flags() {
+  get_compiler_data $1 4
+}
+
+run_cmd() {
+  echo "RUNNING: $*"
+  if [ "$DRYRUN" != "True" ]; then
+    eval "$* 2>&1"
+  fi
+}
+
+# report_and_log_test_result <SUCCESS> <DESC> <COMMENT>
+report_and_log_test_result() {
+  # Use sane var names.
+  local success=$1; local desc=$2; local comment=$3;
+
+  if [ "$success" = "0" ]; then
+    echo "  PASSED $desc"
+    echo $comment > $PASSED_DIR/$desc
+  else
+    # For failures, comment should be the name of the phase that failed.
+    echo "  FAILED $desc" >&2
+    echo $comment > $FAILED_DIR/$desc
+    cat ${desc}.${comment}.log
+  fi
+}
+
+setup_env() {
+  local compiler=$1
+  local compiler_modules=$(get_compiler_modules $compiler)
+
+  module purge
+
+  local mod
+  for mod in $compiler_modules; do
+    echo "Loading module $mod"
+    module load $mod 2>&1
+    # It is ridiculously hard to check for the success of a loaded
+    # module. Module does not return error codes and piping to grep
+    # causes module to run in a subshell.
+    module list 2>&1 | grep "$mod" >& /dev/null || return 1
+  done
+
+  return 0
+}
+
+# single_build_and_test <COMPILER> <BUILD> <BUILD_TYPE>
+single_build_and_test() {
+  # Use sane var names.
+  local compiler=$1; local build=$2; local build_type=$3;
+
+  # Set up env.
+  mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type"
+  cd $ROOT_DIR/$compiler/"${build}-$build_type"
+  local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g')
+  setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
+
+  # Set up flags.
+  local compiler_warning_flags=$(get_compiler_warning_flags $compiler)
+  local compiler_exe=$(get_compiler_exe_name $compiler)
+
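+  # For hwloc builds, point --with-hwloc at the install prefix that provides hwloc-info.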
+  if [[ "$build_type" = hwloc* ]]; then
+    local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info)))
+  fi
+
+  if [[ "$build" = *Qthreads* ]]; then
+    if [[ "$build_type" = hwloc* ]]; then
+      local extra_args="$extra_args --qthreads-path=${QTHREADS_PATH}_hwloc"
+    else
+      local extra_args="$extra_args --qthreads-path=$QTHREADS_PATH"
+    fi
+  fi
+
+  if [[ "$OPT_FLAG" = "" ]]; then
+    OPT_FLAG="-O3"
+  fi
+
+  if [[ "$build_type" = *debug* ]]; then
+    local extra_args="$extra_args --debug"
+    local cxxflags="-g $compiler_warning_flags"
+    local ldflags="-g"
+  else
+    local cxxflags="$OPT_FLAG $compiler_warning_flags"
+    local ldflags="${OPT_FLAG}"
+  fi
+
+  local cxxflags="${cxxflags} ${CXX_FLAGS_EXTRA}"
+  local ldflags="${ldflags} ${LD_FLAGS_EXTRA}"
+
+  if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then
+    local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS"
+  fi
+
+  echo "  Starting job $desc"
+
+  local comment="no_comment"
+
+  if [ "$TEST_SCRIPT" = "True" ]; then
+    local rand=$(( 1 + RANDOM % 10 ))
+    sleep $rand
+
+    if [ $rand -gt 5 ]; then
+      run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
+    fi
+  else
+    run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
+    local -i build_start_time=$(date +%s)
+    run_cmd make -j 32 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
+    local -i build_end_time=$(date +%s)
+    comment="build_time=$(($build_end_time-$build_start_time))"
+
+    if [[ "$BUILD_ONLY" == False ]]; then
+      run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
+      local -i run_end_time=$(date +%s)
+      comment="$comment run_time=$(($run_end_time-$build_end_time))"
+    fi
+  fi
+
+  report_and_log_test_result 0 $desc "$comment"
+
+  return 0
+}
+
+# wait_for_jobs <NUM-JOBS>
+wait_for_jobs() {
+  local -i max_jobs=$1
+  local -i num_active_jobs=$(jobs | wc -l)
+  while [ $num_active_jobs -ge $max_jobs ]
+  do
+    sleep 1
+    num_active_jobs=$(jobs | wc -l)
+    jobs >& /dev/null
+  done
+}
+
+# run_in_background <COMPILER> <BUILD> <BUILD_TYPE>
+run_in_background() {
+  local compiler=$1
+
+  local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL
+  # Don't override command line input.
+  # if [[ "$BUILD_ONLY" == True ]]; then
+  #   num_jobs=8
+  # else
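+    # Cuda configurations are always run one at a time.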
+    if [[ "$compiler" == cuda* ]]; then
+      num_jobs=1
+    fi
+  # fi
+  wait_for_jobs $num_jobs
+
+  single_build_and_test $* &
+}
+
+# build_and_test_all <COMPILER>
+build_and_test_all() {
+  # Get compiler data.
+  local compiler=$1
+  if [ -z "$CUSTOM_BUILD_LIST" ]; then
+    local compiler_build_list=$(get_compiler_build_list $compiler)
+  else
+    local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ')
+  fi
+
+  # Do builds.
+  local build
+  for build in $compiler_build_list
+  do
+    run_in_background $compiler $build $BUILD_TYPE
+
+    # If not cuda, do a hwloc test too.
+    if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
+      run_in_background $compiler $build "hwloc-$BUILD_TYPE"
+    fi
+  done
+
+  return 0
+}
+
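+# get_test_root_dir: prune all but the newest NUM_RESULTS_TO_KEEP result
+# directories, then echo a fresh timestamped directory name for this run.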
+get_test_root_dir() {
+  local existing_results=$(find . -maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort)
+  local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l)
+  local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP}
+
+  if [ $num_to_delete -gt 0 ]; then
+    /bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete)
+  fi
+
+  echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S")
+}
+
+wait_summarize_and_exit() {
+  wait_for_jobs 1
+
+  echo "#######################################################"
+  echo "PASSED TESTS"
+  echo "#######################################################"
+
+  local passed_test
+  for passed_test in $(\ls -1 $PASSED_DIR | sort)
+  do
+    echo $passed_test $(cat $PASSED_DIR/$passed_test)
+  done
+
+  local -i rv=0
+  if [ "$(ls -A $FAILED_DIR)" ]; then
+    echo "#######################################################"
+    echo "FAILED TESTS"
+    echo "#######################################################"
+
+    local failed_test
+    for failed_test in $(\ls -1 $FAILED_DIR | sort)
+    do
+      echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)"
+      rv=$rv+1
+    done
+  fi
+
+  exit $rv
+}
+
+#
+# Main.
+#
+
+ROOT_DIR=$(get_test_root_dir)
+mkdir -p $ROOT_DIR
+cd $ROOT_DIR
+
+PASSED_DIR=$ROOT_DIR/results/passed
+FAILED_DIR=$ROOT_DIR/results/failed
+mkdir -p $PASSED_DIR
+mkdir -p $FAILED_DIR
+
+echo "Going to test compilers: " $COMPILERS_TO_TEST
+for COMPILER in $COMPILERS_TO_TEST; do
+  echo "Testing compiler $COMPILER"
+  build_and_test_all $COMPILER
+done
+
+wait_summarize_and_exit
diff --git a/packages/kokkos/containers/CMakeLists.txt b/packages/kokkos/containers/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c37aa3e3e21e22ef1a1e8160d885d91884d9a62d
--- /dev/null
+++ b/packages/kokkos/containers/CMakeLists.txt
@@ -0,0 +1,13 @@
+
+
+TRIBITS_SUBPACKAGE(Containers)
+
+
+IF(KOKKOS_HAS_TRILINOS)
+  ADD_SUBDIRECTORY(src)
+ENDIF()
+
+TRIBITS_ADD_TEST_DIRECTORIES(unit_tests)
+TRIBITS_ADD_TEST_DIRECTORIES(performance_tests)
+
+TRIBITS_SUBPACKAGE_POSTPROCESS()
diff --git a/packages/kokkos/containers/cmake/Dependencies.cmake b/packages/kokkos/containers/cmake/Dependencies.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..1d71d8af341181f689a6a8bf63036b67584cb138
--- /dev/null
+++ b/packages/kokkos/containers/cmake/Dependencies.cmake
@@ -0,0 +1,5 @@
+TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
+  LIB_REQUIRED_PACKAGES KokkosCore
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
+  TEST_OPTIONAL_TPLS CUSPARSE
+  )
diff --git a/packages/kokkos/containers/cmake/KokkosContainers_config.h.in b/packages/kokkos/containers/cmake/KokkosContainers_config.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..d91fdda1e353eddb2088ff86327e142676c9a6c9
--- /dev/null
+++ b/packages/kokkos/containers/cmake/KokkosContainers_config.h.in
@@ -0,0 +1,4 @@
+#ifndef KOKKOS_CONTAINERS_CONFIG_H
+#define KOKKOS_CONTAINERS_CONFIG_H
+
+#endif
diff --git a/packages/kokkos/containers/performance_tests/CMakeLists.txt b/packages/kokkos/containers/performance_tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1203a8bd81eab6a6e50f53c2eb8920d8e3c6159b
--- /dev/null
+++ b/packages/kokkos/containers/performance_tests/CMakeLists.txt
@@ -0,0 +1,45 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
+
+IF(NOT KOKKOS_HAS_TRILINOS)
+  IF(KOKKOS_SEPARATE_LIBS)
+    set(TEST_LINK_TARGETS kokkoscore)
+  ELSE()
+    set(TEST_LINK_TARGETS kokkos)
+  ENDIF()
+ENDIF()
+
+SET(SOURCES
+  TestMain.cpp 
+  TestCuda.cpp
+  )
+
+IF(Kokkos_ENABLE_Pthread)
+  LIST( APPEND SOURCES TestThreads.cpp)
+ENDIF()
+
+IF(Kokkos_ENABLE_OpenMP)
+  LIST( APPEND SOURCES TestOpenMP.cpp)
+ENDIF()
+
+# Per #374, we always want to build this test, but we only want to run
+# it as a PERFORMANCE test.  That's why we separate building the test
+# from running the test.
+
+TRIBITS_ADD_EXECUTABLE(
+  PerfTestExec
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+  )
+
+TRIBITS_ADD_TEST(
+  PerformanceTest
+  NAME PerfTestExec
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  CATEGORIES PERFORMANCE
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  )
diff --git a/packages/kokkos/containers/performance_tests/Makefile b/packages/kokkos/containers/performance_tests/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..ebed75ccd611f73899d9285e2243d6f574bf37a1
--- /dev/null
+++ b/packages/kokkos/containers/performance_tests/Makefile
@@ -0,0 +1,89 @@
+KOKKOS_PATH = ../..
+
+GTEST_PATH = ../../TPL/gtest
+
+vpath %.cpp ${KOKKOS_PATH}/containers/performance_tests
+
+default: build_all
+	echo "End Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
+else
+  CXX = g++
+endif
+
+CXXFLAGS = -O3
+LINK ?= $(CXX)
+LDFLAGS ?=
+override LDFLAGS += -lpthread
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/performance_tests
+
+TEST_TARGETS =
+TARGETS =
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	OBJ_CUDA = TestCuda.o TestMain.o gtest-all.o
+	TARGETS += KokkosContainers_PerformanceTest_Cuda
+	TEST_TARGETS += test-cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+	OBJ_ROCM = TestROCm.o TestMain.o gtest-all.o
+	TARGETS += KokkosContainers_PerformanceTest_ROCm
+	TEST_TARGETS += test-rocm
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	OBJ_THREADS = TestThreads.o TestMain.o gtest-all.o
+	TARGETS += KokkosContainers_PerformanceTest_Threads
+	TEST_TARGETS += test-threads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	OBJ_OPENMP = TestOpenMP.o TestMain.o gtest-all.o
+	TARGETS += KokkosContainers_PerformanceTest_OpenMP
+	TEST_TARGETS += test-openmp
+endif
+
+KokkosContainers_PerformanceTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Cuda
+
+KokkosContainers_PerformanceTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_ROCm
+
+KokkosContainers_PerformanceTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Threads
+
+KokkosContainers_PerformanceTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_OpenMP
+
+test-cuda: KokkosContainers_PerformanceTest_Cuda
+	./KokkosContainers_PerformanceTest_Cuda
+
+test-rocm: KokkosContainers_PerformanceTest_ROCm
+	./KokkosContainers_PerformanceTest_ROCm
+
+test-threads: KokkosContainers_PerformanceTest_Threads
+	./KokkosContainers_PerformanceTest_Threads
+
+test-openmp: KokkosContainers_PerformanceTest_OpenMP
+	./KokkosContainers_PerformanceTest_OpenMP
+
+build_all: $(TARGETS)
+
+test: $(TEST_TARGETS)
+
+clean: kokkos-clean
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
+gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
diff --git a/packages/kokkos/containers/performance_tests/TestCuda.cpp b/packages/kokkos/containers/performance_tests/TestCuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..682f3f52f70a6d7c37ef1006267d3bddffc9cd70
--- /dev/null
+++ b/packages/kokkos/containers/performance_tests/TestCuda.cpp
@@ -0,0 +1,111 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_CUDA )
+
+#include <cstdint>
+#include <string>
+#include <iostream>
+#include <iomanip>
+#include <sstream>
+#include <fstream>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <TestDynRankView.hpp>
+
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <TestGlobal2LocalIds.hpp>
+
+#include <TestUnorderedMapPerformance.hpp>
+
+namespace Performance {
+
+class cuda : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+    Kokkos::HostSpace::execution_space::initialize();
+    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
+  }
+  static void TearDownTestCase()
+  {
+    Kokkos::Cuda::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+};
+
+TEST_F( cuda, dynrankview_perf )
+{
+  std::cout << "Cuda" << std::endl;
+  std::cout << " DynRankView vs View: Initialization Only " << std::endl;
+  test_dynrankview_op_perf<Kokkos::Cuda>( 40960 );
+}
+
+TEST_F( cuda, global_2_local)
+{
+  std::cout << "Cuda" << std::endl;
+  std::cout << "size, create, generate, fill, find" << std::endl;
+  for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step)
+    test_global_to_local_ids<Kokkos::Cuda>(i);
+}
+
+TEST_F( cuda, unordered_map_performance_near)
+{
+  Perf::run_performance_tests<Kokkos::Cuda,true>("cuda-near");
+}
+
+TEST_F( cuda, unordered_map_performance_far)
+{
+  Perf::run_performance_tests<Kokkos::Cuda,false>("cuda-far");
+}
+
+} // namespace Performance
+#else
+void KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTCUDA_PREVENT_EMPTY_LINK_ERROR() {}
+#endif  /* #if defined( KOKKOS_ENABLE_CUDA ) */
diff --git a/packages/kokkos/containers/performance_tests/TestDynRankView.hpp b/packages/kokkos/containers/performance_tests/TestDynRankView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0d2fae32a38d7d0ffe11e2790d73d45216da98de
--- /dev/null
+++ b/packages/kokkos/containers/performance_tests/TestDynRankView.hpp
@@ -0,0 +1,266 @@
+
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_DYNRANKVIEW_HPP
+#define KOKKOS_TEST_DYNRANKVIEW_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DynRankView.hpp>
+#include <vector>
+
+#include <impl/Kokkos_Timer.hpp>
+
+// Compare performance of DynRankView to View, specific focus on the parenthesis operators
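+// Each functor below initializes a rank-3 view (or sums over its trailing
+// dimensions); timings for View, strided View, rank-7 View, and DynRankView
+// are printed and reported as ratios at the end of test_dynrankview_op_perf.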
+
+namespace Performance {
+
+//View functor
+template <typename DeviceType>
+struct InitViewFunctor {
+  typedef Kokkos::View<double***, DeviceType> inviewtype;
+  inviewtype _inview;
+
+  InitViewFunctor( inviewtype &inview_ ) : _inview(inview_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    for (unsigned j = 0; j < _inview.extent(1); ++j) {
+      for (unsigned k = 0; k < _inview.extent(2); ++k) {
+        _inview(i,j,k) = i/2 -j*j + k/3;
+      }
+    }
+  }
+
+  struct SumComputationTest
+  {
+    typedef Kokkos::View<double***, DeviceType> inviewtype;
+    inviewtype _inview;
+
+    typedef Kokkos::View<double*, DeviceType> outviewtype;
+    outviewtype _outview;
+
+    KOKKOS_INLINE_FUNCTION
+    SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const int i) const {
+      for (unsigned j = 0; j < _inview.extent(1); ++j) {
+        for (unsigned k = 0; k < _inview.extent(2); ++k) {
+          _outview(i) += _inview(i,j,k) ;
+        }
+      }
+    }
+  };
+
+};
+
+template <typename DeviceType>
+struct InitStrideViewFunctor {
+  typedef Kokkos::View<double***, Kokkos::LayoutStride, DeviceType> inviewtype;
+  inviewtype _inview;
+
+  InitStrideViewFunctor( inviewtype &inview_ ) : _inview(inview_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    for (unsigned j = 0; j < _inview.extent(1); ++j) {
+      for (unsigned k = 0; k < _inview.extent(2); ++k) {
+        _inview(i,j,k) = i/2 -j*j + k/3;
+      }
+    }
+  }
+
+};
+
+template <typename DeviceType>
+struct InitViewRank7Functor {
+  typedef Kokkos::View<double*******, DeviceType> inviewtype;
+  inviewtype _inview;
+
+  InitViewRank7Functor( inviewtype &inview_ ) : _inview(inview_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    for (unsigned j = 0; j < _inview.extent(1); ++j) {
+      for (unsigned k = 0; k < _inview.extent(2); ++k) {
+        _inview(i,j,k,0,0,0,0) = i/2 -j*j + k/3;
+      }
+    }
+  }
+
+};
+
+//DynRankView functor
+template <typename DeviceType>
+struct InitDynRankViewFunctor {
+  typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
+  inviewtype _inview;
+
+  InitDynRankViewFunctor( inviewtype &inview_ ) : _inview(inview_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    for (unsigned j = 0; j < _inview.extent(1); ++j) {
+      for (unsigned k = 0; k < _inview.extent(2); ++k) {
+        _inview(i,j,k) = i/2 -j*j + k/3;
+      }
+    }
+  }
+
+  struct SumComputationTest
+  {
+    typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
+    inviewtype _inview;
+
+    typedef Kokkos::DynRankView<double, DeviceType> outviewtype;
+    outviewtype _outview;
+
+    KOKKOS_INLINE_FUNCTION
+    SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const int i) const {
+      for (unsigned j = 0; j < _inview.extent(1); ++j) {
+        for (unsigned k = 0; k < _inview.extent(2); ++k) {
+          _outview(i) += _inview(i,j,k) ;
+        }
+      }
+    }
+  };
+
+};
+
+
+template <typename DeviceType>
+void test_dynrankview_op_perf( const int par_size )
+{
+
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
+  const size_type dim_2 = 90;
+  const size_type dim_3 = 30;
+
+  double elapsed_time_view = 0;
+  double elapsed_time_compview = 0;
+  double elapsed_time_strideview = 0;
+  double elapsed_time_view_rank7 = 0;
+  double elapsed_time_drview = 0;
+  double elapsed_time_compdrview = 0;
+  Kokkos::Timer timer;
+  {
+    Kokkos::View<double***,DeviceType> testview("testview",par_size,dim_2,dim_3);
+    typedef InitViewFunctor<DeviceType> FunctorType;
+
+    timer.reset();
+    Kokkos::RangePolicy<DeviceType> policy(0,par_size);
+    Kokkos::parallel_for( policy , FunctorType(testview) );
+    DeviceType::fence();
+    elapsed_time_view = timer.seconds();
+    std::cout << " View time (init only): " << elapsed_time_view << std::endl;
+
+
+    timer.reset();
+    Kokkos::View<double*,DeviceType> sumview("sumview",par_size);
+    Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) );
+    DeviceType::fence();
+    elapsed_time_compview = timer.seconds();
+    std::cout << " View sum computation time: " << elapsed_time_view << std::endl;
+
+
+    Kokkos::View<double***,Kokkos::LayoutStride, DeviceType> teststrideview = Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL,Kokkos::ALL);
+    typedef InitStrideViewFunctor<DeviceType> FunctorStrideType;
+
+    timer.reset();
+    Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) );
+    DeviceType::fence();
+    elapsed_time_strideview = timer.seconds();
+    std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl;
+  }
+  {
+    Kokkos::View<double*******,DeviceType> testview("testview",par_size,dim_2,dim_3,1,1,1,1);
+    typedef InitViewRank7Functor<DeviceType> FunctorType;
+
+    timer.reset();
+    Kokkos::RangePolicy<DeviceType> policy(0,par_size);
+    Kokkos::parallel_for( policy , FunctorType(testview) );
+    DeviceType::fence();
+    elapsed_time_view_rank7 = timer.seconds();
+    std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl;
+  }
+  {
+    Kokkos::DynRankView<double,DeviceType> testdrview("testdrview",par_size,dim_2,dim_3);
+    typedef InitDynRankViewFunctor<DeviceType> FunctorType;
+
+    timer.reset();
+    Kokkos::RangePolicy<DeviceType> policy(0,par_size);
+    Kokkos::parallel_for( policy , FunctorType(testdrview) );
+    DeviceType::fence();
+    elapsed_time_drview = timer.seconds();
+    std::cout << " DynRankView time (init only): " << elapsed_time_drview << std::endl;
+
+    timer.reset();
+    Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size);
+    Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) );
+    DeviceType::fence();
+    elapsed_time_compdrview = timer.seconds();
+    std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl;
+
+  }
+
+  std::cout << " Ratio of View to DynRankView time: " << elapsed_time_view / elapsed_time_drview << std::endl; //expect < 1
+  std::cout << " Ratio of View to DynRankView sum computation time: " << elapsed_time_compview / elapsed_time_compdrview << std::endl; //expect < 1
+  std::cout << " Ratio of View to View Rank7  time: " << elapsed_time_view / elapsed_time_view_rank7 << std::endl; //expect < 1
+  std::cout << " Ratio of StrideView to DynRankView time: " << elapsed_time_strideview / elapsed_time_drview << std::endl; //expect < 1
+  std::cout << " Ratio of DynRankView to View Rank7  time: " << elapsed_time_drview / elapsed_time_view_rank7 << std::endl; //expect ?
+
+  timer.reset();
+
+} //end test_dynrankview
+
+
+} //end Performance
+#endif
+
diff --git a/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp b/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..dcaca776be59b61b56ad82e78d6f715263e6082f
--- /dev/null
+++ b/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
@@ -0,0 +1,231 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_GLOBAL_TO_LOCAL_IDS_HPP
+#define KOKKOS_TEST_GLOBAL_TO_LOCAL_IDS_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+#include <vector>
+#include <algorithm>
+
+#include <impl/Kokkos_Timer.hpp>
+
+// This test will simulate global ids
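+// Flow: generate a pseudo-random global id per local index, insert the
+// global->local pairs into an UnorderedMap, then verify that find() recovers
+// every local id; create/generate/fill/find times are printed as CSV columns.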
+
+namespace Performance {
+
+static const unsigned begin_id_size = 256u;
+static const unsigned end_id_size = 1u << 22;
+static const unsigned id_step = 2u;
+
+union helper
+{
+  uint32_t word;
+  uint8_t byte[4];
+};
+
+
+template <typename Device>
+struct generate_ids
+{
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
+
+  local_id_view local_2_global;
+
+  generate_ids( local_id_view & ids)
+    : local_2_global(ids)
+  {
+    Kokkos::parallel_for(local_2_global.extent(0), *this);
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i) const
+  {
+
+    helper x = {static_cast<uint32_t>(i)};
+
+    // shuffle the bytes of i to create a unique, semi-random global_id
+    x.word = ~x.word;
+
+    uint8_t tmp = x.byte[3];
+    x.byte[3] = x.byte[1];
+    x.byte[1] = tmp;
+
+    tmp = x.byte[2];
+    x.byte[2] = x.byte[0];
+    x.byte[0] = tmp;
+
+    local_2_global[i] = x.word;
+  }
+
+};
+
+template <typename Device>
+struct fill_map
+{
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<const uint32_t*,execution_space, Kokkos::MemoryRandomAccess> local_id_view;
+  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;
+
+  global_id_view global_2_local;
+  local_id_view local_2_global;
+
+  fill_map( global_id_view gIds, local_id_view lIds)
+    : global_2_local(gIds) , local_2_global(lIds)
+  {
+    Kokkos::parallel_for(local_2_global.extent(0), *this);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i) const
+  {
+    global_2_local.insert( local_2_global[i], i);
+  }
+
+};
+
+template <typename Device>
+struct find_test
+{
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<const uint32_t*,execution_space, Kokkos::MemoryRandomAccess> local_id_view;
+  typedef Kokkos::UnorderedMap<const uint32_t, const size_type,execution_space> global_id_view;
+
+  global_id_view global_2_local;
+  local_id_view local_2_global;
+
+  typedef size_t value_type;
+
+  find_test( global_id_view gIds, local_id_view lIds, value_type & num_errors)
+    : global_2_local(gIds) , local_2_global(lIds)
+  {
+    Kokkos::parallel_reduce(local_2_global.extent(0), *this, num_errors);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type & v) const
+  { v = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type & dst, volatile value_type const & src) const
+  { dst += src; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i, value_type & num_errors) const
+  {
+    uint32_t index = global_2_local.find( local_2_global[i] );
+
+    if ( global_2_local.value_at(index) != i) ++num_errors;
+  }
+
+};
+
+template <typename Device>
+void test_global_to_local_ids(unsigned num_ids)
+{
+
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+
+  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
+  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;
+
+  //size
+  std::cout << num_ids << ", ";
+
+  double elapsed_time = 0;
+  Kokkos::Timer timer;
+
+  local_id_view local_2_global("local_ids", num_ids);
+  global_id_view global_2_local((3u*num_ids)/2u);
+
+  //create
+  elapsed_time = timer.seconds();
+  std::cout << elapsed_time << ", ";
+  timer.reset();
+
+  // generate unique ids
+  {
+    generate_ids<Device> gen(local_2_global);
+  }
+  Device::fence();
+  // generate
+  elapsed_time = timer.seconds();
+  std::cout << elapsed_time << ", ";
+  timer.reset();
+
+  {
+    fill_map<Device> fill(global_2_local, local_2_global);
+  }
+  Device::fence();
+
+  // fill
+  elapsed_time = timer.seconds();
+  std::cout << elapsed_time << ", ";
+  timer.reset();
+
+
+  size_t num_errors = 0;
+  for (int i=0; i<100; ++i)
+  {
+    find_test<Device> find(global_2_local, local_2_global,num_errors);
+  }
+  Device::fence();
+
+  // find
+  elapsed_time = timer.seconds();
+  std::cout << elapsed_time << std::endl;
+
+  ASSERT_EQ( num_errors, 0u);
+}
+
+
+} // namespace Performance
+
+
+#endif //KOKKOS_TEST_GLOBAL_TO_LOCAL_IDS_HPP
+
diff --git a/packages/kokkos/containers/performance_tests/TestMain.cpp b/packages/kokkos/containers/performance_tests/TestMain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..217b01a57afccf1bc7658ce41214d77fbffffd67
--- /dev/null
+++ b/packages/kokkos/containers/performance_tests/TestMain.cpp
@@ -0,0 +1,53 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+#include <cstdlib>
+
+#include <Kokkos_Macros.hpp>
+
+int main(int argc, char *argv[]) {
+  ::testing::InitGoogleTest(&argc,argv);
+  return RUN_ALL_TESTS();
+}
+
diff --git a/packages/kokkos/containers/performance_tests/TestOpenMP.cpp b/packages/kokkos/containers/performance_tests/TestOpenMP.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..66d497552e3de2dc09cd409bed204c35bc0ddb68
--- /dev/null
+++ b/packages/kokkos/containers/performance_tests/TestOpenMP.cpp
@@ -0,0 +1,142 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_OPENMP )
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <TestGlobal2LocalIds.hpp>
+#include <TestUnorderedMapPerformance.hpp>
+
+#include <TestDynRankView.hpp>
+#include <TestScatterView.hpp>
+
+#include <iomanip>
+#include <sstream>
+#include <string>
+#include <fstream>
+
+
+namespace Performance {
+
+class openmp : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+
+    Kokkos::OpenMP::initialize();
+    Kokkos::OpenMP::print_configuration( std::cout );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::OpenMP::finalize();
+  }
+};
+
+TEST_F( openmp, dynrankview_perf )
+{
+  std::cout << "OpenMP" << std::endl;
+  std::cout << " DynRankView vs View: Initialization Only " << std::endl;
+  test_dynrankview_op_perf<Kokkos::OpenMP>( 8192 );
+}
+
+TEST_F( openmp, global_2_local)
+{
+  std::cout << "OpenMP" << std::endl;
+  std::cout << "size, create, generate, fill, find" << std::endl;
+  for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step)
+    test_global_to_local_ids<Kokkos::OpenMP>(i);
+}
+
+TEST_F( openmp, unordered_map_performance_near)
+{
+  unsigned num_openmp = 4;
+  if (Kokkos::hwloc::available()) {
+    num_openmp = Kokkos::hwloc::get_available_numa_count() *
+                  Kokkos::hwloc::get_available_cores_per_numa() *
+                  Kokkos::hwloc::get_available_threads_per_core();
+
+  }
+  std::ostringstream base_file_name;
+  base_file_name << "openmp-" << num_openmp << "-near";
+  Perf::run_performance_tests<Kokkos::OpenMP,true>(base_file_name.str());
+}
+
+TEST_F( openmp, unordered_map_performance_far)
+{
+  unsigned num_openmp = 4;
+  if (Kokkos::hwloc::available()) {
+    num_openmp = Kokkos::hwloc::get_available_numa_count() *
+                  Kokkos::hwloc::get_available_cores_per_numa() *
+                  Kokkos::hwloc::get_available_threads_per_core();
+
+  }
+  std::ostringstream base_file_name;
+  base_file_name << "openmp-" << num_openmp << "-far";
+  Perf::run_performance_tests<Kokkos::OpenMP,false>(base_file_name.str());
+}
+
+TEST_F( openmp, scatter_view)
+{
+  std::cout << "ScatterView data-duplicated test:\n";
+  Perf::test_scatter_view<Kokkos::OpenMP, Kokkos::LayoutRight,
+    Kokkos::Experimental::ScatterDuplicated,
+    Kokkos::Experimental::ScatterNonAtomic>(10, 1000 * 1000);
+//std::cout << "ScatterView atomics test:\n";
+//Perf::test_scatter_view<Kokkos::OpenMP, Kokkos::LayoutRight,
+//  Kokkos::Experimental::ScatterNonDuplicated,
+//  Kokkos::Experimental::ScatterAtomic>(10, 1000 * 1000);
+}
+
+} // namespace Performance
+#else
+void KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTOPENMP_PREVENT_EMPTY_LINK_ERROR() {}
+#endif
+
diff --git a/packages/kokkos/containers/performance_tests/TestROCm.cpp b/packages/kokkos/containers/performance_tests/TestROCm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3cf9f3bd143c5848af12e7ddb717ab8e4d59ce8e
--- /dev/null
+++ b/packages/kokkos/containers/performance_tests/TestROCm.cpp
@@ -0,0 +1,113 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_ROCM )
+
+#include <cstdint>
+#include <string>
+#include <iostream>
+#include <iomanip>
+#include <sstream>
+#include <fstream>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <TestDynRankView.hpp>
+
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <TestGlobal2LocalIds.hpp>
+
+#include <TestUnorderedMapPerformance.hpp>
+
+namespace Performance {
+
+class rocm : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+    Kokkos::HostSpace::execution_space::initialize();
+    Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
+  }
+  static void TearDownTestCase()
+  {
+    Kokkos::Experimental::ROCm::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+};
+#if 0
+// issue 1089
+TEST_F( rocm, dynrankview_perf )
+{
+  std::cout << "ROCm" << std::endl;
+  std::cout << " DynRankView vs View: Initialization Only " << std::endl;
+  test_dynrankview_op_perf<Kokkos::Experimental::ROCm>( 40960 );
+}
+
+TEST_F( rocm, global_2_local)
+{
+  std::cout << "ROCm" << std::endl;
+  std::cout << "size, create, generate, fill, find" << std::endl;
+  for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step)
+    test_global_to_local_ids<Kokkos::Experimental::ROCm>(i);
+}
+
+#endif
+TEST_F( rocm, unordered_map_performance_near)
+{
+  Perf::run_performance_tests<Kokkos::Experimental::ROCm,true>("rocm-near");
+}
+
+TEST_F( rocm, unordered_map_performance_far)
+{
+  Perf::run_performance_tests<Kokkos::Experimental::ROCm,false>("rocm-far");
+}
+
+} // namespace Performance
+#else
+void KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTROCM_PREVENT_EMPTY_LINK_ERROR() {}
+#endif  /* #if defined( KOKKOS_ENABLE_ROCM ) */
diff --git a/packages/kokkos/containers/performance_tests/TestScatterView.hpp b/packages/kokkos/containers/performance_tests/TestScatterView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..03129d2b09f17b03f34f8cb8a9b4442d9c570709
--- /dev/null
+++ b/packages/kokkos/containers/performance_tests/TestScatterView.hpp
@@ -0,0 +1,113 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_SCATTER_VIEW_HPP
+#define KOKKOS_TEST_SCATTER_VIEW_HPP
+
+#include <Kokkos_ScatterView.hpp>
+#include <impl/Kokkos_Timer.hpp>
+
+namespace Perf {
+
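+// Compare a Kokkos::Experimental::ScatterView against a hand-coded per-thread
+// duplicate view: both variants scatter the same contributions into a rank-2
+// array, and the wall-clock time of m repetitions of each is printed.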
+template <typename ExecSpace, typename Layout, int duplication, int contribution>
+void test_scatter_view(int m, int n)
+{
+  Kokkos::View<double *[3], Layout, ExecSpace> original_view("original_view", n);
+  {
+    auto scatter_view = Kokkos::Experimental::create_scatter_view
+      < Kokkos::Experimental::ScatterSum
+      , duplication
+      , contribution
+      > (original_view);
+    Kokkos::Experimental::UniqueToken<
+      ExecSpace, Kokkos::Experimental::UniqueTokenScope::Global>
+      unique_token{ExecSpace()};
+  //auto internal_view = scatter_view.internal_view;
+    auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
+    for (int foo = 0; foo < 5; ++foo) {
+    {
+      auto num_threads = unique_token.size();
+      std::cout << "num_threads " << num_threads << '\n';
+      Kokkos::View<double **[3], Layout, ExecSpace> hand_coded_duplicate_view("hand_coded_duplicate", num_threads, n);
+      auto f2 = KOKKOS_LAMBDA(int i) {
+        auto thread_id = unique_token.acquire();
+        for (int j = 0; j < 10; ++j) {
+          auto k = (i + j) % n;
+          hand_coded_duplicate_view(thread_id, k, 0) += 4.2;
+          hand_coded_duplicate_view(thread_id, k, 1) += 2.0;
+          hand_coded_duplicate_view(thread_id, k, 2) += 1.0;
+        }
+      };
+      Kokkos::Timer timer;
+      timer.reset();
+      for (int k = 0; k < m; ++k) {
+        Kokkos::parallel_for(policy, f2, "hand_coded_duplicate_scatter_view_test");
+      }
+      auto t = timer.seconds();
+      std::cout << "hand-coded test took " << t << " seconds\n";
+    }
+    {
+      auto f = KOKKOS_LAMBDA(int i) {
+        auto scatter_access = scatter_view.access();
+        for (int j = 0; j < 10; ++j) {
+          auto k = (i + j) % n;
+          scatter_access(k, 0) += 4.2;
+          scatter_access(k, 1) += 2.0;
+          scatter_access(k, 2) += 1.0;
+        }
+      };
+      Kokkos::Timer timer;
+      timer.reset();
+      for (int k = 0; k < m; ++k) {
+        Kokkos::parallel_for(policy, f, "scatter_view_test");
+      }
+      auto t = timer.seconds();
+      std::cout << "test took " << t << " seconds\n";
+    }
+  }
+  }
+}
+
+} // namespace Perf
+
+#endif
diff --git a/packages/kokkos/containers/performance_tests/TestThreads.cpp b/packages/kokkos/containers/performance_tests/TestThreads.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a951a5ca56e4f30fcbe39d0ae797f6bd0d126882
--- /dev/null
+++ b/packages/kokkos/containers/performance_tests/TestThreads.cpp
@@ -0,0 +1,141 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_THREADS )
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <iomanip>
+
+#include <TestGlobal2LocalIds.hpp>
+#include <TestUnorderedMapPerformance.hpp>
+
+#include <TestDynRankView.hpp>
+
+#include <iomanip>
+#include <sstream>
+#include <string>
+#include <fstream>
+
+namespace Performance {
+
+class threads : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+
+    unsigned num_threads = 4;
+
+    if (Kokkos::hwloc::available()) {
+      num_threads = Kokkos::hwloc::get_available_numa_count() *
+                    Kokkos::hwloc::get_available_cores_per_numa() *
+                    Kokkos::hwloc::get_available_threads_per_core();
+
+    }
+
+    std::cout << "Threads: " << num_threads << std::endl;
+
+    Kokkos::Threads::initialize( num_threads );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::Threads::finalize();
+  }
+};
+
+TEST_F( threads, dynrankview_perf )
+{
+  std::cout << "Threads" << std::endl;
+  std::cout << " DynRankView vs View: Initialization Only " << std::endl;
+  test_dynrankview_op_perf<Kokkos::Threads>( 8192 );
+}
+
+TEST_F( threads, global_2_local)
+{
+  std::cout << "Threads" << std::endl;
+  std::cout << "size, create, generate, fill, find" << std::endl;
+  for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step)
+    test_global_to_local_ids<Kokkos::Threads>(i);
+}
+
+TEST_F( threads, unordered_map_performance_near)
+{
+  unsigned num_threads = 4;
+  if (Kokkos::hwloc::available()) {
+    num_threads = Kokkos::hwloc::get_available_numa_count() *
+                  Kokkos::hwloc::get_available_cores_per_numa() *
+                  Kokkos::hwloc::get_available_threads_per_core();
+
+  }
+  std::ostringstream base_file_name;
+  base_file_name << "threads-" << num_threads << "-near";
+  Perf::run_performance_tests<Kokkos::Threads,true>(base_file_name.str());
+}
+
+TEST_F( threads, unordered_map_performance_far)
+{
+  unsigned num_threads = 4;
+  if (Kokkos::hwloc::available()) {
+    num_threads = Kokkos::hwloc::get_available_numa_count() *
+                  Kokkos::hwloc::get_available_cores_per_numa() *
+                  Kokkos::hwloc::get_available_threads_per_core();
+
+  }
+  std::ostringstream base_file_name;
+  base_file_name << "threads-" << num_threads << "-far";
+  Perf::run_performance_tests<Kokkos::Threads,false>(base_file_name.str());
+}
+
+} // namespace Performance
+
+#else
+void KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTTHREADS_PREVENT_EMPTY_LINK_ERROR() {}
+#endif
+
diff --git a/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp b/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e8734b259de911489e97ccce7bcfce6e11f3b387
--- /dev/null
+++ b/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
@@ -0,0 +1,260 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
+#define KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
+
+#include <impl/Kokkos_Timer.hpp>
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <string>
+#include <sstream>
+
+
+namespace Perf {
+
+template <typename Device, bool Near>
+struct UnorderedMapTest
+{
+  typedef Device execution_space;
+  typedef Kokkos::UnorderedMap<uint32_t, uint32_t, execution_space> map_type;
+  typedef typename map_type::histogram_type histogram_type;
+
+  struct value_type {
+    uint32_t failed_count;
+    uint32_t max_list;
+  };
+
+  uint32_t capacity;
+  uint32_t inserts;
+  uint32_t collisions;
+  double   seconds;
+  map_type map;
+  histogram_type histogram;
+
+  UnorderedMapTest( uint32_t arg_capacity, uint32_t arg_inserts, uint32_t arg_collisions)
+    : capacity(arg_capacity)
+    , inserts(arg_inserts)
+    , collisions(arg_collisions)
+    , seconds(0)
+    , map(capacity)
+    , histogram(map.get_histogram())
+  {
+    Kokkos::Timer wall_clock ;
+    wall_clock.reset();
+
+    value_type v = {};
+    int loop_count = 0;
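+    // Retry loop: attempt all inserts; whenever some fail, grow the capacity by
+    // roughly 15% plus a failure/collision-dependent slack, rehash, and retry.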
+    do {
+      ++loop_count;
+
+      v = value_type();
+      Kokkos::parallel_reduce(inserts, *this, v);
+
+      if (v.failed_count > 0u) {
+        const uint32_t new_capacity = map.capacity() + ((map.capacity()*3ull)/20u) + v.failed_count/collisions ;
+        map.rehash( new_capacity );
+      }
+    } while (v.failed_count > 0u);
+
+    seconds = wall_clock.seconds();
+
+    switch (loop_count)
+    {
+    case 1u: std::cout << " \033[0;32m" << loop_count << "\033[0m "; break;
+    case 2u: std::cout << " \033[1;31m" << loop_count << "\033[0m "; break;
+    default: std::cout << " \033[0;31m" << loop_count << "\033[0m "; break;
+    }
+    std::cout << std::setprecision(2) << std::fixed << std::setw(5) << (1e9*(seconds/(inserts))) << "; " << std::flush;
+
+    histogram.calculate();
+    Device::fence();
+  }
+
+  void print(std::ostream & metrics_out, std::ostream & length_out, std::ostream & distance_out, std::ostream & block_distance_out)
+  {
+    metrics_out << map.capacity() << " , ";
+    metrics_out << inserts/collisions << " , ";
+    metrics_out << (100.0 * inserts/collisions) / map.capacity() << " , ";
+    metrics_out << inserts << " , ";
+    metrics_out << (map.failed_insert() ? "true" : "false") << " , ";
+    metrics_out << collisions << " , ";
+    metrics_out << 1e9*(seconds/inserts) << " , ";
+    metrics_out << seconds << std::endl;
+
+    length_out << map.capacity() << " , ";
+    length_out << ((100.0 *inserts/collisions) / map.capacity()) << " , ";
+    length_out << collisions << " , ";
+    histogram.print_length(length_out);
+
+    distance_out << map.capacity() << " , ";
+    distance_out << ((100.0 *inserts/collisions) / map.capacity()) << " , ";
+    distance_out << collisions << " , ";
+    histogram.print_distance(distance_out);
+
+    block_distance_out << map.capacity() << " , ";
+    block_distance_out << ((100.0 *inserts/collisions) / map.capacity()) << " , ";
+    block_distance_out << collisions << " , ";
+    histogram.print_block_distance(block_distance_out);
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const
+  {
+    v.failed_count = 0;
+    v.max_list = 0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, const volatile value_type & src ) const
+  {
+    dst.failed_count += src.failed_count;
+    dst.max_list = src.max_list < dst.max_list ? dst.max_list : src.max_list;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(uint32_t i, value_type & v) const
+  {
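+    // "Near" packs duplicate keys contiguously (i/collisions), while "Far"
+    // spreads duplicates of each key across the range (i % (inserts/collisions)).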
+    const uint32_t key = Near ? i/collisions : i%(inserts/collisions);
+    typename map_type::insert_result result = map.insert(key,i);
+    v.failed_count += !result.failed() ? 0 : 1;
+    v.max_list = result.list_position() < v.max_list ? v.max_list : result.list_position();
+  }
+
+};
+
+template <typename Device, bool Near>
+void run_performance_tests(std::string const & base_file_name)
+{
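+  // When enabled, this sweeps map capacities from 2^14 up to 2^24 (doubling each
+  // step) for every (fill ratio, collision count) pair, writing metrics plus
+  // list-length and probe-distance histograms to CSV files.  The body below is
+  // currently compiled out via the "#if 0".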
+#if 0
+  std::string metrics_file_name = base_file_name + std::string("-metrics.csv");
+  std::string length_file_name = base_file_name  + std::string("-length.csv");
+  std::string distance_file_name = base_file_name + std::string("-distance.csv");
+  std::string block_distance_file_name = base_file_name + std::string("-block_distance.csv");
+
+  std::ofstream metrics_out( metrics_file_name.c_str(), std::ofstream::out );
+  std::ofstream length_out( length_file_name.c_str(), std::ofstream::out );
+  std::ofstream distance_out( distance_file_name.c_str(), std::ofstream::out );
+  std::ofstream block_distance_out( block_distance_file_name.c_str(), std::ofstream::out );
+
+
+  /*
+  const double test_ratios[] = {
+     0.50
+   , 0.75
+   , 0.80
+   , 0.85
+   , 0.90
+   , 0.95
+   , 1.00
+   , 1.25
+   , 2.00
+  };
+  */
+
+  const double test_ratios[] = { 1.00 };
+
+  const int num_ratios = sizeof(test_ratios) / sizeof(double);
+
+  /*
+  const uint32_t collisions[] {
+      1
+    , 4
+    , 16
+    , 64
+  };
+  */
+
+  const uint32_t collisions[] = { 16 };
+
+  const int num_collisions = sizeof(collisions) / sizeof(uint32_t);
+
+  // set up file headers
+  metrics_out << "Capacity , Unique , Percent Full , Attempted Inserts , Failed Inserts , Collision Ratio , Nanoseconds/Inserts, Seconds" << std::endl;
+  length_out << "Capacity , Percent Full , ";
+  distance_out << "Capacity , Percent Full , ";
+  block_distance_out << "Capacity , Percent Full , ";
+
+  for (int i=0; i<100; ++i) {
+    length_out << i << " , ";
+    distance_out << i << " , ";
+    block_distance_out << i << " , ";
+  }
+
+  length_out << "\b\b\b   " << std::endl;
+  distance_out << "\b\b\b   " << std::endl;
+  block_distance_out << "\b\b\b   " << std::endl;
+
+  Kokkos::Timer wall_clock ;
+  for (int i=0;  i < num_collisions ; ++i) {
+    wall_clock.reset();
+    std::cout << "Collisions: " << collisions[i] << std::endl;
+    for (int j = 0; j < num_ratios; ++j) {
+      std::cout << std::setprecision(1) << std::fixed << std::setw(5) << (100.0*test_ratios[j]) << "%  " << std::flush;
+      for (uint32_t capacity = 1<<14; capacity < 1<<25; capacity = capacity << 1) {
+        uint32_t inserts = static_cast<uint32_t>(test_ratios[j]*(capacity));
+        std::cout << capacity << std::flush;
+        UnorderedMapTest<Device, Near> test(capacity, inserts*collisions[i], collisions[i]);
+        Device::fence();
+        test.print(metrics_out, length_out, distance_out, block_distance_out);
+      }
+      std::cout << "\b\b  " <<  std::endl;
+
+    }
+    std::cout << "  " << wall_clock.seconds() << " secs" << std::endl;
+  }
+  metrics_out.close();
+  length_out.close();
+  distance_out.close();
+  block_distance_out.close();
+#else
+  (void)base_file_name;
+  std::cout << "skipping test" << std::endl;
+#endif
+}
+
+
+} // namespace Perf
+
+#endif //KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
diff --git a/packages/kokkos/containers/src/CMakeLists.txt b/packages/kokkos/containers/src/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e68fcad5e945fce8010cd3b0a6ff62298eb77de0
--- /dev/null
+++ b/packages/kokkos/containers/src/CMakeLists.txt
@@ -0,0 +1,47 @@
+
+TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+#-----------------------------------------------------------------------------
+
+SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
+
+if(KOKKOS_LEGACY_TRIBITS)
+
+  SET(HEADERS "")
+  SET(SOURCES "")
+
+  SET(HEADERS_IMPL "")
+
+  FILE(GLOB HEADERS *.hpp)
+  FILE(GLOB HEADERS_IMPL impl/*.hpp)
+  FILE(GLOB SOURCES impl/*.cpp)
+
+  INSTALL(FILES ${HEADERS_IMPL} DESTINATION ${TRILINOS_INCDIR}/impl/)
+
+  TRIBITS_ADD_LIBRARY(
+      kokkoscontainers
+      HEADERS ${HEADERS}
+      NOINSTALLHEADERS ${HEADERS_IMPL}
+      SOURCES ${SOURCES}
+      DEPLIBS
+      )
+
+else()
+
+  INSTALL (
+      DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/"
+      DESTINATION ${TRILINOS_INCDIR}
+      FILES_MATCHING PATTERN "*.hpp"
+      )
+
+  TRIBITS_ADD_LIBRARY(
+      kokkoscontainers
+      SOURCES ${KOKKOS_CONTAINERS_SRCS}
+      DEPLIBS
+      )
+
+endif()
+#-----------------------------------------------------------------------------
diff --git a/packages/kokkos/containers/src/Kokkos_Bitset.hpp b/packages/kokkos/containers/src/Kokkos_Bitset.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c48058d75d774feaf424cf196522dfceadf9e150
--- /dev/null
+++ b/packages/kokkos/containers/src/Kokkos_Bitset.hpp
@@ -0,0 +1,438 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BITSET_HPP
+#define KOKKOS_BITSET_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Functional.hpp>
+
+#include <impl/Kokkos_Bitset_impl.hpp>
+
+#include <stdexcept>
+
+namespace Kokkos {
+
+template <typename Device = Kokkos::DefaultExecutionSpace >
+class Bitset;
+
+template <typename Device = Kokkos::DefaultExecutionSpace >
+class ConstBitset;
+
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src);
+
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
+
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
+
+
+/// A thread-safe view to a bitset
+template <typename Device>
+class Bitset
+{
+public:
+  typedef Device execution_space;
+  typedef unsigned size_type;
+
+  enum { BIT_SCAN_REVERSE = 1u };
+  enum { MOVE_HINT_BACKWARD = 2u };
+
+  enum {
+      BIT_SCAN_FORWARD_MOVE_HINT_FORWARD = 0u
+    , BIT_SCAN_REVERSE_MOVE_HINT_FORWARD = BIT_SCAN_REVERSE
+    , BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD = MOVE_HINT_BACKWARD
+    , BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD = BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD
+  };
+
+private:
+  enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) };
+  enum { block_mask = block_size-1u };
+  enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };
+
+public:
+
+
+  /// constructor
+  /// arg_size := number of bits in the set
+  Bitset(unsigned arg_size = 0u)
+    : m_size(arg_size)
+    , m_last_block_mask(0u)
+    , m_blocks("Bitset", ((m_size + block_mask) >> block_shift) )
+  {
+    for (int i=0, end = static_cast<int>(m_size & block_mask); i < end; ++i) {
+      m_last_block_mask |= 1u << i;
+    }
+  }
+
+  /// assignment
+  Bitset<Device> & operator = (Bitset<Device> const & rhs)
+  {
+    this->m_size = rhs.m_size;
+    this->m_last_block_mask = rhs.m_last_block_mask;
+    this->m_blocks = rhs.m_blocks;
+
+    return *this;
+  }
+
+  /// copy constructor
+  Bitset( Bitset<Device> const & rhs)
+    : m_size( rhs.m_size )
+    , m_last_block_mask( rhs.m_last_block_mask )
+    , m_blocks( rhs.m_blocks )
+  {}
+
+  /// number of bits in the set
+  /// can be called from the host or the device
+  KOKKOS_FORCEINLINE_FUNCTION
+  unsigned size() const
+  { return m_size; }
+
+  /// number of bits which are set to 1
+  /// can only be called from the host
+  unsigned count() const
+  {
+    Impl::BitsetCount< Bitset<Device> > f(*this);
+    return f.apply();
+  }
+
+  /// set all bits to 1
+  /// can only be called from the host
+  void set()
+  {
+    Kokkos::deep_copy(m_blocks, ~0u );
+
+    if (m_last_block_mask) {
+      // clear the unused bits in the last block
+      typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy;
+      raw_deep_copy( m_blocks.data() + (m_blocks.extent(0) -1u), &m_last_block_mask, sizeof(unsigned));
+    }
+  }
+
+  /// set all bits to 0
+  /// can only be called from the host
+  void reset()
+  {
+    Kokkos::deep_copy(m_blocks, 0u );
+  }
+
+  /// set all bits to 0
+  /// can only be called from the host
+  void clear()
+  {
+    Kokkos::deep_copy(m_blocks, 0u );
+  }
+
+  /// set i'th bit to 1
+  /// can only be called from the device
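+  /// returns true only if this call changed the bit from 0 to 1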
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( unsigned i ) const
+  {
+    if ( i < m_size ) {
+      unsigned * block_ptr = &m_blocks[ i >> block_shift ];
+      const unsigned mask = 1u << static_cast<int>( i & block_mask );
+
+      return !( atomic_fetch_or( block_ptr, mask ) & mask );
+    }
+    return false;
+  }
+
+  /// set i'th bit to 0
+  /// can only be called from the device
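+  /// returns true only if this call changed the bit from 1 to 0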
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool reset( unsigned i ) const
+  {
+    if ( i < m_size ) {
+      unsigned * block_ptr = &m_blocks[ i >> block_shift ];
+      const unsigned mask = 1u << static_cast<int>( i & block_mask );
+
+      return atomic_fetch_and( block_ptr, ~mask ) & mask;
+    }
+    return false;
+  }
+
+  /// return true if the i'th bit is set to 1
+  /// can only be called from the device
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool test( unsigned i ) const
+  {
+    if ( i < m_size ) {
+      const unsigned block = volatile_load(&m_blocks[ i >> block_shift ]);
+      const unsigned mask = 1u << static_cast<int>( i & block_mask );
+      return block & mask;
+    }
+    return false;
+  }
+
+  /// used with find_any_set_near or find_any_unset_near functions
+  /// returns the max number of times those functions should be called
+  /// when searching for an available bit
+  KOKKOS_FORCEINLINE_FUNCTION
+  unsigned max_hint() const
+  {
+    return m_blocks.extent(0);
+  }
+
+  /// find a bit set to 1 near the hint
+  /// returns a pair<bool, unsigned>: if result.first is true then result.second is the bit found,
+  /// and if result.first is false then result.second is a new hint
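+  ///
+  /// Illustrative device-side sketch (not part of the original interface docs;
+  /// "bits" is a hypothetical Bitset<Device> instance):
+  /// \code
+  /// unsigned hint = 0;
+  /// for (unsigned attempt = 0; attempt < bits.max_hint(); ++attempt) {
+  ///   Kokkos::pair<bool, unsigned> r = bits.find_any_set_near(hint);
+  ///   if (r.first && bits.reset(r.second)) break;  // claimed bit r.second
+  ///   hint = r.second;  // lost the race or got a fresh hint; try again
+  /// }
+  /// \endcode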
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::pair<bool, unsigned> find_any_set_near( unsigned hint , unsigned scan_direction = BIT_SCAN_FORWARD_MOVE_HINT_FORWARD ) const
+  {
+    const unsigned block_idx = (hint >> block_shift) < m_blocks.extent(0) ? (hint >> block_shift) : 0;
+    const unsigned offset = hint & block_mask;
+    unsigned block = volatile_load(&m_blocks[ block_idx ]);
+    block = !m_last_block_mask || (block_idx < (m_blocks.extent(0)-1)) ? block : block & m_last_block_mask ;
+
+    return find_any_helper(block_idx, offset, block, scan_direction);
+  }
+
+  /// find a bit set to 0 near the hint
+  /// returns a pair<bool, unsigned>: if result.first is true then result.second is the bit found,
+  /// and if result.first is false then result.second is a new hint
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::pair<bool, unsigned> find_any_unset_near( unsigned hint , unsigned scan_direction = BIT_SCAN_FORWARD_MOVE_HINT_FORWARD ) const
+  {
+    const unsigned block_idx = hint >> block_shift;
+    const unsigned offset = hint & block_mask;
+    unsigned block = volatile_load(&m_blocks[ block_idx ]);
+    block = !m_last_block_mask || (block_idx < (m_blocks.extent(0)-1) ) ? ~block : ~block & m_last_block_mask ;
+
+    return find_any_helper(block_idx, offset, block, scan_direction);
+  }
+
+private:
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  Kokkos::pair<bool, unsigned> find_any_helper(unsigned block_idx, unsigned offset, unsigned block, unsigned scan_direction) const
+  {
+    Kokkos::pair<bool, unsigned> result( block > 0u, 0);
+
+    if (!result.first) {
+      result.second = update_hint( block_idx, offset, scan_direction );
+    }
+    else {
+      result.second = scan_block(  (block_idx << block_shift)
+                                 , offset
+                                 , block
+                                 , scan_direction
+                                );
+    }
+    return result;
+  }
+
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  unsigned scan_block(unsigned block_start, int offset, unsigned block, unsigned scan_direction ) const
+  {
+    offset = !(scan_direction & BIT_SCAN_REVERSE) ? offset : (offset + block_mask) & block_mask;
+    block = Impl::rotate_right(block, offset);
+    return ((( !(scan_direction & BIT_SCAN_REVERSE) ?
+               Impl::bit_scan_forward(block) :
+               Impl::bit_scan_reverse(block)
+             ) + offset
+            ) & block_mask
+           ) + block_start;
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  unsigned update_hint( long long block_idx, unsigned offset, unsigned scan_direction ) const
+  {
+    block_idx += scan_direction & MOVE_HINT_BACKWARD ? -1 : 1;
+    block_idx = block_idx >= 0 ? block_idx : m_blocks.extent(0) - 1;
+    block_idx = block_idx < static_cast<long long>(m_blocks.extent(0)) ? block_idx : 0;
+
+    return static_cast<unsigned>(block_idx)*block_size + offset;
+  }
+
+private:
+
+  unsigned m_size;
+  unsigned m_last_block_mask;
+  View< unsigned *, execution_space, MemoryTraits<RandomAccess> > m_blocks;
+
+private:
+  template <typename DDevice>
+  friend class Bitset;
+
+  template <typename DDevice>
+  friend class ConstBitset;
+
+  template <typename Bitset>
+  friend struct Impl::BitsetCount;
+
+  template <typename DstDevice, typename SrcDevice>
+  friend void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src);
+
+  template <typename DstDevice, typename SrcDevice>
+  friend void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
+};
+
+/// a thread-safe view to a const bitset
+/// i.e. can only test bits
+template <typename Device>
+class ConstBitset
+{
+public:
+  typedef Device execution_space;
+  typedef unsigned size_type;
+
+private:
+  enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) };
+  enum { block_mask = block_size -1u };
+  enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };
+
+public:
+  ConstBitset()
+    : m_size (0)
+  {}
+
+  ConstBitset(Bitset<Device> const& rhs)
+    : m_size(rhs.m_size)
+    , m_blocks(rhs.m_blocks)
+  {}
+
+  ConstBitset(ConstBitset<Device> const& rhs)
+    : m_size( rhs.m_size )
+    , m_blocks( rhs.m_blocks )
+  {}
+
+  ConstBitset<Device> & operator = (Bitset<Device> const & rhs)
+  {
+    this->m_size = rhs.m_size;
+    this->m_blocks = rhs.m_blocks;
+
+    return *this;
+  }
+
+  ConstBitset<Device> & operator = (ConstBitset<Device> const & rhs)
+  {
+    this->m_size = rhs.m_size;
+    this->m_blocks = rhs.m_blocks;
+
+    return *this;
+  }
+
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  unsigned size() const
+  {
+    return m_size;
+  }
+
+  unsigned count() const
+  {
+    Impl::BitsetCount< ConstBitset<Device> > f(*this);
+    return f.apply();
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool test( unsigned i ) const
+  {
+    if ( i < m_size ) {
+      const unsigned block = m_blocks[ i >> block_shift ];
+      const unsigned mask = 1u << static_cast<int>( i & block_mask );
+      return block & mask;
+    }
+    return false;
+  }
+
+private:
+
+  unsigned m_size;
+  View< const unsigned *, execution_space, MemoryTraits<RandomAccess> > m_blocks;
+
+private:
+  template <typename DDevice>
+  friend class ConstBitset;
+
+  template <typename Bitset>
+  friend struct Impl::BitsetCount;
+
+  template <typename DstDevice, typename SrcDevice>
+  friend void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
+
+  template <typename DstDevice, typename SrcDevice>
+  friend void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
+};
+
+
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src)
+{
+  if (dst.size() != src.size()) {
+    throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!");
+  }
+
+  typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > raw_deep_copy;
+  raw_deep_copy(dst.m_blocks.data(), src.m_blocks.data(), sizeof(unsigned)*src.m_blocks.extent(0));
+}
+
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src)
+{
+  if (dst.size() != src.size()) {
+    throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!");
+  }
+
+  typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > raw_deep_copy;
+  raw_deep_copy(dst.m_blocks.data(), src.m_blocks.data(), sizeof(unsigned)*src.m_blocks.extent(0));
+}
+
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src)
+{
+  if (dst.size() != src.size()) {
+    throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!");
+  }
+
+  typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > raw_deep_copy;
+  raw_deep_copy(dst.m_blocks.data(), src.m_blocks.data(), sizeof(unsigned)*src.m_blocks.extent(0));
+}
+
+} // namespace Kokkos
+
+#endif //KOKKOS_BITSET_HPP
+
diff --git a/packages/kokkos/containers/src/Kokkos_DualView.hpp b/packages/kokkos/containers/src/Kokkos_DualView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..74fe4418f870854498cd65cfe8e346d7d183e716
--- /dev/null
+++ b/packages/kokkos/containers/src/Kokkos_DualView.hpp
@@ -0,0 +1,664 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_DualView.hpp
+/// \brief Declaration and definition of Kokkos::DualView.
+///
+/// This header file declares and defines Kokkos::DualView and its
+/// related nonmember functions.
+
+#ifndef KOKKOS_DUALVIEW_HPP
+#define KOKKOS_DUALVIEW_HPP
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+namespace Kokkos {
+
+/* \class DualView
+ * \brief Container to manage mirroring a Kokkos::View that lives
+ *   in device memory with a Kokkos::View that lives in host memory.
+ *
+ * This class provides capabilities to manage data which exists in two
+ * memory spaces at the same time.  It keeps views of the same layout
+ * on two memory spaces as well as modified flags for both
+ * allocations.  Users are responsible for setting the modified flags
+ * manually if they change the data in either memory space, by calling
+ * the modify() method templated on the device where they modified the
+ * data.  Users may synchronize data by calling the sync() method,
+ * templated on the device towards which they want to synchronize
+ * (i.e., the target of the one-way copy operation).
+ *
+ * The DualView class also provides convenience methods such as
+ * realloc, resize and capacity which call the appropriate methods of
+ * the underlying Kokkos::View objects.
+ *
+ * The four template arguments are the same as those of Kokkos::View.
+ * (Please refer to that class' documentation for a detailed
+ * description.)
+ *
+ *   \tparam DataType The type of the entries stored in the container.
+ *
+ *   \tparam Layout The array's layout in memory.
+ *
+ *   \tparam Device The Kokkos Device type.  If its memory space is
+ *     not the same as the host's memory space, then DualView will
+ *     contain two separate Views: one in device memory, and one in
+ *     host memory.  Otherwise, DualView will only store one View.
+ *
+ *   \tparam MemoryTraits (optional) The user's intended memory access
+ *     behavior.  Please see the documentation of Kokkos::View for
+ *     examples.  The default suffices for most users.
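+ *
+ * A minimal workflow sketch (illustrative only; assumes a CUDA-enabled build
+ * and uses hypothetical names):
+ * \code
+ * typedef Kokkos::DualView<float*, Kokkos::LayoutRight, Kokkos::Cuda> dual_view_type;
+ * typedef Kokkos::HostSpace::execution_space host_device_type;
+ * dual_view_type a ("a", 100);
+ * a.h_view(0) = 42.0f;            // write on the host ...
+ * a.modify<host_device_type> ();  // ... and mark the host copy as modified
+ * a.sync<Kokkos::Cuda> ();        // host -> device copy, since host is newer
+ * \endcode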
+ */
+template< class DataType ,
+          class Arg1Type = void ,
+          class Arg2Type = void ,
+          class Arg3Type = void>
+class DualView : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type >
+{
+public:
+  //! \name Typedefs for device types and various Kokkos::View specializations.
+  //@{
+  typedef ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ;
+
+  //! The Kokkos Host Device type.
+  typedef typename traits::host_mirror_space host_mirror_space ;
+
+  //! The type of a Kokkos::View on the device.
+  typedef View< typename traits::data_type ,
+                Arg1Type ,
+                Arg2Type ,
+                Arg3Type > t_dev ;
+
+  /// \typedef t_host
+  /// \brief The type of a Kokkos::View host mirror of \c t_dev.
+  typedef typename t_dev::HostMirror t_host ;
+
+  //! The type of a const Kokkos::View on the device.
+  typedef View< typename traits::const_data_type ,
+                Arg1Type ,
+                Arg2Type ,
+                Arg3Type > t_dev_const ;
+
+  /// \typedef t_host_const
+  /// \brief The type of a const View host mirror of \c t_dev_const.
+  typedef typename t_dev_const::HostMirror t_host_const;
+
+  //! The type of a const, random-access View on the device.
+  typedef View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_dev_const_randomread ;
+
+  /// \typedef t_host_const_randomread
+  /// \brief The type of a const, random-access View host mirror of
+  ///   \c t_dev_const_randomread.
+  typedef typename t_dev_const_randomread::HostMirror t_host_const_randomread;
+
+  //! The type of an unmanaged View on the device.
+  typedef View< typename traits::data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                MemoryUnmanaged> t_dev_um;
+
+  //! The type of an unmanaged View host mirror of \c t_dev_um.
+  typedef View< typename t_host::data_type ,
+                typename t_host::array_layout ,
+                typename t_host::device_type ,
+                MemoryUnmanaged> t_host_um;
+
+  //! The type of a const unmanaged View on the device.
+  typedef View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                MemoryUnmanaged> t_dev_const_um;
+
+  //! The type of a const unmanaged View host mirror of \c t_dev_const_um.
+  typedef View<typename t_host::const_data_type,
+               typename t_host::array_layout,
+               typename t_host::device_type,
+               MemoryUnmanaged> t_host_const_um;
+
+  //! The type of a const, random-access, unmanaged View on the device.
+  typedef View< typename t_host::const_data_type ,
+                typename t_host::array_layout ,
+                typename t_host::device_type ,
+                Kokkos::MemoryTraits<Kokkos::Unmanaged|Kokkos::RandomAccess> > t_dev_const_randomread_um ;
+
+  /// \typedef t_host_const_randomread_um
+  /// \brief The type of a const, random-access View host mirror of
+  ///   \c t_dev_const_randomread.
+  typedef typename t_dev_const_randomread::HostMirror t_host_const_randomread_um;
+
+  //@}
+  //! \name The two View instances.
+  //@{
+
+  t_dev d_view;
+  t_host h_view;
+
+  //@}
+  //! \name Counters to keep track of changes ("modified" flags)
+  //@{
+
+  View<unsigned int,LayoutLeft,typename t_host::execution_space> modified_device;
+  View<unsigned int,LayoutLeft,typename t_host::execution_space> modified_host;
+
+  //@}
+  //! \name Constructors
+  //@{
+
+  /// \brief Empty constructor.
+  ///
+  /// Both device and host View objects are constructed using their
+  /// default constructors.  The "modified" flags are both initialized
+  /// to "unmodified."
+  DualView () :
+    modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
+    modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
+  {}
+
+  /// \brief Constructor that allocates View objects on both host and device.
+  ///
+  /// This constructor works like the analogous constructor of View.
+  /// The first argument is a string label, which is entirely for your
+  /// benefit.  (Different DualView objects may have the same label if
+  /// you like.)  The arguments that follow are the dimensions of the
+  /// View objects.  For example, if the View has three dimensions,
+  /// the first three integer arguments will be nonzero, and you may
+  /// omit the integer arguments that follow.
+  DualView (const std::string& label,
+            const size_t n0 = 0,
+            const size_t n1 = 0,
+            const size_t n2 = 0,
+            const size_t n3 = 0,
+            const size_t n4 = 0,
+            const size_t n5 = 0,
+            const size_t n6 = 0,
+            const size_t n7 = 0)
+    : d_view (label, n0, n1, n2, n3, n4, n5, n6, n7)
+    , h_view (create_mirror_view (d_view)) // without UVM, a separate host mirror; with UVM this aliases d_view
+    , modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device"))
+    , modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
+  {}
+
+  //! Copy constructor (shallow copy)
+  template<class SS, class LS, class DS, class MS>
+  DualView (const DualView<SS,LS,DS,MS>& src) :
+    d_view (src.d_view),
+    h_view (src.h_view),
+    modified_device (src.modified_device),
+    modified_host (src.modified_host)
+  {}
+
+  //! Subview constructor
+  template< class SD, class S1 , class S2 , class S3
+          , class Arg0 , class ... Args >
+  DualView( const DualView<SD,S1,S2,S3> & src
+          , const Arg0 & arg0
+          , Args ... args
+          )
+    : d_view( Kokkos::subview( src.d_view , arg0 , args ... ) )
+    , h_view( Kokkos::subview( src.h_view , arg0 , args ... ) )
+    , modified_device (src.modified_device)
+    , modified_host (src.modified_host)
+    {}
+
+  /// \brief Create DualView from existing device and host View objects.
+  ///
+  /// This constructor assumes that the device and host View objects
+  /// are synchronized.  You, the caller, are responsible for making
+  /// sure this is the case before calling this constructor.  After
+  /// this constructor returns, you may use DualView's sync() and
+  /// modify() methods to ensure synchronization of the View objects.
+  ///
+  /// \param d_view_ Device View
+  /// \param h_view_ Host View (must have type t_host = t_dev::HostMirror)
+  DualView (const t_dev& d_view_, const t_host& h_view_) :
+    d_view (d_view_),
+    h_view (h_view_),
+    modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
+    modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
+  {
+    if ( int(d_view.rank)     != int(h_view.rank) ||
+         d_view.extent(0) != h_view.extent(0) ||
+         d_view.extent(1) != h_view.extent(1) ||
+         d_view.extent(2) != h_view.extent(2) ||
+         d_view.extent(3) != h_view.extent(3) ||
+         d_view.extent(4) != h_view.extent(4) ||
+         d_view.extent(5) != h_view.extent(5) ||
+         d_view.extent(6) != h_view.extent(6) ||
+         d_view.extent(7) != h_view.extent(7) ||
+         d_view.stride_0()    != h_view.stride_0() ||
+         d_view.stride_1()    != h_view.stride_1() ||
+         d_view.stride_2()    != h_view.stride_2() ||
+         d_view.stride_3()    != h_view.stride_3() ||
+         d_view.stride_4()    != h_view.stride_4() ||
+         d_view.stride_5()    != h_view.stride_5() ||
+         d_view.stride_6()    != h_view.stride_6() ||
+         d_view.stride_7()    != h_view.stride_7() ||
+         d_view.span()        != h_view.span() ) {
+      Kokkos::Impl::throw_runtime_exception("DualView constructed with incompatible views");
+    }
+  }
+
+  //@}
+  //! \name Methods for synchronizing, marking as modified, and getting Views.
+  //@{
+
+  /// \brief Return a View on a specific device \c Device.
+  ///
+  /// Please don't be afraid of the if_c expression in the return
+  /// value's type.  That just tells the method what the return type
+  /// should be: t_dev if the \c Device template parameter matches
+  /// this DualView's device type, else t_host.
+  ///
+  /// For example, suppose you create a DualView on Cuda, like this:
+  /// \code
+  /// typedef Kokkos::DualView<float*, Kokkos::LayoutRight, Kokkos::Cuda> dual_view_type;
+  /// dual_view_type DV ("my dual view", 100);
+  /// \endcode
+  /// If you want to get the CUDA device View, do this:
+  /// \code
+  /// typename dual_view_type::t_dev cudaView = DV.view<Kokkos::Cuda> ();
+  /// \endcode
+  /// and if you want to get the host mirror of that View, do this:
+  /// \code
+  /// typedef typename Kokkos::HostSpace::execution_space host_device_type;
+  /// typename dual_view_type::t_host hostView = DV.view<host_device_type> ();
+  /// \endcode
+  template< class Device >
+  KOKKOS_INLINE_FUNCTION
+  const typename Impl::if_c<
+    std::is_same<typename t_dev::memory_space,
+                          typename Device::memory_space>::value,
+    t_dev,
+    t_host>::type& view () const
+  {
+    return Impl::if_c<
+      std::is_same<
+        typename t_dev::memory_space,
+        typename Device::memory_space>::value,
+      t_dev,
+      t_host >::select (d_view , h_view);
+  }
+
+  /// \brief Update data on device or host only if data in the other
+  ///   space has been marked as modified.
+  ///
+  /// If \c Device is the same as this DualView's device type, then
+  /// copy data from host to device.  Otherwise, copy data from device
+  /// to host.  In either case, only copy if the source of the copy
+  /// has been modified.
+  ///
+  /// This is a one-way synchronization only.  If the target of the
+  /// copy has been modified, this operation will discard those
+  /// modifications.  It will also reset both device and host modified
+  /// flags.
+  ///
+  /// \note This method doesn't know on its own whether you modified
+  ///   the data in either View.  You must manually mark modified data
+  ///   as modified, by calling the modify() method with the
+  ///   appropriate template parameter.
+  template<class Device>
+  void sync( const typename Impl::enable_if<
+        ( std::is_same< typename traits::data_type , typename traits::non_const_data_type>::value) ||
+        ( std::is_same< Device , int>::value)
+        , int >::type& = 0)
+  {
+    const unsigned int dev =
+      Impl::if_c<
+        std::is_same<
+          typename t_dev::memory_space,
+          typename Device::memory_space>::value ,
+        unsigned int,
+        unsigned int>::select (1, 0);
+
+    if (dev) { // if Device is the same as DualView's device type
+      if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
+        deep_copy (d_view, h_view);
+        modified_host() = modified_device() = 0;
+      }
+    } else { // hopefully Device is the same as DualView's host type
+      if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
+        deep_copy (h_view, d_view);
+        modified_host() = modified_device() = 0;
+      }
+    }
+    if(std::is_same<typename t_host::memory_space,typename t_dev::memory_space>::value) {
+      t_dev::execution_space::fence();
+      t_host::execution_space::fence();
+    }
+  }
+
+  template<class Device>
+  void sync ( const typename Impl::enable_if<
+      ( ! std::is_same< typename traits::data_type , typename traits::non_const_data_type>::value ) ||
+      ( std::is_same< Device , int>::value)
+      , int >::type& = 0 )
+  {
+    const unsigned int dev =
+      Impl::if_c<
+        std::is_same<
+          typename t_dev::memory_space,
+          typename Device::memory_space>::value,
+        unsigned int,
+        unsigned int>::select (1, 0);
+    if (dev) { // if Device is the same as DualView's device type
+      if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
+        Impl::throw_runtime_exception("Calling sync on a DualView with a const datatype.");
+      }
+    } else { // hopefully Device is the same as DualView's host type
+      if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
+        Impl::throw_runtime_exception("Calling sync on a DualView with a const datatype.");
+      }
+    }
+  }
+
+  template<class Device>
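+  /// \brief Return true if the \c Device copy is out of date, i.e. the other
+  ///   memory space was marked modified more recently and a sync<Device>()
+  ///   would perform a copy.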
+  bool need_sync() const
+  {
+    const unsigned int dev =
+      Impl::if_c<
+        std::is_same<
+          typename t_dev::memory_space,
+          typename Device::memory_space>::value ,
+        unsigned int,
+        unsigned int>::select (1, 0);
+
+    if (dev) { // if Device is the same as DualView's device type
+      if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
+        return true;
+      }
+    } else { // hopefully Device is the same as DualView's host type
+      if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
+        return true;
+      }
+    }
+    return false;
+  }
+  /// \brief Mark data as modified on the given device \c Device.
+  ///
+  /// If \c Device is the same as this DualView's device type, then
+  /// mark the device's data as modified.  Otherwise, mark the host's
+  /// data as modified.
+  template<class Device>
+  void modify () {
+    const unsigned int dev =
+      Impl::if_c<
+        std::is_same<
+          typename t_dev::memory_space,
+          typename Device::memory_space>::value,
+        unsigned int,
+        unsigned int>::select (1, 0);
+
+    if (dev) { // if Device is the same as DualView's device type
+      // Increment the device's modified count.
+      modified_device () = (modified_device () > modified_host () ?
+                            modified_device () : modified_host ()) + 1;
+    } else { // hopefully Device is the same as DualView's host type
+      // Increment the host's modified count.
+      modified_host () = (modified_device () > modified_host () ?
+                          modified_device () : modified_host ())  + 1;
+    }
+
+#ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
+    if (modified_host() && modified_device()) {
+      std::string msg = "Kokkos::DualView::modify ERROR: ";
+      msg += "Concurrent modification of host and device views ";
+      msg += "in DualView \"";
+      msg += d_view.label();
+      msg += "\"\n";
+      Kokkos::abort(msg.c_str());
+    }
+#endif
+  }
+
+  //@}
+  //! \name Methods for reallocating or resizing the View objects.
+  //@{
+
+  /// \brief Reallocate both View objects.
+  ///
+  /// This discards any existing contents of the objects, and resets
+  /// their modified flags.  It does <i>not</i> copy the old contents
+  /// of either View into the new View objects.
+  void realloc( const size_t n0 = 0 ,
+           const size_t n1 = 0 ,
+           const size_t n2 = 0 ,
+           const size_t n3 = 0 ,
+           const size_t n4 = 0 ,
+           const size_t n5 = 0 ,
+           const size_t n6 = 0 ,
+           const size_t n7 = 0 ) {
+    ::Kokkos::realloc(d_view,n0,n1,n2,n3,n4,n5,n6,n7);
+     h_view = create_mirror_view( d_view );
+
+     /* Reset dirty flags */
+     modified_device() = modified_host() = 0;
+  }
+
+  /// \brief Resize both views, copying old contents into new if necessary.
+  ///
+  /// This method only copies the old contents into the new View
+  /// objects for the device which was last marked as modified.
+  void resize( const size_t n0 = 0 ,
+           const size_t n1 = 0 ,
+           const size_t n2 = 0 ,
+           const size_t n3 = 0 ,
+           const size_t n4 = 0 ,
+           const size_t n5 = 0 ,
+           const size_t n6 = 0 ,
+           const size_t n7 = 0 ) {
+   if(modified_device() >= modified_host()) {
+     /* Resize on Device */
+     ::Kokkos::resize(d_view,n0,n1,n2,n3,n4,n5,n6,n7);
+     h_view = create_mirror_view( d_view );
+
+     /* Mark Device copy as modified */
+     modified_device() = modified_device()+1;
+
+   } else {
+     /* Realloc on Device */
+
+     ::Kokkos::realloc(d_view,n0,n1,n2,n3,n4,n5,n6,n7);
+
+     const bool sizeMismatch = ( h_view.extent(0) != n0 ) ||
+         ( h_view.extent(1) != n1 ) ||
+         ( h_view.extent(2) != n2 ) ||
+         ( h_view.extent(3) != n3 ) ||
+         ( h_view.extent(4) != n4 ) ||
+         ( h_view.extent(5) != n5 ) ||
+         ( h_view.extent(6) != n6 ) ||
+         ( h_view.extent(7) != n7 );
+     if ( sizeMismatch )
+       ::Kokkos::resize(h_view,n0,n1,n2,n3,n4,n5,n6,n7);
+
+     t_host temp_view = create_mirror_view( d_view );
+
+     /* Remap on Host */
+     Kokkos::deep_copy( temp_view , h_view );
+
+     h_view = temp_view;
+
+     d_view = create_mirror_view( typename t_dev::execution_space(), h_view );
+
+     /* Mark Host copy as modified */
+     modified_host() = modified_host()+1;
+   }
+  }
+
+  //@}
+  //! \name Methods for getting capacity, stride, or dimension(s).
+  //@{
+
+  //! The allocation size (same as Kokkos::View::capacity).
+  size_t capacity() const {
+    return d_view.span();
+  }
+
+  //! Get stride(s) for each dimension.
+  template< typename iType>
+  void stride(iType* stride_) const {
+    d_view.stride(stride_);
+  }
+
+  template< typename iType >
+   KOKKOS_INLINE_FUNCTION constexpr
+   typename std::enable_if< std::is_integral<iType>::value , size_t >::type
+   extent( const iType & r ) const
+     { return d_view.extent(r); }
+
+   template< typename iType >
+   KOKKOS_INLINE_FUNCTION constexpr
+   typename std::enable_if< std::is_integral<iType>::value , int >::type
+   extent_int( const iType & r ) const
+     { return static_cast<int>(d_view.extent(r)); }
+
+  /* \brief return size of dimension 0 */
+  size_t dimension_0() const {return d_view.extent(0);}
+  /* \brief return size of dimension 1 */
+  size_t dimension_1() const {return d_view.extent(1);}
+  /* \brief return size of dimension 2 */
+  size_t dimension_2() const {return d_view.extent(2);}
+  /* \brief return size of dimension 3 */
+  size_t dimension_3() const {return d_view.extent(3);}
+  /* \brief return size of dimension 4 */
+  size_t dimension_4() const {return d_view.extent(4);}
+  /* \brief return size of dimension 5 */
+  size_t dimension_5() const {return d_view.extent(5);}
+  /* \brief return size of dimension 6 */
+  size_t dimension_6() const {return d_view.extent(6);}
+  /* \brief return size of dimension 7 */
+  size_t dimension_7() const {return d_view.extent(7);}
+
+  //@}
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+//
+// Partial specializations of Kokkos::subview() for DualView objects.
+//
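+// Illustrative sketch (hypothetical names, not part of the original file): a
+// subview of a DualView is itself a DualView whose d_view / h_view alias the
+// corresponding slices of the source, e.g.
+//
+//   Kokkos::DualView<double**, Kokkos::LayoutLeft, Kokkos::Cuda> a ("a", 10, 10);
+//   auto row0 = Kokkos::subview (a, 0, Kokkos::ALL ());
+//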
+
+namespace Kokkos {
+namespace Impl {
+
+template< class D, class A1, class A2, class A3, class ... Args >
+struct DualViewSubview {
+
+  typedef typename Kokkos::Impl::ViewMapping
+    < void
+    , Kokkos::ViewTraits< D, A1, A2, A3 >
+    , Args ...
+    >::traits_type dst_traits ;
+
+  typedef Kokkos::DualView
+    < typename dst_traits::data_type
+    , typename dst_traits::array_layout
+    , typename dst_traits::device_type
+    , typename dst_traits::memory_traits
+    > type ;
+};
+
+} /* namespace Impl */
+
+
+template< class D , class A1 , class A2 , class A3 , class ... Args >
+typename Impl::DualViewSubview<D,A1,A2,A3,Args...>::type
+subview( const DualView<D,A1,A2,A3> & src , Args ... args )
+{
+  return typename
+    Impl::DualViewSubview<D,A1,A2,A3,Args...>::type( src , args ... );
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//
+// Partial specialization of Kokkos::deep_copy() for DualView objects.
+//
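+// The most recently modified side (device or host) of src is copied, and the
+// matching side of dst is then marked as modified.
+//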
+
+template< class DT , class DL , class DD , class DM ,
+          class ST , class SL , class SD , class SM >
+void
+deep_copy (DualView<DT,DL,DD,DM> dst, // dst is deliberately taken by value, not by reference
+           const DualView<ST,SL,SD,SM>& src )
+{
+  if (src.modified_device () >= src.modified_host ()) {
+    deep_copy (dst.d_view, src.d_view);
+    dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> ();
+  } else {
+    deep_copy (dst.h_view, src.h_view);
+    dst.template modify<typename DualView<DT,DL,DD,DM>::host_mirror_space> ();
+  }
+}
+
+template< class ExecutionSpace ,
+          class DT , class DL , class DD , class DM ,
+          class ST , class SL , class SD , class SM >
+void
+deep_copy (const ExecutionSpace& exec ,
+           DualView<DT,DL,DD,DM> dst, // dst is deliberately taken by value, not by reference
+           const DualView<ST,SL,SD,SM>& src )
+{
+  if (src.modified_device () >= src.modified_host ()) {
+    deep_copy (exec, dst.d_view, src.d_view);
+    dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> ();
+  } else {
+    deep_copy (exec, dst.h_view, src.h_view);
+    dst.template modify<typename DualView<DT,DL,DD,DM>::host_mirror_space> ();
+  }
+}
+
+} // namespace Kokkos
+
+#endif
+
diff --git a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ccf53b3d5096574f4446d4cf9d7eee71b6e45c08
--- /dev/null
+++ b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
@@ -0,0 +1,2034 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_DynRankView.hpp
+/// \brief Declaration and definition of Kokkos::DynRankView.
+///
+/// This header file declares and defines Kokkos::DynRankView and its
+/// related nonmember functions.
+
+#ifndef KOKKOS_DYNRANKVIEW_HPP
+#define KOKKOS_DYNRANKVIEW_HPP
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <type_traits>
+
+namespace Kokkos {
+
+template< typename DataType , class ... Properties >
+class DynRankView;  //forward declare
+
+namespace Impl {
+
+template <typename Specialize>
+struct DynRankDimTraits {
+
+  enum : size_t{unspecified = ~size_t(0)};
+
+  // Compute the rank of the view from the nonzero dimension arguments.
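+  // (e.g. if only N0 and N1 are given and N2..N7 are "unspecified", the rank is 2)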
+  KOKKOS_INLINE_FUNCTION
+  static size_t computeRank( const size_t N0
+                           , const size_t N1
+                           , const size_t N2
+                           , const size_t N3
+                           , const size_t N4
+                           , const size_t N5
+                           , const size_t N6
+                           , const size_t N7 )
+  {
+    return
+      (   (N6 == unspecified && N5 == unspecified && N4 == unspecified && N3 == unspecified && N2 == unspecified && N1 == unspecified && N0 == unspecified) ? 0
+      : ( (N6 == unspecified && N5 == unspecified && N4 == unspecified && N3 == unspecified && N2 == unspecified && N1 == unspecified) ? 1
+      : ( (N6 == unspecified && N5 == unspecified && N4 == unspecified && N3 == unspecified && N2 == unspecified) ? 2
+      : ( (N6 == unspecified && N5 == unspecified && N4 == unspecified && N3 == unspecified) ? 3
+      : ( (N6 == unspecified && N5 == unspecified && N4 == unspecified) ? 4
+      : ( (N6 == unspecified && N5 == unspecified) ? 5
+      : ( (N6 == unspecified) ? 6
+      : 7 ) ) ) ) ) ) );
+  }
+
+  // Compute the rank of the view from the nonzero layout arguments.
+  template <typename Layout>
+  KOKKOS_INLINE_FUNCTION
+  static size_t computeRank( const Layout& layout )
+  {
+    return computeRank( layout.dimension[0]
+                      , layout.dimension[1]
+                      , layout.dimension[2]
+                      , layout.dimension[3]
+                      , layout.dimension[4]
+                      , layout.dimension[5]
+                      , layout.dimension[6]
+                      , layout.dimension[7] );
+  }
+
+  // Extra overload to match that for specialize types v2
+  template <typename Layout, typename ... P>
+  KOKKOS_INLINE_FUNCTION
+  static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& prop, const Layout& layout )
+  {
+    return computeRank(layout);
+  }
+
+  // Create the layout for the rank-7 view.
+  // Non-strided Layout
+  template <typename Layout>
+  KOKKOS_INLINE_FUNCTION
+  static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) , Layout >::type createLayout( const Layout& layout )
+  {
+    return Layout( layout.dimension[0] != unspecified ? layout.dimension[0] : 1
+                 , layout.dimension[1] != unspecified ? layout.dimension[1] : 1
+                 , layout.dimension[2] != unspecified ? layout.dimension[2] : 1
+                 , layout.dimension[3] != unspecified ? layout.dimension[3] : 1
+                 , layout.dimension[4] != unspecified ? layout.dimension[4] : 1
+                 , layout.dimension[5] != unspecified ? layout.dimension[5] : 1
+                 , layout.dimension[6] != unspecified ? layout.dimension[6] : 1
+                 , layout.dimension[7] != unspecified ? layout.dimension[7] : 1
+                 );
+  }
+
+  // LayoutStride
+  template <typename Layout>
+  KOKKOS_INLINE_FUNCTION
+  static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) , Layout>::type createLayout( const Layout& layout )
+  {
+    return Layout( layout.dimension[0] != unspecified ? layout.dimension[0] : 1
+                 , layout.stride[0]
+                 , layout.dimension[1] != unspecified ? layout.dimension[1] : 1
+                 , layout.stride[1]
+                 , layout.dimension[2] != unspecified ? layout.dimension[2] : 1
+                 , layout.stride[2]
+                 , layout.dimension[3] != unspecified ? layout.dimension[3] : 1
+                 , layout.stride[3]
+                 , layout.dimension[4] != unspecified ? layout.dimension[4] : 1
+                 , layout.stride[4]
+                 , layout.dimension[5] != unspecified ? layout.dimension[5] : 1
+                 , layout.stride[5]
+                 , layout.dimension[6] != unspecified ? layout.dimension[6] : 1
+                 , layout.stride[6]
+                 , layout.dimension[7] != unspecified ? layout.dimension[7] : 1
+                 , layout.stride[7]
+                 );
+  }
+
+  // Extra overload to match that for specialize types
+  template <typename Traits, typename ... P>
+  KOKKOS_INLINE_FUNCTION
+  static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const Kokkos::Impl::ViewCtorProp<P...>& prop, const typename Traits::array_layout& layout )
+  {
+    return createLayout( layout );
+  }
+
+  // Create a view from the given dimension arguments.
+  // This is only necessary because the shmem constructor doesn't take a layout.
+  //   NDE: shmem Views are not compatible with the added view_alloc value_type / fad_dim deduction functionality
+  template <typename ViewType, typename ViewArg>
+  static ViewType createView( const ViewArg& arg
+                            , const size_t N0
+                            , const size_t N1
+                            , const size_t N2
+                            , const size_t N3
+                            , const size_t N4
+                            , const size_t N5
+                            , const size_t N6
+                            , const size_t N7 )
+  {
+    return ViewType( arg
+                   , N0 != unspecified ? N0 : 1
+                   , N1 != unspecified ? N1 : 1
+                   , N2 != unspecified ? N2 : 1
+                   , N3 != unspecified ? N3 : 1
+                   , N4 != unspecified ? N4 : 1
+                   , N5 != unspecified ? N5 : 1
+                   , N6 != unspecified ? N6 : 1
+                   , N7 != unspecified ? N7 : 1 );
+  }
+};
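+
+// Example (illustrative sketch): the DynRankView constructors below pass ~size_t(0)
+// ("unspecified") for every extent the caller does not supply, so for
+//
+//   Kokkos::LayoutRight l( 4 , 5 , ~size_t(0) , ~size_t(0)
+//                        , ~size_t(0) , ~size_t(0) , ~size_t(0) , ~size_t(0) );
+//
+// computeRank(l) deduces 2, and createLayout(l) yields
+// LayoutRight( 4 , 5 , 1 , 1 , 1 , 1 , 1 , 1 ), which is what the underlying
+// rank-7 View is constructed with.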
+
+  // Non-strided Layout
+  template <typename Layout , typename iType>
+  KOKKOS_INLINE_FUNCTION
+  static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type
+  reconstructLayout( const Layout& layout , iType dynrank )
+  {
+    return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
+                 , dynrank > 1 ? layout.dimension[1] : ~size_t(0)
+                 , dynrank > 2 ? layout.dimension[2] : ~size_t(0)
+                 , dynrank > 3 ? layout.dimension[3] : ~size_t(0)
+                 , dynrank > 4 ? layout.dimension[4] : ~size_t(0)
+                 , dynrank > 5 ? layout.dimension[5] : ~size_t(0)
+                 , dynrank > 6 ? layout.dimension[6] : ~size_t(0)
+                 , dynrank > 7 ? layout.dimension[7] : ~size_t(0)
+                 );
+  }
+
+  // LayoutStride
+  template <typename Layout , typename iType>
+  KOKKOS_INLINE_FUNCTION
+  static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type
+  reconstructLayout( const Layout& layout , iType dynrank )
+  {
+    return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
+                 , dynrank > 0 ? layout.stride[0] : (0)
+                 , dynrank > 1 ? layout.dimension[1] : ~size_t(0)
+                 , dynrank > 1 ? layout.stride[1] : (0)
+                 , dynrank > 2 ? layout.dimension[2] : ~size_t(0)
+                 , dynrank > 2 ? layout.stride[2] : (0)
+                 , dynrank > 3 ? layout.dimension[3] : ~size_t(0)
+                 , dynrank > 3 ? layout.stride[3] : (0)
+                 , dynrank > 4 ? layout.dimension[4] : ~size_t(0)
+                 , dynrank > 4 ? layout.stride[4] : (0)
+                 , dynrank > 5 ? layout.dimension[5] : ~size_t(0)
+                 , dynrank > 5 ? layout.stride[5] : (0)
+                 , dynrank > 6 ? layout.dimension[6] : ~size_t(0)
+                 , dynrank > 6 ? layout.stride[6] : (0)
+                 , dynrank > 7 ? layout.dimension[7] : ~size_t(0)
+                 , dynrank > 7 ? layout.stride[7] : (0)
+                 );
+  }
+
+
+/** \brief  Debug bounds-checking routines */
+// Enhanced debug checking - most infrastructure matches that of the functions in
+// Kokkos_ViewMapping; additionally checks that extra arguments beyond the rank are 0
+template< unsigned ,  typename iType0 , class MapType >
+KOKKOS_INLINE_FUNCTION
+bool dyn_rank_view_verify_operator_bounds( const iType0 & , const MapType & )
+{ return true ; }
+
+template< unsigned R , typename iType0 ,  class MapType , typename iType1 , class ... Args >
+KOKKOS_INLINE_FUNCTION
+bool dyn_rank_view_verify_operator_bounds
+  ( const iType0  & rank
+  , const MapType & map
+  , const iType1  & i
+  , Args ... args
+  )
+{
+  if ( static_cast<iType0>(R) < rank ) {
+    return ( size_t(i) < map.extent(R) )
+       && dyn_rank_view_verify_operator_bounds<R+1>( rank , map , args ... );
+  }
+  else if ( i != 0 ) {
+    printf("DynRankView Debug Bounds Checking Error: at rank %u\n  Extra arguments beyond the rank must be zero \n",R);
+    return ( false )
+       && dyn_rank_view_verify_operator_bounds<R+1>( rank , map , args ... );
+  }
+  else {
+    return ( true )
+       && dyn_rank_view_verify_operator_bounds<R+1>( rank , map , args ... );
+  }
+}
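+
+// Example (illustrative sketch): for a runtime rank of 2, a call such as
+//
+//   dyn_rank_view_verify_operator_bounds<0>( 2 , map , i0 , i1 )
+//
+// checks i0 < map.extent(0) and i1 < map.extent(1); any index supplied beyond the
+// runtime rank passes the check only if it is zero.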
+
+template< unsigned , class MapType >
+inline
+void dyn_rank_view_error_operator_bounds( char * , int , const MapType & )
+{}
+
+template< unsigned R , class MapType , class iType , class ... Args >
+inline
+void dyn_rank_view_error_operator_bounds
+  ( char * buf
+  , int len
+  , const MapType & map
+  , const iType   & i
+  , Args ... args
+  )
+{
+  const int n =
+    snprintf(buf,len," %lu < %lu %c"
+            , static_cast<unsigned long>(i)
+            , static_cast<unsigned long>( map.extent(R) )
+            , ( sizeof...(Args) ? ',' : ')' )
+            );
+  dyn_rank_view_error_operator_bounds<R+1>(buf+n,len-n,map,args...);
+}
+
+// op_rank = rank of the operator version that was called
+template< typename MemorySpace
+        , typename iType0 , typename iType1 ,  class MapType , class ... Args >
+KOKKOS_INLINE_FUNCTION
+void dyn_rank_view_verify_operator_bounds
+  ( const iType0 & op_rank , const iType1 & rank
+  , const Kokkos::Impl::SharedAllocationTracker & tracker
+  , const MapType & map , Args ... args )
+{
+  if ( static_cast<iType0>(rank) > op_rank ) {
+    Kokkos::abort( "DynRankView Bounds Checking Error: Need at least rank arguments to the operator()" );
+  }
+
+  if ( ! dyn_rank_view_verify_operator_bounds<0>( rank , map , args ... ) ) {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    enum { LEN = 1024 };
+    char buffer[ LEN ];
+    const std::string label = tracker.template get_label<MemorySpace>();
+    int n = snprintf(buffer,LEN,"DynRankView bounds error of view %s (", label.c_str());
+    dyn_rank_view_error_operator_bounds<0>( buffer + n , LEN - n , map , args ... );
+    Kokkos::Impl::throw_runtime_exception(std::string(buffer));
+#else
+    Kokkos::abort("DynRankView bounds error");
+#endif
+  }
+}
+
+
+/** \brief  Assign compatible default mappings */
+struct ViewToDynRankViewTag {};
+
+} // namespace Impl
+
+namespace Impl {
+
+template< class DstTraits , class SrcTraits >
+class ViewMapping< DstTraits , SrcTraits ,
+  typename std::enable_if<(
+    std::is_same< typename DstTraits::memory_space , typename SrcTraits::memory_space >::value
+    &&
+    std::is_same< typename DstTraits::specialize , void >::value
+    &&
+    std::is_same< typename SrcTraits::specialize , void >::value
+    &&
+    (
+      std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value
+      ||
+      (
+        (
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
+        )
+        &&
+        (
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
+        )
+      )
+    )
+  ) , Kokkos::Impl::ViewToDynRankViewTag >::type >
+{
+private:
+
+  enum { is_assignable_value_type =
+    std::is_same< typename DstTraits::value_type
+                , typename SrcTraits::value_type >::value ||
+    std::is_same< typename DstTraits::value_type
+                , typename SrcTraits::const_value_type >::value };
+
+  enum { is_assignable_layout =
+    std::is_same< typename DstTraits::array_layout
+                , typename SrcTraits::array_layout >::value ||
+    std::is_same< typename DstTraits::array_layout
+                , Kokkos::LayoutStride >::value
+    };
+
+public:
+
+  enum { is_assignable = is_assignable_value_type &&
+                         is_assignable_layout };
+
+  typedef ViewMapping< DstTraits , void >  DstType ;
+  typedef ViewMapping< SrcTraits , void >  SrcType ;
+
+  template < typename DT , typename ... DP , typename ST , typename ... SP >
+  KOKKOS_INLINE_FUNCTION
+  static void assign( Kokkos::DynRankView< DT , DP...> & dst ,  const Kokkos::View< ST , SP... > & src )
+    {
+      static_assert( is_assignable_value_type
+                   , "View assignment must have same value type or const = non-const" );
+
+      static_assert( is_assignable_layout
+                   , "View assignment must have compatible layout or have rank <= 1" );
+
+    // Removed dimension checks...
+
+      typedef typename DstType::offset_type  dst_offset_type ;
+      dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
+      dst.m_map.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
+      dst.m_track.assign( src.m_track , DstTraits::is_managed );
+      dst.m_rank = src.Rank ;
+    }
+};
+
+} //end Impl
+
+/** \class DynRankView
+ * \brief Container that creates a Kokkos view with rank determined at runtime.
+ *   Essentially this is a rank 7 view
+ *
+ *   Changes from View
+ *   1. The rank of the DynRankView is returned by the method rank()
+ *   2. Max rank of a DynRankView is 7
+ *   3. subview called with 'subview(...)' or 'subdynrankview(...)' (backward compatibility) 
+ *   4. Every subview is returned with LayoutStride
+ *   5. Copy and Copy-Assign View to DynRankView
+ *   6. deep_copy between Views and DynRankViews
+ *   7. rank( view ); returns the rank of View or DynRankView
+ *
+ */
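+
+/* Example (illustrative sketch; assumes Kokkos::initialize has been called and the
+ * default execution space is host accessible):
+ *
+ *   Kokkos::DynRankView<double> a( "A" , 10 , 3 );   // runtime rank 2
+ *   a( 4 , 2 ) = 1.0;                                // rank-2 operator()
+ *   const unsigned r = a.rank();                     // r == 2
+ *   Kokkos::DynRankView<double> b( "B" , 10 , 3 );
+ *   Kokkos::deep_copy( b , a );                      // deep_copy between DynRankViews
+ */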
+
+template< class > struct is_dyn_rank_view : public std::false_type {};
+
+template< class D, class ... P >
+struct is_dyn_rank_view< Kokkos::DynRankView<D,P...> > : public std::true_type {};
+
+
+template< typename DataType , class ... Properties >
+class DynRankView : public ViewTraits< DataType , Properties ... >
+{
+  static_assert( !std::is_array<DataType>::value && !std::is_pointer<DataType>::value , "Cannot template DynRankView with array or pointer datatype - must be pod" );
+
+private:
+  template < class , class ... > friend class DynRankView ;
+  template < class , class ... > friend class Kokkos::Impl::ViewMapping ;
+
+public:
+  typedef ViewTraits< DataType , Properties ... > drvtraits ;
+
+  typedef View< DataType******* , Properties...> view_type ;
+
+  typedef ViewTraits< DataType******* , Properties ... > traits ;
+
+
+private:
+  typedef Kokkos::Impl::ViewMapping< traits , void > map_type ;
+  typedef Kokkos::Impl::SharedAllocationTracker      track_type ;
+
+  track_type  m_track ;
+  map_type    m_map ;
+  unsigned m_rank;
+
+public:
+  KOKKOS_INLINE_FUNCTION
+  view_type & DownCast() const { return ( view_type & ) (*this); }
+  KOKKOS_INLINE_FUNCTION
+  const view_type & ConstDownCast() const { return (const view_type & ) (*this); }
+
+  //Types below - at least the HostMirror requires the value_type, NOT the rank 7 data_type of the traits
+
+  /** \brief  Compatible view of array of scalar types */
+  typedef DynRankView< typename drvtraits::scalar_array_type ,
+                typename drvtraits::array_layout ,
+                typename drvtraits::device_type ,
+                typename drvtraits::memory_traits >
+    array_type ;
+
+  /** \brief  Compatible view of const data type */
+  typedef DynRankView< typename drvtraits::const_data_type ,
+                typename drvtraits::array_layout ,
+                typename drvtraits::device_type ,
+                typename drvtraits::memory_traits >
+    const_type ;
+
+  /** \brief  Compatible view of non-const data type */
+  typedef DynRankView< typename drvtraits::non_const_data_type ,
+                typename drvtraits::array_layout ,
+                typename drvtraits::device_type ,
+                typename drvtraits::memory_traits >
+    non_const_type ;
+
+  /** \brief  Compatible HostMirror view */
+  typedef DynRankView< typename drvtraits::non_const_data_type ,
+                typename drvtraits::array_layout ,
+                typename drvtraits::host_mirror_space >
+    HostMirror ;
+
+
+  //----------------------------------------
+  // Domain rank and extents
+
+//  enum { Rank = map_type::Rank }; //Will be dyn rank of 7 always, keep the enum?
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , size_t >::type
+  extent( const iType & r ) const
+    { return m_map.extent(r); }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , int >::type
+  extent_int( const iType & r ) const
+    { return static_cast<int>(m_map.extent(r)); }
+
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename traits::array_layout layout() const
+    { return m_map.layout(); }
+
+  //----------------------------------------
+  /*  Deprecate all 'dimension' functions in favor of
+   *  ISO/C++ vocabulary 'extent'.
+   */
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , size_t >::type
+  dimension( const iType & r ) const { return extent( r ); }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_map.dimension_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_map.dimension_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_map.dimension_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_map.dimension_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_map.dimension_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_map.dimension_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_map.dimension_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_map.dimension_7(); }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t size() const { return m_map.dimension_0() *
+                                                                m_map.dimension_1() *
+                                                                m_map.dimension_2() *
+                                                                m_map.dimension_3() *
+                                                                m_map.dimension_4() *
+                                                                m_map.dimension_5() *
+                                                                m_map.dimension_6() *
+                                                                m_map.dimension_7(); }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION void stride( iType * const s ) const { m_map.stride(s); }
+
+  //----------------------------------------
+  // Range span is the span which contains all members.
+
+  typedef typename map_type::reference_type  reference_type ;
+  typedef typename map_type::pointer_type    pointer_type ;
+
+  enum { reference_type_is_lvalue_reference = std::is_lvalue_reference< reference_type >::value };
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); }
+  // Deprecated, use 'span()' instead
+  KOKKOS_INLINE_FUNCTION constexpr size_t capacity() const { return m_map.span(); }
+  KOKKOS_INLINE_FUNCTION constexpr bool   span_is_contiguous() const { return m_map.span_is_contiguous(); }
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { return m_map.data(); }
+
+  // Deprecated, use 'span_is_contiguous()' instead
+  KOKKOS_INLINE_FUNCTION constexpr bool   is_contiguous() const { return m_map.span_is_contiguous(); }
+  // Deprecated, use 'data()' instead
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type ptr_on_device() const { return m_map.data(); }
+
+  //----------------------------------------
+  // Allow specializations to query their specialized map
+
+  KOKKOS_INLINE_FUNCTION
+  const Kokkos::Impl::ViewMapping< traits , void > &
+  implementation_map() const { return m_map ; }
+
+  //----------------------------------------
+
+private:
+
+  enum {
+    is_layout_left = std::is_same< typename traits::array_layout
+                                  , Kokkos::LayoutLeft >::value ,
+
+    is_layout_right = std::is_same< typename traits::array_layout
+                                  , Kokkos::LayoutRight >::value ,
+
+    is_layout_stride = std::is_same< typename traits::array_layout
+                                   , Kokkos::LayoutStride >::value ,
+
+    is_default_map =
+      std::is_same< typename traits::specialize , void >::value &&
+      ( is_layout_left || is_layout_right || is_layout_stride )
+  };
+
+  template< class Space , bool = Kokkos::Impl::MemorySpaceAccess< Space , typename traits::memory_space >::accessible > struct verify_space
+    { KOKKOS_FORCEINLINE_FUNCTION static void check() {} };
+
+  template< class Space > struct verify_space<Space,false>
+    { KOKKOS_FORCEINLINE_FUNCTION static void check()
+        { Kokkos::abort("Kokkos::DynRankView ERROR: attempt to access inaccessible memory space"); };
+    };
+
+// Bounds checking macros
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+
+// rank of the calling operator - included as first argument in ARG
+#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( ARG ) \
+  DynRankView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); \
+  Kokkos::Impl::dyn_rank_view_verify_operator_bounds< typename traits::memory_space > ARG ;
+
+#else
+
+#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( ARG ) \
+  DynRankView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
+
+#endif
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr unsigned rank() const { return m_rank; }
+
+
+  //operators ()
+  // Rank 0
+  KOKKOS_INLINE_FUNCTION
+  reference_type operator()() const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (0 , this->rank(), m_track, m_map) )
+      return implementation_map().reference();
+      //return m_map.reference(0,0,0,0,0,0,0);
+    }
+
+  // Rank 1
+  // This assumes a contiguous underlying memory (i.e. no padding, no striding...)
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< std::is_same<typename drvtraits::value_type, typename drvtraits::scalar_array_type>::value && std::is_integral<iType>::value, reference_type>::type
+  operator[](const iType & i0) const
+    {
+      //Phalanx is violating this, since they use the operator to access ALL elements in the allocation
+      //KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (1 , this->rank(), m_track, m_map) )
+      return data()[i0];
+    }
+
+  // This assumes a contiguous underlying memory (i.e. no padding, no striding...
+  // AND a Trilinos/Sacado scalar type )
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !std::is_same<typename drvtraits::value_type, typename drvtraits::scalar_array_type>::value && std::is_integral<iType>::value, reference_type>::type
+  operator[](const iType & i0) const
+    {
+//      auto map = implementation_map();
+      const size_t dim_scalar = m_map.dimension_scalar();
+      const size_t bytes = this->span() / dim_scalar;
+
+      typedef Kokkos::View<DataType*, typename traits::array_layout, typename traits::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged | traits::memory_traits::RandomAccess | traits::memory_traits::Atomic> > tmp_view_type;
+      tmp_view_type rankone_view(this->data(), bytes, dim_scalar);
+      return rankone_view(i0);
+    }
+
+  // Rank 1 parenthesis
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType>::value), reference_type>::type
+  operator()(const iType & i0 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (1 , this->rank(), m_track, m_map, i0) )
+      return m_map.reference(i0);
+    }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename traits::specialize , void>::value && std::is_integral<iType>::value), reference_type>::type
+  operator()(const iType & i0 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (1 , this->rank(), m_track, m_map, i0) )
+      return m_map.reference(i0,0,0,0,0,0,0);
+    }
+
+  // Rank 2
+  template< typename iType0 , typename iType1 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value  && std::is_integral<iType1>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (2 , this->rank(), m_track, m_map, i0, i1) )
+      return m_map.reference(i0,i1);
+    }
+
+  template< typename iType0 , typename iType1 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (2 , this->rank(), m_track, m_map, i0, i1) )
+      return m_map.reference(i0,i1,0,0,0,0,0);
+    }
+
+  // Rank 3
+  template< typename iType0 , typename iType1 , typename iType2 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value  && std::is_integral<iType1>::value && std::is_integral<iType2>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (3 , this->rank(), m_track, m_map, i0, i1, i2) )
+      return m_map.reference(i0,i1,i2);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (3 , this->rank(), m_track, m_map, i0, i1, i2) )
+      return m_map.reference(i0,i1,i2,0,0,0,0);
+    }
+
+  // Rank 4
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value  && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (4 , this->rank(), m_track, m_map, i0, i1, i2, i3) )
+      return m_map.reference(i0,i1,i2,i3);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (4 , this->rank(), m_track, m_map, i0, i1, i2, i3) )
+      return m_map.reference(i0,i1,i2,i3,0,0,0);
+    }
+
+  // Rank 5
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value  && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (5 , this->rank(), m_track, m_map, i0, i1, i2, i3, i4) )
+      return m_map.reference(i0,i1,i2,i3,i4);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (5 , this->rank(), m_track, m_map, i0, i1, i2, i3, i4) )
+      return m_map.reference(i0,i1,i2,i3,i4,0,0);
+    }
+
+  // Rank 6
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 , typename iType5 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value  && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value && std::is_integral<iType5>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (6 , this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5) )
+      return m_map.reference(i0,i1,i2,i3,i4,i5);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 , typename iType5 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (6 , this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5) )
+      return m_map.reference(i0,i1,i2,i3,i4,i5,0);
+    }
+
+  // Rank 7
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 , typename iType5 , typename iType6 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<iType0>::value  && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value && std::is_integral<iType5>::value && std::is_integral<iType6>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (7 , this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6) )
+      return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
+    }
+
+  // Rank 0
+  KOKKOS_INLINE_FUNCTION
+  reference_type access() const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (0 , this->rank(), m_track, m_map) )
+      return implementation_map().reference();
+      //return m_map.reference(0,0,0,0,0,0,0);
+    }
+
+  // Rank 1
+   // Rank 1 parenthesis
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType>::value), reference_type>::type
+  access(const iType & i0 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (1 , this->rank(), m_track, m_map, i0) )
+      return m_map.reference(i0);
+    }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename traits::specialize , void>::value && std::is_integral<iType>::value), reference_type>::type
+  access(const iType & i0 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (1 , this->rank(), m_track, m_map, i0) )
+      return m_map.reference(i0,0,0,0,0,0,0);
+    }
+
+  // Rank 2
+  template< typename iType0 , typename iType1 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value  && std::is_integral<iType1>::value), reference_type>::type
+ access(const iType0 & i0 , const iType1 & i1 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (2 , this->rank(), m_track, m_map, i0, i1) )
+      return m_map.reference(i0,i1);
+    }
+
+  template< typename iType0 , typename iType1 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
+  access(const iType0 & i0 , const iType1 & i1 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (2 , this->rank(), m_track, m_map, i0, i1) )
+      return m_map.reference(i0,i1,0,0,0,0,0);
+    }
+
+  // Rank 3
+  template< typename iType0 , typename iType1 , typename iType2 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value  && std::is_integral<iType1>::value && std::is_integral<iType2>::value), reference_type>::type
+  access(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (3 , this->rank(), m_track, m_map, i0, i1, i2) )
+      return m_map.reference(i0,i1,i2);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
+  access(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (3 , this->rank(), m_track, m_map, i0, i1, i2) )
+      return m_map.reference(i0,i1,i2,0,0,0,0);
+    }
+
+  // Rank 4
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value  && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value), reference_type>::type
+  access(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (4 , this->rank(), m_track, m_map, i0, i1, i2, i3) )
+      return m_map.reference(i0,i1,i2,i3);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
+  access(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (4 , this->rank(), m_track, m_map, i0, i1, i2, i3) )
+      return m_map.reference(i0,i1,i2,i3,0,0,0);
+    }
+
+  // Rank 5
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value  && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value), reference_type>::type
+  access(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (5 , this->rank(), m_track, m_map, i0, i1, i2, i3, i4) )
+      return m_map.reference(i0,i1,i2,i3,i4);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
+  access(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (5 , this->rank(), m_track, m_map, i0, i1, i2, i3, i4) )
+      return m_map.reference(i0,i1,i2,i3,i4,0,0);
+    }
+
+  // Rank 6
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 , typename iType5 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value  && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value && std::is_integral<iType5>::value), reference_type>::type
+  access(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (6 , this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5) )
+      return m_map.reference(i0,i1,i2,i3,i4,i5);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 , typename iType5 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
+  access(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (6 , this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5) )
+      return m_map.reference(i0,i1,i2,i3,i4,i5,0);
+    }
+
+  // Rank 7
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 , typename iType5 , typename iType6 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<iType0>::value  && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value && std::is_integral<iType5>::value && std::is_integral<iType6>::value), reference_type>::type
+  access(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (7 , this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6) )
+      return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
+    }
+
+#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY
+
+  //----------------------------------------
+  // Standard constructor, destructor, and assignment operators...
+
+  KOKKOS_INLINE_FUNCTION
+  ~DynRankView() {}
+
+  KOKKOS_INLINE_FUNCTION
+  DynRankView() : m_track(), m_map(), m_rank() {} //Default ctor
+
+  KOKKOS_INLINE_FUNCTION
+  DynRankView( const DynRankView & rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ), m_rank(rhs.m_rank) {}
+
+  KOKKOS_INLINE_FUNCTION
+  DynRankView( DynRankView && rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ), m_rank(rhs.m_rank) {}
+
+  KOKKOS_INLINE_FUNCTION
+  DynRankView & operator = ( const DynRankView & rhs ) { m_track = rhs.m_track; m_map = rhs.m_map; m_rank = rhs.m_rank; return *this; }
+
+  KOKKOS_INLINE_FUNCTION
+  DynRankView & operator = ( DynRankView && rhs ) { m_track = rhs.m_track; m_map = rhs.m_map; m_rank = rhs.m_rank; return *this; }
+
+  //----------------------------------------
+  // Compatible view copy constructor and assignment
+  // may assign unmanaged from managed.
+  template< class RT , class ... RP >
+  KOKKOS_INLINE_FUNCTION
+  DynRankView( const DynRankView<RT,RP...> & rhs )
+    : m_track( rhs.m_track , traits::is_managed )
+    , m_map()
+    , m_rank(rhs.m_rank)
+    {
+      typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
+      Mapping::assign( m_map , rhs.m_map , rhs.m_track );
+    }
+
+  template< class RT , class ... RP >
+  KOKKOS_INLINE_FUNCTION
+  DynRankView & operator = (const DynRankView<RT,RP...> & rhs )
+    {
+      typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
+      Mapping::assign( m_map , rhs.m_map , rhs.m_track );
+      m_track.assign( rhs.m_track , traits::is_managed );
+      m_rank = rhs.rank();
+      return *this;
+    }
+
+// Copy/Assign View to DynRankView
+  template< class RT , class ... RP >
+  KOKKOS_INLINE_FUNCTION
+  DynRankView( const View<RT,RP...> & rhs )
+    : m_track()
+    , m_map()
+    , m_rank( rhs.Rank )
+    {
+      typedef typename View<RT,RP...>::traits  SrcTraits ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Impl::ViewToDynRankViewTag >  Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
+      Mapping::assign( *this , rhs );
+    }
+
+  template< class RT , class ... RP >
+  KOKKOS_INLINE_FUNCTION
+  DynRankView & operator = ( const View<RT,RP...> & rhs )
+    {
+      typedef typename View<RT,RP...>::traits  SrcTraits ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Impl::ViewToDynRankViewTag >  Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy assignment" );
+      Mapping::assign( *this , rhs );
+      return *this ;
+    }
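+
+  // Example (illustrative sketch): wrapping an existing View, per items 5-7 of the
+  // class documentation above:
+  //
+  //   Kokkos::View<double***> v( "V" , 2 , 3 , 4 );
+  //   Kokkos::DynRankView<double> d( v );   // copy-construct, d.rank() == 3
+  //   d = v;                                // copy-assign is also allowed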
+
+  //----------------------------------------
+  // Allocation tracking properties
+
+  KOKKOS_INLINE_FUNCTION
+  int use_count() const
+    { return m_track.use_count(); }
+
+  inline
+  const std::string label() const
+    { return m_track.template get_label< typename traits::memory_space >(); }
+
+  //----------------------------------------
+  // Allocation according to allocation properties and array layout
+  // unused arg_layout dimensions must be set to ~size_t(0) so that rank deduction can properly take place
+  template< class ... P >
+  explicit inline
+  DynRankView( const Kokkos::Impl::ViewCtorProp< P ... > & arg_prop
+             , typename std::enable_if< ! Kokkos::Impl::ViewCtorProp< P... >::has_pointer
+                               , typename traits::array_layout
+                               >::type const & arg_layout
+      )
+      : m_track()
+      , m_map()
+      , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
+    {
+      // Append layout and spaces if not input
+      typedef Kokkos::Impl::ViewCtorProp< P ... > alloc_prop_input ;
+
+      // use 'std::integral_constant<unsigned,I>' for non-types
+      // to avoid duplicate class error.
+      typedef Kokkos::Impl::ViewCtorProp
+        < P ...
+        , typename std::conditional
+            < alloc_prop_input::has_label
+            , std::integral_constant<unsigned,0>
+            , typename std::string
+            >::type
+        , typename std::conditional
+            < alloc_prop_input::has_memory_space
+            , std::integral_constant<unsigned,1>
+            , typename traits::device_type::memory_space
+            >::type
+        , typename std::conditional
+            < alloc_prop_input::has_execution_space
+            , std::integral_constant<unsigned,2>
+            , typename traits::device_type::execution_space
+            >::type
+        > alloc_prop ;
+
+      static_assert( traits::is_managed
+                   , "View allocation constructor requires managed memory" );
+
+      if ( alloc_prop::initialize &&
+           ! alloc_prop::execution_space::is_initialized() ) {
+        // If initializing view data then
+        // the execution space must be initialized.
+        Kokkos::Impl::throw_runtime_exception("Constructing DynRankView and initializing data with uninitialized execution space");
+      }
+
+      // Copy the input allocation properties with possibly defaulted properties
+      alloc_prop prop( arg_prop );
+
+//------------------------------------------------------------
+#if defined( KOKKOS_ENABLE_CUDA )
+      // If allocating in CudaUVMSpace must fence before and after
+      // the allocation to protect against possible concurrent access
+      // on the CPU and the GPU.
+      // Fence using the traits' execution space (which will be Kokkos::Cuda)
+      // to avoid incomplete type errors from using Kokkos::Cuda directly.
+      if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
+        traits::device_type::memory_space::execution_space::fence();
+      }
+#endif
+//------------------------------------------------------------
+
+      Kokkos::Impl::SharedAllocationRecord<> *
+        record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) );
+
+//------------------------------------------------------------
+#if defined( KOKKOS_ENABLE_CUDA )
+      if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
+        traits::device_type::memory_space::execution_space::fence();
+      }
+#endif
+//------------------------------------------------------------
+
+      // Setup and initialization complete, start tracking
+      m_track.assign_allocated_record_to_uninitialized( record );
+    }
+
+
+  // Wrappers
+  template< class ... P >
+  explicit KOKKOS_INLINE_FUNCTION
+  DynRankView( const Kokkos::Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< Kokkos::Impl::ViewCtorProp< P... >::has_pointer
+                               , typename traits::array_layout
+                               >::type const & arg_layout
+      )
+      : m_track() // No memory tracking
+      , m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) )
+      , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
+    {
+      static_assert(
+        std::is_same< pointer_type
+                    , typename Impl::ViewCtorProp< P... >::pointer_type
+                    >::value ,
+        "Constructing DynRankView to wrap user memory must supply matching pointer type" );
+    }
+
+  //----------------------------------------
+  //Constructor(s)
+
+  // Simple dimension-only layout
+  template< class ... P >
+  explicit inline
+  DynRankView( const Kokkos::Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< ! Kokkos::Impl::ViewCtorProp< P... >::has_pointer
+                               , size_t
+                               >::type const arg_N0 = ~size_t(0)
+      , const size_t arg_N1 = ~size_t(0)
+      , const size_t arg_N2 = ~size_t(0)
+      , const size_t arg_N3 = ~size_t(0)
+      , const size_t arg_N4 = ~size_t(0)
+      , const size_t arg_N5 = ~size_t(0)
+      , const size_t arg_N6 = ~size_t(0)
+      , const size_t arg_N7 = ~size_t(0)
+      )
+    : DynRankView( arg_prop
+    , typename traits::array_layout
+          ( arg_N0 , arg_N1 , arg_N2 , arg_N3 , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+      )
+    {}
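+
+  // Example (illustrative sketch): this constructor is typically reached through
+  // view_alloc(), e.g.
+  //
+  //   Kokkos::DynRankView<double> a( Kokkos::view_alloc( "A" , Kokkos::WithoutInitializing )
+  //                                , 10 , 3 );   // rank 2, data left uninitialized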
+
+  template< class ... P >
+  explicit KOKKOS_INLINE_FUNCTION
+  DynRankView( const Kokkos::Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< Kokkos::Impl::ViewCtorProp< P... >::has_pointer
+                               , size_t
+                               >::type const arg_N0 = ~size_t(0)
+      , const size_t arg_N1 = ~size_t(0)
+      , const size_t arg_N2 = ~size_t(0)
+      , const size_t arg_N3 = ~size_t(0)
+      , const size_t arg_N4 = ~size_t(0)
+      , const size_t arg_N5 = ~size_t(0)
+      , const size_t arg_N6 = ~size_t(0)
+      , const size_t arg_N7 = ~size_t(0)
+      )
+    : DynRankView( arg_prop
+    , typename traits::array_layout
+          ( arg_N0 , arg_N1 , arg_N2 , arg_N3 , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+      )
+    {}
+
+  // Allocate with label and layout
+  template< typename Label >
+  explicit inline
+  DynRankView( const Label & arg_label
+      , typename std::enable_if<
+          Kokkos::Impl::is_view_label<Label>::value ,
+          typename traits::array_layout >::type const & arg_layout
+      )
+    : DynRankView( Kokkos::Impl::ViewCtorProp< std::string >( arg_label ) , arg_layout )
+    {}
+
+  // Allocate label and layout, must disambiguate from subview constructor
+  template< typename Label >
+  explicit inline
+  DynRankView( const Label & arg_label
+      , typename std::enable_if<
+          Kokkos::Impl::is_view_label<Label>::value ,
+        const size_t >::type arg_N0 = ~size_t(0)
+      , const size_t arg_N1 = ~size_t(0)
+      , const size_t arg_N2 = ~size_t(0)
+      , const size_t arg_N3 = ~size_t(0)
+      , const size_t arg_N4 = ~size_t(0)
+      , const size_t arg_N5 = ~size_t(0)
+      , const size_t arg_N6 = ~size_t(0)
+      , const size_t arg_N7 = ~size_t(0)
+      )
+    : DynRankView( Kokkos::Impl::ViewCtorProp< std::string >( arg_label )
+    , typename traits::array_layout
+          ( arg_N0 , arg_N1 , arg_N2 , arg_N3 , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  // For backward compatibility
+  // NDE This ctor does not take ViewCtorProp argument - should not use alternative createLayout call
+  explicit inline
+  DynRankView( const ViewAllocateWithoutInitializing & arg_prop
+      , const typename traits::array_layout & arg_layout
+      )
+    : DynRankView( Kokkos::Impl::ViewCtorProp< std::string , Kokkos::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::WithoutInitializing )
+
+          , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout)
+      )
+    {}
+
+  explicit inline
+  DynRankView( const ViewAllocateWithoutInitializing & arg_prop
+      , const size_t arg_N0 = ~size_t(0)
+      , const size_t arg_N1 = ~size_t(0)
+      , const size_t arg_N2 = ~size_t(0)
+      , const size_t arg_N3 = ~size_t(0)
+      , const size_t arg_N4 = ~size_t(0)
+      , const size_t arg_N5 = ~size_t(0)
+      , const size_t arg_N6 = ~size_t(0)
+      , const size_t arg_N7 = ~size_t(0)
+      )
+    : DynRankView(Kokkos::Impl::ViewCtorProp< std::string , Kokkos::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::WithoutInitializing ), arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7 )
+    {}
+
+  //----------------------------------------
+  // Memory span required to wrap these dimensions.
+  static constexpr size_t required_allocation_size(
+                                       const size_t arg_N0 = 0
+                                     , const size_t arg_N1 = 0
+                                     , const size_t arg_N2 = 0
+                                     , const size_t arg_N3 = 0
+                                     , const size_t arg_N4 = 0
+                                     , const size_t arg_N5 = 0
+                                     , const size_t arg_N6 = 0
+                                     , const size_t arg_N7 = 0
+                                     )
+    {
+      return map_type::memory_span(
+        typename traits::array_layout
+          ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+          , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) );
+    }
+
+  explicit KOKKOS_INLINE_FUNCTION
+  DynRankView( pointer_type arg_ptr
+      , const size_t arg_N0 = ~size_t(0)
+      , const size_t arg_N1 = ~size_t(0)
+      , const size_t arg_N2 = ~size_t(0)
+      , const size_t arg_N3 = ~size_t(0)
+      , const size_t arg_N4 = ~size_t(0)
+      , const size_t arg_N5 = ~size_t(0)
+      , const size_t arg_N6 = ~size_t(0)
+      , const size_t arg_N7 = ~size_t(0)
+      )
+    : DynRankView( Kokkos::Impl::ViewCtorProp<pointer_type>(arg_ptr) , arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7 )
+    {}
+
+  explicit KOKKOS_INLINE_FUNCTION
+  DynRankView( pointer_type arg_ptr
+      , typename traits::array_layout & arg_layout
+      )
+    : DynRankView( Kokkos::Impl::ViewCtorProp<pointer_type>(arg_ptr) , arg_layout )
+    {}
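+
+  // Example (illustrative sketch; `raw` is a hypothetical, suitably sized allocation):
+  //
+  //   double * raw = /* ... */;
+  //   Kokkos::DynRankView< double , Kokkos::HostSpace , Kokkos::MemoryUnmanaged >
+  //     u( raw , 10 , 3 );   // rank 2, wraps user memory, no allocation tracking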
+
+
+  //----------------------------------------
+  // Shared scratch memory constructor
+
+  static inline
+  size_t shmem_size( const size_t arg_N0 = ~size_t(0) ,
+                     const size_t arg_N1 = ~size_t(0) ,
+                     const size_t arg_N2 = ~size_t(0) ,
+                     const size_t arg_N3 = ~size_t(0) ,
+                     const size_t arg_N4 = ~size_t(0) ,
+                     const size_t arg_N5 = ~size_t(0) ,
+                     const size_t arg_N6 = ~size_t(0) ,
+                     const size_t arg_N7 = ~size_t(0) )
+  {
+    const size_t num_passed_args =
+      ( arg_N0 != ~size_t(0) ) + ( arg_N1 != ~size_t(0) ) + ( arg_N2 != ~size_t(0) ) +
+      ( arg_N3 != ~size_t(0) ) + ( arg_N4 != ~size_t(0) ) + ( arg_N5 != ~size_t(0) ) +
+      ( arg_N6 != ~size_t(0) ) + ( arg_N7 != ~size_t(0) );
+
+    if ( std::is_same<typename traits::specialize , void>::value && num_passed_args != traits::rank_dynamic ) {
+      Kokkos::abort( "Kokkos::DynRankView::shmem_size() rank_dynamic != number of arguments.\n" );
+    }
+
+    return map_type::memory_span(
+           typename traits::array_layout
+            ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+            , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) );
+  }
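+
+  // Example (illustrative sketch; `policy` and `team` are hypothetical TeamPolicy
+  // and team-member objects): shmem_size() reports the per-team scratch bytes to
+  // request for a shared-memory DynRankView, e.g.
+  //
+  //   using drv_t = Kokkos::DynRankView< double
+  //                   , Kokkos::DefaultExecutionSpace::scratch_memory_space
+  //                   , Kokkos::MemoryUnmanaged >;
+  //   const size_t bytes = drv_t::shmem_size( 16 , 8 );
+  //   policy.set_scratch_size( 0 , Kokkos::PerTeam( bytes ) );
+  //   // inside the kernel:  drv_t tmp( team.team_scratch(0) , 16 , 8 );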
+
+  explicit KOKKOS_INLINE_FUNCTION
+  DynRankView( const typename traits::execution_space::scratch_memory_space & arg_space
+      , const typename traits::array_layout & arg_layout )
+    : DynRankView( Kokkos::Impl::ViewCtorProp<pointer_type>(
+              reinterpret_cast<pointer_type>(
+                arg_space.get_shmem( map_type::memory_span(
+                  Impl::DynRankDimTraits<typename traits::specialize>::createLayout( arg_layout ) //is this correct?
+                ) ) ) )
+         , arg_layout )
+     {}
+
+  explicit KOKKOS_INLINE_FUNCTION
+  DynRankView( const typename traits::execution_space::scratch_memory_space & arg_space
+      , const size_t arg_N0 = ~size_t(0)
+      , const size_t arg_N1 = ~size_t(0)
+      , const size_t arg_N2 = ~size_t(0)
+      , const size_t arg_N3 = ~size_t(0)
+      , const size_t arg_N4 = ~size_t(0)
+      , const size_t arg_N5 = ~size_t(0)
+      , const size_t arg_N6 = ~size_t(0)
+      , const size_t arg_N7 = ~size_t(0) )
+
+    : DynRankView( Kokkos::Impl::ViewCtorProp<pointer_type>(
+                   reinterpret_cast<pointer_type>(
+                     arg_space.get_shmem(
+                       map_type::memory_span(
+                       Impl::DynRankDimTraits<typename traits::specialize>::createLayout(
+                       typename traits::array_layout
+                       ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+                       , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ) ) ) )
+                    )
+                  , typename traits::array_layout
+                    ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+                    , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+        )
+    {}
+
+};
+
+
+  template < typename D , class ... P >
+  KOKKOS_INLINE_FUNCTION
+  constexpr unsigned rank( const DynRankView<D , P...> & DRV ) { return DRV.rank(); } // needed for the transition to a common constexpr rank() method in View and DynRankView
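+
+  // Example (illustrative sketch; assumes the corresponding View overload of rank()
+  // is also available):
+  //
+  //   Kokkos::DynRankView<int> d( "D" , 2 , 3 , 4 );
+  //   // rank( d ) == 3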
+
+//----------------------------------------------------------------------------
+// Subview mapping.
+// Deduce destination view type from source view traits and subview arguments
+
+namespace Impl {
+
+struct DynRankSubviewTag {};
+
+} // namespace Impl
+
+namespace Impl {
+
+template< class SrcTraits , class ... Args >
+struct ViewMapping
+  < typename std::enable_if<(
+      std::is_same< typename SrcTraits::specialize , void >::value
+      &&
+      (
+        std::is_same< typename SrcTraits::array_layout
+                    , Kokkos::LayoutLeft >::value ||
+        std::is_same< typename SrcTraits::array_layout
+                    , Kokkos::LayoutRight >::value ||
+        std::is_same< typename SrcTraits::array_layout
+                    , Kokkos::LayoutStride >::value
+      )
+    ), Kokkos::Impl::DynRankSubviewTag >::type
+  , SrcTraits
+  , Args ... >
+{
+private:
+
+  enum
+    { RZ = false
+    , R0 = bool(is_integral_extent<0,Args...>::value)
+    , R1 = bool(is_integral_extent<1,Args...>::value)
+    , R2 = bool(is_integral_extent<2,Args...>::value)
+    , R3 = bool(is_integral_extent<3,Args...>::value)
+    , R4 = bool(is_integral_extent<4,Args...>::value)
+    , R5 = bool(is_integral_extent<5,Args...>::value)
+    , R6 = bool(is_integral_extent<6,Args...>::value)
+    };
+
+  enum { rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3)
+              + unsigned(R4) + unsigned(R5) + unsigned(R6) };
+
+  typedef Kokkos::LayoutStride array_layout ;
+
+  typedef typename SrcTraits::value_type  value_type ;
+
+  typedef value_type******* data_type ;
+
+public:
+
+  typedef Kokkos::ViewTraits
+    < data_type
+    , array_layout
+    , typename SrcTraits::device_type
+    , typename SrcTraits::memory_traits > traits_type ;
+
+  typedef Kokkos::View
+    < data_type
+    , array_layout
+    , typename SrcTraits::device_type
+    , typename SrcTraits::memory_traits > type ;
+
+
+  template< class MemoryTraits >
+  struct apply {
+
+    static_assert( Kokkos::Impl::is_memory_traits< MemoryTraits >::value , "" );
+
+    typedef Kokkos::ViewTraits
+      < data_type
+      , array_layout
+      , typename SrcTraits::device_type
+      , MemoryTraits > traits_type ;
+
+    typedef Kokkos::View
+      < data_type
+      , array_layout
+      , typename SrcTraits::device_type
+      , MemoryTraits > type ;
+  };
+
+
+  typedef typename SrcTraits::dimension dimension ;
+
+  template < class Arg0 = int, class Arg1 = int, class Arg2 = int, class Arg3 = int, class Arg4 = int, class Arg5 = int, class Arg6 = int >
+  struct ExtentGenerator {
+    KOKKOS_INLINE_FUNCTION
+    static SubviewExtents< 7 , rank > generator ( const dimension & dim , Arg0 arg0 = Arg0(), Arg1 arg1 = Arg1(), Arg2 arg2 = Arg2(), Arg3 arg3 = Arg3(), Arg4 arg4 = Arg4(), Arg5 arg5 = Arg5(), Arg6 arg6 = Arg6() )
+    {
+       return SubviewExtents< 7 , rank>( dim , arg0 , arg1 , arg2 , arg3 , arg4 , arg5 , arg6 );
+    }
+  };
+
+
+  typedef Kokkos::DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits >  ret_type;
+
+  template < typename T , class ... P >
+  KOKKOS_INLINE_FUNCTION
+  static ret_type subview( const unsigned src_rank , Kokkos::DynRankView< T , P...> const & src
+                    , Args ... args )
+    {
+
+       typedef ViewMapping< traits_type, void >  DstType ;
+
+       typedef typename std::conditional< (rank==0) , ViewDimension<>
+                                                    , typename std::conditional< (rank==1) , ViewDimension<0>
+                                                    , typename std::conditional< (rank==2) , ViewDimension<0,0>
+                                                    , typename std::conditional< (rank==3) , ViewDimension<0,0,0>
+                                                    , typename std::conditional< (rank==4) , ViewDimension<0,0,0,0>
+                                                    , typename std::conditional< (rank==5) , ViewDimension<0,0,0,0,0>
+                                                    , typename std::conditional< (rank==6) , ViewDimension<0,0,0,0,0,0>
+                                                                                           , ViewDimension<0,0,0,0,0,0,0>
+                                                    >::type >::type >::type >::type >::type >::type >::type  DstDimType ;
+
+      typedef ViewOffset< DstDimType , Kokkos::LayoutStride > dst_offset_type ;
+      typedef typename DstType::handle_type  dst_handle_type ;
+
+      ret_type dst ;
+
+      const SubviewExtents< 7 , rank > extents =
+        ExtentGenerator< Args ... >::generator( src.m_map.m_offset.m_dim , args... ) ;
+
+      dst_offset_type tempdst( src.m_map.m_offset , extents ) ;
+
+      dst.m_track = src.m_track ;
+
+      dst.m_map.m_offset.m_dim.N0 = tempdst.m_dim.N0 ;
+      dst.m_map.m_offset.m_dim.N1 = tempdst.m_dim.N1 ;
+      dst.m_map.m_offset.m_dim.N2 = tempdst.m_dim.N2 ;
+      dst.m_map.m_offset.m_dim.N3 = tempdst.m_dim.N3 ;
+      dst.m_map.m_offset.m_dim.N4 = tempdst.m_dim.N4 ;
+      dst.m_map.m_offset.m_dim.N5 = tempdst.m_dim.N5 ;
+      dst.m_map.m_offset.m_dim.N6 = tempdst.m_dim.N6 ;
+
+      dst.m_map.m_offset.m_stride.S0 = tempdst.m_stride.S0 ;
+      dst.m_map.m_offset.m_stride.S1 = tempdst.m_stride.S1 ;
+      dst.m_map.m_offset.m_stride.S2 = tempdst.m_stride.S2 ;
+      dst.m_map.m_offset.m_stride.S3 = tempdst.m_stride.S3 ;
+      dst.m_map.m_offset.m_stride.S4 = tempdst.m_stride.S4 ;
+      dst.m_map.m_offset.m_stride.S5 = tempdst.m_stride.S5 ;
+      dst.m_map.m_offset.m_stride.S6 = tempdst.m_stride.S6 ;
+
+      dst.m_map.m_handle = dst_handle_type( src.m_map.m_handle +
+                                      src.m_map.m_offset( extents.domain_offset(0)
+                                                  , extents.domain_offset(1)
+                                                  , extents.domain_offset(2)
+                                                  , extents.domain_offset(3)
+                                                  , extents.domain_offset(4)
+                                                  , extents.domain_offset(5)
+                                                  , extents.domain_offset(6)
+                                                  ) );
+
+      dst.m_rank = ( src_rank > 0 ? unsigned(R0) : 0 )
+                 + ( src_rank > 1 ? unsigned(R1) : 0 )
+                 + ( src_rank > 2 ? unsigned(R2) : 0 )
+                 + ( src_rank > 3 ? unsigned(R3) : 0 )
+                 + ( src_rank > 4 ? unsigned(R4) : 0 )
+                 + ( src_rank > 5 ? unsigned(R5) : 0 )
+                 + ( src_rank > 6 ? unsigned(R6) : 0 ) ;
+
+      return dst ;
+    }
+};
+
+} // end Impl
+
+
+template< class V , class ... Args >
+using Subdynrankview = typename Kokkos::Impl::ViewMapping< Kokkos::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
+
+template< class D , class ... P , class ...Args >
+KOKKOS_INLINE_FUNCTION
+Subdynrankview< ViewTraits<D******* , P...> , Args... >
+subdynrankview( const Kokkos::DynRankView< D , P... > &src , Args...args)
+  {
+    if ( src.rank() > sizeof...(Args) ) //allow sizeof...(Args) >= src.rank(), ignore the remaining args
+      { Kokkos::abort("subdynrankview: num of args must be >= rank of the source DynRankView"); }
+
+    typedef Kokkos::Impl::ViewMapping< Kokkos::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
+
+    return metafcn::subview( src.rank() , src , args... );
+  }
+
+// Wrapper to allow use of the generic subview function name with DynRankView
+template< class D , class ... P , class ...Args >
+KOKKOS_INLINE_FUNCTION
+Subdynrankview< ViewTraits<D******* , P...> , Args... >
+subview( const Kokkos::DynRankView< D , P... > &src , Args...args)
+  {
+    return subdynrankview( src , args... );
+  }
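+// Illustrative usage sketch (hypothetical names, not part of this header):
+//   Kokkos::DynRankView<double> a("a", 10, 10);               // runtime rank 2
+//   auto row  = Kokkos::subdynrankview(a, 3, Kokkos::ALL());  // rank-1 slice of row 3
+//   auto same = Kokkos::subview(a, 3, Kokkos::ALL());         // equivalent via the wrapper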
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+// overload == and !=
+template< class LT , class ... LP , class RT , class ... RP >
+KOKKOS_INLINE_FUNCTION
+bool operator == ( const DynRankView<LT,LP...> & lhs ,
+                   const DynRankView<RT,RP...> & rhs )
+{
+  // Same data, layout, dimensions
+  typedef ViewTraits<LT,LP...>  lhs_traits ;
+  typedef ViewTraits<RT,RP...>  rhs_traits ;
+
+  return
+    std::is_same< typename lhs_traits::const_value_type ,
+                  typename rhs_traits::const_value_type >::value &&
+    std::is_same< typename lhs_traits::array_layout ,
+                  typename rhs_traits::array_layout >::value &&
+    std::is_same< typename lhs_traits::memory_space ,
+                  typename rhs_traits::memory_space >::value &&
+    lhs.rank()       ==  rhs.rank() &&
+    lhs.data()       == rhs.data() &&
+    lhs.span()       == rhs.span() &&
+    lhs.dimension(0) == rhs.dimension(0) &&
+    lhs.dimension(1) == rhs.dimension(1) &&
+    lhs.dimension(2) == rhs.dimension(2) &&
+    lhs.dimension(3) == rhs.dimension(3) &&
+    lhs.dimension(4) == rhs.dimension(4) &&
+    lhs.dimension(5) == rhs.dimension(5) &&
+    lhs.dimension(6) == rhs.dimension(6) &&
+    lhs.dimension(7) == rhs.dimension(7);
+}
+
+template< class LT , class ... LP , class RT , class ... RP >
+KOKKOS_INLINE_FUNCTION
+bool operator != ( const DynRankView<LT,LP...> & lhs ,
+                   const DynRankView<RT,RP...> & rhs )
+{
+  return ! ( operator==(lhs,rhs) );
+}
+
+} //end Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+namespace Kokkos {
+namespace Impl {
+
+template< class OutputView , typename Enable = void >
+struct DynRankViewFill {
+
+  typedef typename OutputView::traits::const_value_type  const_value_type ;
+
+  const OutputView output ;
+  const_value_type input ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i0 ) const
+  {
+    const size_t n1 = output.dimension_1();
+    const size_t n2 = output.dimension_2();
+    const size_t n3 = output.dimension_3();
+    const size_t n4 = output.dimension_4();
+    const size_t n5 = output.dimension_5();
+    const size_t n6 = output.dimension_6();
+
+    for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
+    for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
+    for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) {
+    for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) {
+    for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) {
+    for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) {
+      output.access(i0,i1,i2,i3,i4,i5,i6) = input ;
+    }}}}}}
+  }
+
+  DynRankViewFill( const OutputView & arg_out , const_value_type & arg_in )
+    : output( arg_out ), input( arg_in )
+    {
+      typedef typename OutputView::execution_space  execution_space ;
+      typedef Kokkos::RangePolicy< execution_space > Policy ;
+
+      const Kokkos::Impl::ParallelFor< DynRankViewFill , Policy > closure( *this , Policy( 0 , output.dimension_0() ) );
+
+      closure.execute();
+
+      execution_space::fence();
+    }
+};
+
+template< class OutputView >
+struct DynRankViewFill< OutputView , typename std::enable_if< OutputView::Rank == 0 >::type > {
+  DynRankViewFill( const OutputView & dst , const typename OutputView::const_value_type & src )
+    {
+      Kokkos::Impl::DeepCopy< typename OutputView::memory_space , Kokkos::HostSpace >
+        ( dst.data() , & src , sizeof(typename OutputView::const_value_type) );
+    }
+};
+
+template< class OutputView , class InputView , class ExecSpace = typename OutputView::execution_space >
+struct DynRankViewRemap {
+
+  const OutputView output ;
+  const InputView  input ;
+  const size_t n0 ;
+  const size_t n1 ;
+  const size_t n2 ;
+  const size_t n3 ;
+  const size_t n4 ;
+  const size_t n5 ;
+  const size_t n6 ;
+  const size_t n7 ;
+
+  DynRankViewRemap( const OutputView & arg_out , const InputView & arg_in )
+    : output( arg_out ), input( arg_in )
+    , n0( std::min( (size_t)arg_out.extent(0) , (size_t)arg_in.extent(0) ) )
+    , n1( std::min( (size_t)arg_out.extent(1) , (size_t)arg_in.extent(1) ) )
+    , n2( std::min( (size_t)arg_out.extent(2) , (size_t)arg_in.extent(2) ) )
+    , n3( std::min( (size_t)arg_out.extent(3) , (size_t)arg_in.extent(3) ) )
+    , n4( std::min( (size_t)arg_out.extent(4) , (size_t)arg_in.extent(4) ) )
+    , n5( std::min( (size_t)arg_out.extent(5) , (size_t)arg_in.extent(5) ) )
+    , n6( std::min( (size_t)arg_out.extent(6) , (size_t)arg_in.extent(6) ) )
+    , n7( std::min( (size_t)arg_out.extent(7) , (size_t)arg_in.extent(7) ) )
+    {
+      typedef Kokkos::RangePolicy< ExecSpace > Policy ;
+      const Kokkos::Impl::ParallelFor< DynRankViewRemap , Policy > closure( *this , Policy( 0 , n0 ) );
+      closure.execute();
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i0 ) const
+  {
+    for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
+    for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
+    for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) {
+    for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) {
+    for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) {
+    for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) {
+      output.access(i0,i1,i2,i3,i4,i5,i6) = input.access(i0,i1,i2,i3,i4,i5,i6);
+    }}}}}}
+  }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+
+namespace Kokkos {
+
+/** \brief  Deep copy a value from Host memory into a view.  */
+template< class DT , class ... DP >
+inline
+void deep_copy
+  ( const DynRankView<DT,DP...> & dst
+  , typename ViewTraits<DT,DP...>::const_value_type & value
+  , typename std::enable_if<
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value
+    >::type * = 0 )
+{
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::non_const_value_type ,
+                  typename ViewTraits<DT,DP...>::value_type >::value
+    , "deep_copy requires non-const type" );
+
+  Kokkos::Impl::DynRankViewFill< DynRankView<DT,DP...> >( dst , value );
+}
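+// Illustrative usage sketch (hypothetical names): fill every entry with one value.
+//   Kokkos::DynRankView<double> a("a", 5, 5);
+//   Kokkos::deep_copy(a, 3.0);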
+
+/** \brief  Deep copy into a value in Host memory from a view.  */
+template< class ST , class ... SP >
+inline
+void deep_copy
+  ( typename ViewTraits<ST,SP...>::non_const_value_type & dst
+  , const DynRankView<ST,SP...> & src
+  , typename std::enable_if<
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value
+    >::type * = 0 )
+{
+  if ( src.rank() != 0 )
+  {
+    Kokkos::abort("");
+  }
+
+  typedef ViewTraits<ST,SP...>               src_traits ;
+  typedef typename src_traits::memory_space  src_memory_space ;
+  Kokkos::Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.data() , sizeof(ST) );
+}
+
+//----------------------------------------------------------------------------
+/** \brief  A deep copy between views of the default specialization, compatible type,
+ *          same rank, same contiguous layout.
+ */
+template< class DstType , class SrcType >
+inline
+void deep_copy
+  ( const DstType & dst
+  , const SrcType & src
+  , typename std::enable_if<(
+    std::is_same< typename DstType::traits::specialize , void >::value &&
+    std::is_same< typename SrcType::traits::specialize , void >::value
+    &&
+    ( Kokkos::is_dyn_rank_view<DstType>::value || Kokkos::is_dyn_rank_view<SrcType>::value)
+  )>::type * = 0 )
+{
+  static_assert(
+    std::is_same< typename DstType::traits::value_type ,
+                  typename DstType::traits::non_const_value_type >::value
+    , "deep_copy requires non-const destination type" );
+
+  typedef DstType  dst_type ;
+  typedef SrcType  src_type ;
+
+  typedef typename dst_type::execution_space  dst_execution_space ;
+  typedef typename src_type::execution_space  src_execution_space ;
+  typedef typename dst_type::memory_space     dst_memory_space ;
+  typedef typename src_type::memory_space     src_memory_space ;
+
+  enum { DstExecCanAccessSrc =
+   Kokkos::Impl::SpaceAccessibility< dst_execution_space , src_memory_space >::accessible };
+
+  enum { SrcExecCanAccessDst =
+   Kokkos::Impl::SpaceAccessibility< src_execution_space , dst_memory_space >::accessible };
+
+  if ( (void *) dst.data() != (void*) src.data() ) {
+
+    // Concern: If overlapping views then a parallel copy will be erroneous.
+    // ...
+
+    // If same type, equal layout, equal dimensions, equal span, and contiguous memory then can byte-wise copy
+    if ( rank(src) == 0 && rank(dst) == 0 )
+    {
+      typedef typename dst_type::value_type    value_type ;
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , sizeof(value_type) );
+    }
+    else if ( std::is_same< typename DstType::traits::value_type ,
+                       typename SrcType::traits::non_const_value_type >::value &&
+         (
+           ( std::is_same< typename DstType::traits::array_layout ,
+                           typename SrcType::traits::array_layout >::value
+             &&
+             ( std::is_same< typename DstType::traits::array_layout ,
+                             typename Kokkos::LayoutLeft>::value
+             ||
+               std::is_same< typename DstType::traits::array_layout ,
+                             typename Kokkos::LayoutRight>::value
+             )
+           )
+           ||
+           (
+             rank(dst) == 1
+             &&
+             rank(src) == 1
+           )
+         ) &&
+         dst.span_is_contiguous() &&
+         src.span_is_contiguous() &&
+         dst.span() == src.span() &&
+         dst.extent(0) == src.extent(0) &&
+
+         dst.extent(1) == src.extent(1) &&
+         dst.extent(2) == src.extent(2) &&
+         dst.extent(3) == src.extent(3) &&
+         dst.extent(4) == src.extent(4) &&
+         dst.extent(5) == src.extent(5) &&
+         dst.extent(6) == src.extent(6) &&
+         dst.extent(7) == src.extent(7) ) {
+
+      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
+
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes );
+    }
+    else if ( std::is_same< typename DstType::traits::value_type ,
+                            typename SrcType::traits::non_const_value_type >::value &&
+         (
+           ( std::is_same< typename DstType::traits::array_layout ,
+                           typename SrcType::traits::array_layout >::value
+             &&
+             std::is_same< typename DstType::traits::array_layout ,
+                          typename Kokkos::LayoutStride>::value
+           )
+           ||
+           (
+             rank(dst) == 1
+             &&
+             rank(src) == 1
+           )
+         ) &&
+         dst.span_is_contiguous() &&
+         src.span_is_contiguous() &&
+         dst.span() == src.span() &&
+         dst.extent(0) == src.extent(0) &&
+         dst.extent(1) == src.extent(1) &&
+         dst.extent(2) == src.extent(2) &&
+         dst.extent(3) == src.extent(3) &&
+         dst.extent(4) == src.extent(4) &&
+         dst.extent(5) == src.extent(5) &&
+         dst.extent(6) == src.extent(6) &&
+         dst.extent(7) == src.extent(7) &&
+         dst.stride_0() == src.stride_0() &&
+         dst.stride_1() == src.stride_1() &&
+         dst.stride_2() == src.stride_2() &&
+         dst.stride_3() == src.stride_3() &&
+         dst.stride_4() == src.stride_4() &&
+         dst.stride_5() == src.stride_5() &&
+         dst.stride_6() == src.stride_6() &&
+         dst.stride_7() == src.stride_7()
+         ) {
+
+      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
+
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes );
+    }
+    else if ( DstExecCanAccessSrc ) {
+      // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+      Kokkos::Impl::DynRankViewRemap< dst_type , src_type >( dst , src );
+    }
+    else if ( SrcExecCanAccessDst ) {
+      // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+      Kokkos::Impl::DynRankViewRemap< dst_type , src_type , src_execution_space >( dst , src );
+    }
+    else {
+      Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
+    }
+  }
+}
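+// Illustrative usage sketch (hypothetical names): copy between two compatible DynRankViews.
+//   Kokkos::DynRankView<double> a("a", 4, 4), b("b", 4, 4);
+//   Kokkos::deep_copy(a, 1.0);
+//   Kokkos::deep_copy(b, a);   // byte-wise copy when layout/span match, element remap otherwise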
+
+} //end Kokkos
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+
+// Deduce Mirror Types
+template<class Space, class T, class ... P>
+struct MirrorDRViewType {
+  // The incoming view_type
+  typedef typename Kokkos::DynRankView<T,P...> src_view_type;
+  // The memory space for the mirror view
+  typedef typename Space::memory_space memory_space;
+  // Check whether it is the same memory space
+  enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
+  // The array_layout
+  typedef typename src_view_type::array_layout array_layout;
+  // The data type (non-const, since otherwise we could not even deep_copy to the mirror).
+  typedef typename src_view_type::non_const_data_type data_type;
+  // The destination view type if it is not the same memory space
+  typedef Kokkos::DynRankView<data_type,array_layout,Space> dest_view_type;
+  // If it is the same memory_space, return the existing view_type.
+  // This will also keep the unmanaged trait if necessary
+  typedef typename std::conditional<is_same_memspace,src_view_type,dest_view_type>::type view_type;
+};
+
+template<class Space, class T, class ... P>
+struct MirrorDRVType {
+  // The incoming view_type
+  typedef typename Kokkos::DynRankView<T,P...> src_view_type;
+  // The memory space for the mirror view
+  typedef typename Space::memory_space memory_space;
+  // Check whether it is the same memory space
+  enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
+  // The array_layout
+  typedef typename src_view_type::array_layout array_layout;
+  // The data type (non-const, since otherwise we could not even deep_copy to the mirror).
+  typedef typename src_view_type::non_const_data_type data_type;
+  // The destination view type if it is not the same memory space
+  typedef Kokkos::DynRankView<data_type,array_layout,Space> view_type;
+};
+
+}
+
+template< class T , class ... P >
+inline
+typename DynRankView<T,P...>::HostMirror
+create_mirror( const DynRankView<T,P...> & src
+             , typename std::enable_if<
+                 ! std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
+                               , Kokkos::LayoutStride >::value
+               >::type * = 0
+             )
+{
+  typedef DynRankView<T,P...>                   src_type ;
+  typedef typename src_type::HostMirror  dst_type ;
+
+  return dst_type( std::string( src.label() ).append("_mirror")
+                 , Impl::reconstructLayout(src.layout(), src.rank()) );
+}
+
+
+template< class T , class ... P >
+inline
+typename DynRankView<T,P...>::HostMirror
+create_mirror( const DynRankView<T,P...> & src
+             , typename std::enable_if<
+                 std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
+                             , Kokkos::LayoutStride >::value
+               >::type * = 0
+             )
+{
+  typedef DynRankView<T,P...>                   src_type ;
+  typedef typename src_type::HostMirror  dst_type ;
+
+  return dst_type( std::string( src.label() ).append("_mirror")
+                 , Impl::reconstructLayout(src.layout(), src.rank()) );
+}
+
+
+// Create a mirror in a new space (specialization for different space)
+template<class Space, class T, class ... P>
+typename Impl::MirrorDRVType<Space,T,P ...>::view_type create_mirror(const Space& , const Kokkos::DynRankView<T,P...> & src) {
+  return typename Impl::MirrorDRVType<Space,T,P ...>::view_type(src.label(), Impl::reconstructLayout(src.layout(), src.rank()) );
+}
+
+template< class T , class ... P >
+inline
+typename DynRankView<T,P...>::HostMirror
+create_mirror_view( const DynRankView<T,P...> & src
+                  , typename std::enable_if<(
+                      std::is_same< typename DynRankView<T,P...>::memory_space
+                                  , typename DynRankView<T,P...>::HostMirror::memory_space
+                                  >::value
+                      &&
+                      std::is_same< typename DynRankView<T,P...>::data_type
+                                  , typename DynRankView<T,P...>::HostMirror::data_type
+                                  >::value
+                    )>::type * = 0
+                  )
+{
+  return src ;
+}
+
+template< class T , class ... P >
+inline
+typename DynRankView<T,P...>::HostMirror
+create_mirror_view( const DynRankView<T,P...> & src
+                  , typename std::enable_if< ! (
+                      std::is_same< typename DynRankView<T,P...>::memory_space
+                                  , typename DynRankView<T,P...>::HostMirror::memory_space
+                                  >::value
+                      &&
+                      std::is_same< typename DynRankView<T,P...>::data_type
+                                  , typename DynRankView<T,P...>::HostMirror::data_type
+                                  >::value
+                    )>::type * = 0
+                  )
+{
+  return Kokkos::create_mirror( src );
+}
+
+// Create a mirror view in a new space (specialization for same space)
+template<class Space, class T, class ... P>
+typename Impl::MirrorDRViewType<Space,T,P ...>::view_type
+create_mirror_view(const Space& , const Kokkos::DynRankView<T,P...> & src
+  , typename std::enable_if<Impl::MirrorDRViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
+  return src;
+}
+
+// Create a mirror view in a new space (specialization for different space)
+template<class Space, class T, class ... P>
+typename Impl::MirrorDRViewType<Space,T,P ...>::view_type
+create_mirror_view(const Space& , const Kokkos::DynRankView<T,P...> & src
+  , typename std::enable_if<!Impl::MirrorDRViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
+  return typename Impl::MirrorDRViewType<Space,T,P ...>::view_type(src.label(), Impl::reconstructLayout(src.layout(), src.rank()) );
+}
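+// Illustrative usage sketch (hypothetical names): host mirror round trip for a device view.
+//   Kokkos::DynRankView<double> d("d", 8, 8);
+//   auto h = Kokkos::create_mirror_view(d);   // returns d itself when already host-accessible
+//   Kokkos::deep_copy(h, d);
+//   h(0, 0) = 42.0;
+//   Kokkos::deep_copy(d, h);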
+
+} //end Kokkos
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+/** \brief  Resize a view with copying old data to new data at the corresponding indices. */
+template< class T , class ... P >
+inline
+void resize( DynRankView<T,P...> & v ,
+             const size_t n0 = ~size_t(0) ,
+             const size_t n1 = ~size_t(0) ,
+             const size_t n2 = ~size_t(0) ,
+             const size_t n3 = ~size_t(0) ,
+             const size_t n4 = ~size_t(0) ,
+             const size_t n5 = ~size_t(0) ,
+             const size_t n6 = ~size_t(0) ,
+             const size_t n7 = ~size_t(0) )
+{
+  typedef DynRankView<T,P...> drview_type ;
+
+  static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only resize managed views" );
+
+  drview_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6 );
+
+  Kokkos::Impl::DynRankViewRemap< drview_type , drview_type >( v_resized, v );
+
+  v = v_resized ;
+}
+
+/** \brief  Reallocate a view, discarding the old data. */
+template< class T , class ... P >
+inline
+void realloc( DynRankView<T,P...> & v ,
+              const size_t n0 = ~size_t(0) ,
+              const size_t n1 = ~size_t(0) ,
+              const size_t n2 = ~size_t(0) ,
+              const size_t n3 = ~size_t(0) ,
+              const size_t n4 = ~size_t(0) ,
+              const size_t n5 = ~size_t(0) ,
+              const size_t n6 = ~size_t(0) ,
+              const size_t n7 = ~size_t(0) )
+{
+  typedef DynRankView<T,P...>  drview_type ;
+
+  static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only realloc managed views" );
+
+  const std::string label = v.label();
+
+  v = drview_type(); // Deallocate first, if the only view to allocation
+  v = drview_type( label, n0, n1, n2, n3, n4, n5, n6 );
+}
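+// Illustrative usage sketch (hypothetical names): grow a view, keeping or discarding contents.
+//   Kokkos::DynRankView<double> a("a", 10, 10);
+//   Kokkos::resize(a, 20, 20);    // old entries are copied into the overlapping index range
+//   Kokkos::realloc(a, 30, 30);   // reallocated; previous contents are not preserved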
+
+} //end Kokkos
+
+#endif
+
diff --git a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4c1e0ef72102131b9f1d5a2192e05451c873f1c7
--- /dev/null
+++ b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -0,0 +1,578 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_DYNAMIC_VIEW_HPP
+#define KOKKOS_DYNAMIC_VIEW_HPP
+
+#include <cstdio>
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+namespace Kokkos {
+namespace Experimental {
+
+// Simple metafunction for choosing the memory space of the chunk-pointer array.
+// In the current implementation, if memory_space == CudaSpace, CudaUVMSpace is
+// used for the chunk 'array' allocation, which will contain pointers to chunks
+// of memory allocated in CudaSpace.
+namespace Impl {
+template < class MemSpace >
+struct ChunkArraySpace {
+  using memory_space = MemSpace;
+};
+
+#ifdef KOKKOS_ENABLE_CUDA
+template <>
+struct ChunkArraySpace< Kokkos::CudaSpace > {
+  using memory_space = Kokkos::CudaUVMSpace;
+};
+#endif
+#ifdef KOKKOS_ENABLE_ROCM
+template <>
+struct ChunkArraySpace< Kokkos::Experimental::ROCmSpace > {
+  using memory_space = Kokkos::Experimental::ROCmHostPinnedSpace;
+};
+#endif
+} // end namespace Impl
+
+/** \brief Dynamic views are restricted to rank-one and no layout.
+ *         Resize only occurs on host outside of parallel_regions.
+ *         Subviews are not allowed.
+ */
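+// Illustrative usage sketch (hypothetical names; resize_serial must be called on the host):
+//   Kokkos::Experimental::DynamicView<double*> dv("dv", 1024, 1 << 20);  // min chunk size, max extent
+//   dv.resize_serial(10000);
+//   Kokkos::parallel_for(10000, KOKKOS_LAMBDA(const int i) { dv(i) = double(i); });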
+template< typename DataType , typename ... P >
+class DynamicView : public Kokkos::ViewTraits< DataType , P ... >
+{
+public:
+
+  typedef Kokkos::ViewTraits< DataType , P ... >  traits ;
+
+private:
+
+  template< class , class ... > friend class DynamicView ;
+
+  typedef Kokkos::Impl::SharedAllocationTracker   track_type ;
+
+  static_assert( traits::rank == 1 && traits::rank_dynamic == 1
+               , "DynamicView must be rank-one" );
+
+  static_assert( std::is_trivial< typename traits::value_type >::value &&
+                 std::is_same< typename traits::specialize , void >::value &&
+                 Kokkos::Impl::is_power_of_two
+                   <sizeof(typename traits::value_type)>::value
+               , "DynamicView must have trivial value_type and sizeof(value_type) is a power-of-two");
+
+
+  template< class Space , bool = Kokkos::Impl::MemorySpaceAccess< Space , typename traits::memory_space >::accessible > struct verify_space
+    { KOKKOS_FORCEINLINE_FUNCTION static void check() {} };
+
+  template< class Space > struct verify_space<Space,false>
+    { KOKKOS_FORCEINLINE_FUNCTION static void check()
+        { Kokkos::abort("Kokkos::DynamicView ERROR: attempt to access inaccessible memory space"); };
+    };
+
+private:
+
+  track_type                     m_track ;
+  typename traits::value_type ** m_chunks ;      // array of pointers to 'chunks' of memory
+  unsigned                       m_chunk_shift ; // ceil(log2(m_chunk_size))
+  unsigned                       m_chunk_mask ;  // m_chunk_size - 1
+  unsigned                       m_chunk_max ;   // number of entries in the chunk array - each pointing to a chunk of extent == m_chunk_size entries
+  unsigned                       m_chunk_size ;  // 2 << (m_chunk_shift - 1)
+
+public:
+
+  //----------------------------------------------------------------------
+
+  /** \brief  Compatible view of array of scalar types */
+  typedef DynamicView< typename traits::data_type ,
+                       typename traits::device_type >
+    array_type ;
+
+  /** \brief  Compatible view of const data type */
+  typedef DynamicView< typename traits::const_data_type ,
+                       typename traits::device_type >
+    const_type ;
+
+  /** \brief  Compatible view of non-const data type */
+  typedef DynamicView< typename traits::non_const_data_type ,
+                       typename traits::device_type >
+    non_const_type ;
+
+  /** \brief  Must be accessible everywhere */
+  typedef DynamicView  HostMirror ;
+
+  //----------------------------------------------------------------------
+
+  enum { Rank = 1 };
+
+  KOKKOS_INLINE_FUNCTION
+  size_t allocation_extent() const noexcept
+    {
+      uintptr_t n = *reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max );
+      return (n << m_chunk_shift);
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t chunk_size() const noexcept
+    {
+      return m_chunk_size;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t size() const noexcept
+    {
+      size_t extent_0 = *reinterpret_cast<const size_t*>( m_chunks + m_chunk_max +1 );
+      return extent_0;
+    }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  size_t extent( const iType & r ) const
+    { return r == 0 ? size() : 1 ; }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  size_t extent_int( const iType & r ) const
+    { return r == 0 ? size() : 1 ; }
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+  KOKKOS_INLINE_FUNCTION size_t dimension_0() const { return size(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return 1 ; }
+#endif
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return 0 ; }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION void stride( iType * const s ) const { *s = 0 ; }
+
+  //----------------------------------------
+  // Allocation tracking properties
+
+  KOKKOS_INLINE_FUNCTION
+  int use_count() const
+    { return m_track.use_count(); }
+
+  inline
+  const std::string label() const
+    { return m_track.template get_label< typename traits::memory_space >(); }
+
+  //----------------------------------------------------------------------
+  // Range span is the span which contains all members.
+
+  typedef typename traits::value_type &  reference_type ;
+  typedef typename traits::value_type *  pointer_type ;
+
+  enum { reference_type_is_lvalue_reference = std::is_lvalue_reference< reference_type >::value };
+
+  KOKKOS_INLINE_FUNCTION constexpr bool   span_is_contiguous() const { return false ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { return 0 ; }
+
+  //----------------------------------------
+
+  template< typename I0 , class ... Args >
+  KOKKOS_INLINE_FUNCTION
+  reference_type operator()( const I0 & i0 , const Args & ... args ) const
+    {
+      static_assert( Kokkos::Impl::are_integral<I0,Args...>::value
+                   , "Indices must be integral type" );
+
+      DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
+
+      // Which chunk is being indexed.
+      const uintptr_t ic = uintptr_t( i0 >> m_chunk_shift );
+
+      typename traits::value_type * volatile * const ch = m_chunks + ic ;
+
+      // Do bounds checking if enabled or if the chunk pointer is zero.
+      // If not bounds checking then we assume a non-zero pointer is valid.
+
+#if ! defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+      if ( 0 == *ch )
+#endif
+      {
+        // Verify that allocation of the requested chunk is in progress.
+
+        // The allocated chunk counter is m_chunks[ m_chunk_max ]
+        const uintptr_t n =
+          *reinterpret_cast<uintptr_t volatile *>( m_chunks + m_chunk_max );
+
+        if ( n <= ic ) {
+          Kokkos::abort("Kokkos::DynamicView array bounds error");
+        }
+
+        // Allocation of this chunk is in progress
+        // so wait for allocation to complete.
+        while ( 0 == *ch );
+      }
+
+      return (*ch)[ i0 & m_chunk_mask ];
+    }
+
+  //----------------------------------------
+  /** \brief  Resizing in serial can grow or shrink the array size
+   *          up to the maximum number of chunks
+   * */
+  template< typename IntType >
+  inline
+  typename std::enable_if
+    < std::is_integral<IntType>::value &&
+      Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace
+                                     , typename Impl::ChunkArraySpace< typename traits::memory_space >::memory_space 
+                                     >::accessible
+    >::type
+  resize_serial( IntType const & n )
+    {
+      typedef typename traits::value_type value_type ;
+      typedef value_type * pointer_type ;
+
+      const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ; // New total number of chunks needed for resize
+
+      if ( m_chunk_max < NC ) {
+        Kokkos::abort("DynamicView::resize_serial exceeded maximum size");
+      }
+
+      // *m_chunks[m_chunk_max] stores the current number of chunks being used
+      uintptr_t * const pc =
+        reinterpret_cast<uintptr_t*>( m_chunks + m_chunk_max );
+
+      if ( *pc < NC ) {
+        while ( *pc < NC ) {
+          m_chunks[*pc] = reinterpret_cast<pointer_type>
+            ( 
+             typename traits::memory_space().allocate( sizeof(value_type) << m_chunk_shift ) 
+            );
+          ++*pc ;
+        }
+      }
+      else {
+        while ( NC + 1 <= *pc ) {
+          --*pc ;
+          typename traits::memory_space().deallocate( m_chunks[*pc]
+                                         , sizeof(value_type) << m_chunk_shift );
+          m_chunks[*pc] = 0 ;
+        }
+      }
+      // *m_chunks[m_chunk_max+1] stores the 'extent' requested by resize
+      *(pc+1) = n;
+    }
+
+  //----------------------------------------------------------------------
+
+  ~DynamicView() = default ;
+  DynamicView() = default ;
+  DynamicView( DynamicView && ) = default ;
+  DynamicView( const DynamicView & ) = default ;
+  DynamicView & operator = ( DynamicView && ) = default ;
+  DynamicView & operator = ( const DynamicView & ) = default ;
+
+  template< class RT , class ... RP >
+  DynamicView( const DynamicView<RT,RP...> & rhs )
+    : m_track( rhs.m_track )
+    , m_chunks( (typename traits::value_type **) rhs.m_chunks )
+    , m_chunk_shift( rhs.m_chunk_shift )
+    , m_chunk_mask( rhs.m_chunk_mask )
+    , m_chunk_max( rhs.m_chunk_max )
+    , m_chunk_size( rhs.m_chunk_size )
+    {
+      typedef typename DynamicView<RT,RP...>::traits  SrcTraits ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void >  Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible DynamicView copy construction" );
+    }
+
+  //----------------------------------------------------------------------
+
+  struct Destroy {
+    typename traits::value_type ** m_chunks ;
+    unsigned                       m_chunk_max ;
+    bool                           m_destroy ;
+    unsigned                       m_chunk_size ;
+
+    // Initialize or destroy array of chunk pointers.
+    // Two entries beyond the max chunks are allocation counters.
+    inline
+    void operator()( unsigned i ) const
+      {
+        if ( m_destroy && i < m_chunk_max && 0 != m_chunks[i] ) {
+          typename traits::memory_space().deallocate( m_chunks[i], m_chunk_size );
+        }
+        m_chunks[i] = 0 ;
+      }
+
+    void execute( bool arg_destroy )
+      {
+        typedef Kokkos::RangePolicy< typename HostSpace::execution_space > Range ;
+        //typedef Kokkos::RangePolicy< typename Impl::ChunkArraySpace< typename traits::memory_space >::memory_space::execution_space > Range ;
+
+        m_destroy = arg_destroy ;
+
+        Kokkos::Impl::ParallelFor<Destroy,Range>
+          closure( *this , Range(0, m_chunk_max + 2) ); // Add 2 to 'destroy' extra slots storing num_chunks and extent; previously + 1
+
+        closure.execute();
+
+        traits::execution_space::fence();
+        //Impl::ChunkArraySpace< typename traits::memory_space >::memory_space::execution_space::fence(); 
+      }
+
+    void construct_shared_allocation()
+      { execute( false ); }
+
+    void destroy_shared_allocation()
+      { execute( true ); }
+
+    Destroy() = default ;
+    Destroy( Destroy && ) = default ;
+    Destroy( const Destroy & ) = default ;
+    Destroy & operator = ( Destroy && ) = default ;
+    Destroy & operator = ( const Destroy & ) = default ;
+
+    Destroy( typename traits::value_type ** arg_chunk
+           , const unsigned arg_chunk_max 
+           , const unsigned arg_chunk_size )
+     : m_chunks( arg_chunk )
+     , m_chunk_max( arg_chunk_max )
+     , m_destroy( false )
+     , m_chunk_size( arg_chunk_size )
+     {}
+  };
+
+
+  /**\brief  Allocation constructor
+   *
+   *  Memory is allocated in chunks
+   *  A maximum size is required in order to allocate a
+   *  chunk-pointer array.
+   */
+  explicit inline
+  DynamicView( const std::string & arg_label
+             , const unsigned min_chunk_size
+             , const unsigned max_extent ) 
+    : m_track()
+    , m_chunks(0)
+    // The chunk size is guaranteed to be a power of two
+    , m_chunk_shift(
+        Kokkos::Impl::integral_power_of_two_that_contains( min_chunk_size ) ) // div ceil(log2(min_chunk_size))
+    , m_chunk_mask( ( 1 << m_chunk_shift ) - 1 )                              // mod
+    , m_chunk_max( ( max_extent + m_chunk_mask ) >> m_chunk_shift )           // max num pointers-to-chunks in array
+    , m_chunk_size ( 2 << (m_chunk_shift - 1) )
+    {
+      typedef typename Impl::ChunkArraySpace< typename traits::memory_space >::memory_space chunk_array_memory_space;
+      // A functor to deallocate all of the chunks upon final destruction
+      typedef Kokkos::Impl::SharedAllocationRecord< chunk_array_memory_space , Destroy > record_type ;
+
+      // Allocate chunk pointers and allocation counter
+      record_type * const record =
+        record_type::allocate( chunk_array_memory_space()
+                             , arg_label
+                             , ( sizeof(pointer_type) * ( m_chunk_max + 2 ) ) ); 
+      // Allocate + 2 extra slots so that *m_chunk[m_chunk_max] == num_chunks_alloc and *m_chunk[m_chunk_max+1] == extent
+      // This must match in Destroy's execute(...) method
+
+      m_chunks = reinterpret_cast<pointer_type*>( record->data() );
+
+      record->m_destroy = Destroy( m_chunks , m_chunk_max, m_chunk_size );
+
+      // Initialize to zero
+      record->m_destroy.construct_shared_allocation();
+
+      m_track.assign_allocated_record_to_uninitialized( record );
+    }
+
+};
+
+} // namespace Experimental
+} // namespace Kokkos
+
+namespace Kokkos {
+
+template< class T , class ... P >
+inline
+typename Kokkos::Experimental::DynamicView<T,P...>::HostMirror
+create_mirror_view( const Kokkos::Experimental::DynamicView<T,P...> & src )
+{
+  return src ;
+}
+
+template< class T , class ... DP , class ... SP >
+inline
+void deep_copy( const View<T,DP...> & dst
+              , const Kokkos::Experimental::DynamicView<T,SP...> & src
+              )
+{
+  typedef View<T,DP...>        dst_type ;
+  typedef Kokkos::Experimental::DynamicView<T,SP...> src_type ;
+
+  typedef typename ViewTraits<T,DP...>::execution_space  dst_execution_space ;
+  typedef typename ViewTraits<T,SP...>::memory_space     src_memory_space ;
+
+  enum { DstExecCanAccessSrc =
+   Kokkos::Impl::SpaceAccessibility< dst_execution_space , src_memory_space >::accessible };
+
+  if ( DstExecCanAccessSrc ) {
+    // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+    Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
+  }
+  else {
+    Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
+  }
+}
+
+template< class T , class ... DP , class ... SP >
+inline
+void deep_copy( const Kokkos::Experimental::DynamicView<T,DP...> & dst
+              , const View<T,SP...> & src
+              )
+{
+  typedef Kokkos::Experimental::DynamicView<T,DP...> dst_type ;
+  typedef View<T,SP...>        src_type ;
+
+  typedef typename ViewTraits<T,DP...>::execution_space  dst_execution_space ;
+  typedef typename ViewTraits<T,SP...>::memory_space     src_memory_space ;
+
+  enum { DstExecCanAccessSrc =
+   Kokkos::Impl::SpaceAccessibility< dst_execution_space , src_memory_space >::accessible };
+
+  if ( DstExecCanAccessSrc ) {
+    // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+    Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
+  }
+  else {
+    Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
+  }
+}
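+// Illustrative usage sketch (hypothetical names, 'n' assumed defined): exchange data with a View.
+//   Kokkos::Experimental::DynamicView<double*> dv("dv", 1024, 1 << 20);
+//   dv.resize_serial(n);
+//   Kokkos::View<double*> v("v", n);
+//   Kokkos::deep_copy(v, dv);   // View <- DynamicView
+//   Kokkos::deep_copy(dv, v);   // DynamicView <- View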
+
+namespace Impl {
+template<class Arg0, class ... DP , class ... SP>
+struct CommonSubview<Kokkos::Experimental::DynamicView<DP...>,Kokkos::Experimental::DynamicView<SP...>,1,Arg0> {
+  typedef Kokkos::Experimental::DynamicView<DP...> DstType;
+  typedef Kokkos::Experimental::DynamicView<SP...> SrcType;
+  typedef DstType dst_subview_type;
+  typedef SrcType src_subview_type;
+  dst_subview_type dst_sub;
+  src_subview_type src_sub;
+  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0):
+    dst_sub(dst),src_sub(src) {}
+};
+
+template<class ...DP, class SrcType, class Arg0>
+struct CommonSubview<Kokkos::Experimental::DynamicView<DP...>,SrcType,1,Arg0> {
+  typedef Kokkos::Experimental::DynamicView<DP...> DstType;
+  typedef DstType dst_subview_type;
+  typedef typename Kokkos::Subview<SrcType,Arg0> src_subview_type;
+  dst_subview_type dst_sub;
+  src_subview_type src_sub;
+  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0):
+    dst_sub(dst),src_sub(src,arg0) {}
+};
+
+template<class DstType, class ...SP, class Arg0>
+struct CommonSubview<DstType,Kokkos::Experimental::DynamicView<SP...>,1,Arg0> {
+  typedef Kokkos::Experimental::DynamicView<SP...> SrcType;
+  typedef typename Kokkos::Subview<DstType,Arg0> dst_subview_type;
+  typedef SrcType src_subview_type;
+  dst_subview_type dst_sub;
+  src_subview_type src_sub;
+  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0):
+    dst_sub(dst,arg0),src_sub(src) {}
+};
+
+template<class ...DP,class ViewTypeB, class Layout, class ExecSpace,typename iType>
+struct ViewCopy<Kokkos::Experimental::DynamicView<DP...>,ViewTypeB,Layout,ExecSpace,1,iType> {
+  Kokkos::Experimental::DynamicView<DP...> a;
+  ViewTypeB b;
+
+  typedef Kokkos::RangePolicy<ExecSpace,Kokkos::IndexType<iType>> policy_type;
+
+  ViewCopy(const Kokkos::Experimental::DynamicView<DP...>& a_, const ViewTypeB& b_):a(a_),b(b_) {
+    Kokkos::parallel_for("Kokkos::ViewCopy-2D",
+       policy_type(0,b.extent(0)),*this);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0) const {
+      a(i0) = b(i0);
+  };
+};
+
+template<class ...DP,class ...SP, class Layout, class ExecSpace,typename iType>
+struct ViewCopy<Kokkos::Experimental::DynamicView<DP...>,
+                Kokkos::Experimental::DynamicView<SP...>,Layout,ExecSpace,1,iType> {
+  Kokkos::Experimental::DynamicView<DP...> a;
+  Kokkos::Experimental::DynamicView<SP...> b;
+
+  typedef Kokkos::RangePolicy<ExecSpace,Kokkos::IndexType<iType>> policy_type;
+
+  ViewCopy(const Kokkos::Experimental::DynamicView<DP...>& a_,
+           const Kokkos::Experimental::DynamicView<SP...>& b_):a(a_),b(b_) {
+    const iType n = std::min(a.extent(0),b.extent(0));
+    Kokkos::parallel_for("Kokkos::ViewCopy-2D",
+       policy_type(0,n),*this);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0) const {
+      a(i0) = b(i0);
+  };
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_DYNAMIC_VIEW_HPP */
+
diff --git a/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp b/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e05ea1a9d5be4a550ed177348fe4d4c62289ace4
--- /dev/null
+++ b/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp
@@ -0,0 +1,197 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_ERROR_REPORTER_HPP
+#define KOKKOS_EXPERIMENTAL_ERROR_REPORTER_HPP
+
+#include <vector>
+#include <Kokkos_Core.hpp>
+#include <Kokkos_View.hpp>
+#include <Kokkos_DualView.hpp>
+
+namespace Kokkos {
+namespace Experimental {
+
+template <typename ReportType, typename DeviceType>
+class ErrorReporter
+{
+public:
+
+  typedef ReportType                                      report_type;
+  typedef DeviceType                                      device_type;
+  typedef typename device_type::execution_space           execution_space;
+
+  ErrorReporter(int max_results)
+    : m_numReportsAttempted(""),
+      m_reports("", max_results),
+      m_reporters("", max_results)
+  {
+    clear();
+  }
+
+  int getCapacity() const { return m_reports.h_view.extent(0); }
+
+  int getNumReports();
+
+  int getNumReportAttempts();
+
+  void getReports(std::vector<int> &reporters_out, std::vector<report_type> &reports_out);
+  void getReports( typename Kokkos::View<int*, typename DeviceType::execution_space >::HostMirror &reporters_out,
+                   typename Kokkos::View<report_type*, typename DeviceType::execution_space >::HostMirror &reports_out);
+
+  void clear();
+
+  void resize(const size_t new_size);
+
+  bool full() {return (getNumReportAttempts() >= getCapacity()); }
+
+  KOKKOS_INLINE_FUNCTION
+  bool add_report(int reporter_id, report_type report) const
+  {
+    int idx = Kokkos::atomic_fetch_add(&m_numReportsAttempted(), 1);
+
+    if (idx >= 0 && (idx < static_cast<int>(m_reports.d_view.extent(0)))) {
+      m_reporters.d_view(idx) = reporter_id;
+      m_reports.d_view(idx)   = report;
+      return true;
+    }
+    else {
+      return false;
+    }
+  }
+
+private:
+
+  typedef Kokkos::View<report_type *, execution_space>        reports_view_t;
+  typedef Kokkos::DualView<report_type *, execution_space>    reports_dualview_t;
+
+  typedef typename reports_dualview_t::host_mirror_space  host_mirror_space;
+  Kokkos::View<int, execution_space>   m_numReportsAttempted;
+  reports_dualview_t                   m_reports;
+  Kokkos::DualView<int *, execution_space> m_reporters;
+
+};
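+// Illustrative usage sketch (hypothetical names): collect error reports from a kernel.
+//   Kokkos::Experimental::ErrorReporter<int, Kokkos::DefaultExecutionSpace> reporter(100);
+//   Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {
+//     if (some_failure_condition(i)) reporter.add_report(i, error_code_for(i));
+//   });
+//   std::vector<int> who, what;
+//   reporter.getReports(who, what);   // reporter ids and their reports, in attempt order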
+
+
+template <typename ReportType, typename DeviceType>
+inline int ErrorReporter<ReportType, DeviceType>::getNumReports()
+{
+  int num_reports = 0;
+  Kokkos::deep_copy(num_reports,m_numReportsAttempted);
+  if (num_reports > static_cast<int>(m_reports.h_view.extent(0))) {
+    num_reports = m_reports.h_view.extent(0);
+  }
+  return num_reports;
+}
+
+template <typename ReportType, typename DeviceType>
+inline int ErrorReporter<ReportType, DeviceType>::getNumReportAttempts()
+{
+  int num_reports = 0;
+  Kokkos::deep_copy(num_reports,m_numReportsAttempted);
+  return num_reports;
+}
+
+template <typename ReportType, typename DeviceType>
+void ErrorReporter<ReportType, DeviceType>::getReports(std::vector<int> &reporters_out, std::vector<report_type> &reports_out)
+{
+  int num_reports = getNumReports();
+  reporters_out.clear();
+  reporters_out.reserve(num_reports);
+  reports_out.clear();
+  reports_out.reserve(num_reports);
+
+  if (num_reports > 0) {
+    m_reports.template sync<host_mirror_space>();
+    m_reporters.template sync<host_mirror_space>();
+
+    for (int i = 0; i < num_reports; ++i) {
+      reporters_out.push_back(m_reporters.h_view(i));
+      reports_out.push_back(m_reports.h_view(i));
+    }
+  }
+}
+
+template <typename ReportType, typename DeviceType>
+void ErrorReporter<ReportType, DeviceType>::getReports(
+    typename Kokkos::View<int*, typename DeviceType::execution_space >::HostMirror &reporters_out,
+    typename Kokkos::View<report_type*, typename DeviceType::execution_space >::HostMirror &reports_out)
+{
+  int num_reports = getNumReports();
+  reporters_out = typename Kokkos::View<int*, typename DeviceType::execution_space >::HostMirror("ErrorReport::reporters_out",num_reports);
+  reports_out = typename Kokkos::View<report_type*, typename DeviceType::execution_space >::HostMirror("ErrorReport::reports_out",num_reports);
+
+  if (num_reports > 0) {
+    m_reports.template sync<host_mirror_space>();
+    m_reporters.template sync<host_mirror_space>();
+
+    for (int i = 0; i < num_reports; ++i) {
+      reporters_out(i) = m_reporters.h_view(i);
+      reports_out(i) = m_reports.h_view(i);
+    }
+  }
+}
+
+template <typename ReportType, typename DeviceType>
+void ErrorReporter<ReportType, DeviceType>::clear()
+{
+  int num_reports=0;
+  Kokkos::deep_copy(m_numReportsAttempted, num_reports);
+  m_reports.template modify<execution_space>();
+  m_reporters.template modify<execution_space>();
+}
+
+template <typename ReportType, typename DeviceType>
+void ErrorReporter<ReportType, DeviceType>::resize(const size_t new_size)
+{
+  m_reports.resize(new_size);
+  m_reporters.resize(new_size);
+  Kokkos::fence();
+}
+
+
+} // namespace Experimental
+} // namespace Kokkos
+
+#endif
+
diff --git a/packages/kokkos/containers/src/Kokkos_Functional.hpp b/packages/kokkos/containers/src/Kokkos_Functional.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4d256cce270d8edf728dfe84611230e5160c5a3c
--- /dev/null
+++ b/packages/kokkos/containers/src/Kokkos_Functional.hpp
@@ -0,0 +1,172 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_FUNCTIONAL_HPP
+#define KOKKOS_FUNCTIONAL_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_Functional_impl.hpp>
+
+namespace Kokkos {
+
+// These should work for most types
+
+template <typename T>
+struct pod_hash
+{
+  typedef T argument_type;
+  typedef T first_argument_type;
+  typedef uint32_t second_argument_type;
+  typedef uint32_t result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint32_t operator()(T const & t) const
+  { return Impl::MurmurHash3_x86_32( &t, sizeof(T), 0); }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint32_t operator()(T const & t, uint32_t seed) const
+  { return Impl::MurmurHash3_x86_32( &t, sizeof(T), seed); }
+};
+
+template <typename T>
+struct pod_equal_to
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return Impl::bitwise_equal(&a,&b); }
+};
+
+template <typename T>
+struct pod_not_equal_to
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return !Impl::bitwise_equal(&a,&b); }
+};
+
+template <typename T>
+struct equal_to
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a == b; }
+};
+
+template <typename T>
+struct not_equal_to
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a != b; }
+};
+
+
+template <typename T>
+struct greater
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a > b; }
+};
+
+
+template <typename T>
+struct less
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a < b; }
+};
+
+template <typename T>
+struct greater_equal
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a >= b; }
+};
+
+
+template <typename T>
+struct less_equal
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a <= b; }
+};
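+
+// A minimal usage sketch (illustrative values; callable on host or inside
+// device code, since the call operators are KOKKOS_FORCEINLINE_FUNCTION):
+//
+//   Kokkos::pod_hash<int> hasher;
+//   uint32_t h0 = hasher(42);        // unseeded hash
+//   uint32_t h1 = hasher(42, h0);    // hash again with a seed
+//
+//   Kokkos::pod_equal_to<int> eq;
+//   bool same = eq(42, 42);          // bitwise comparison
+//
+//   Kokkos::less<double> cmp;
+//   bool ordered = cmp(1.0, 2.0);    // plain operator< comparison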
+
+} // namespace Kokkos
+
+
+#endif //KOKKOS_FUNCTIONAL_HPP
+
diff --git a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6c3365408ad184ac01943625b0259da18105b2ed
--- /dev/null
+++ b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
@@ -0,0 +1,994 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+/// \file Kokkos_ScatterView.hpp
+/// \brief Declaration and definition of Kokkos::ScatterView.
+///
+/// This header file declares and defines Kokkos::ScatterView and its
+/// related nonmember functions.
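+///
+/// A minimal usage sketch (the View names, sizes, and the bin/weight
+/// functions are illustrative only):
+/// \code
+/// Kokkos::View<double*> results("results", nbins);
+/// auto scatter = Kokkos::Experimental::create_scatter_view(results);
+/// Kokkos::parallel_for(n, KOKKOS_LAMBDA(int i) {
+///   auto access = scatter.access();
+///   access(bin(i)) += weight(i);
+/// });
+/// Kokkos::Experimental::contribute(results, scatter);
+/// \endcode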
+
+#ifndef KOKKOS_SCATTER_VIEW_HPP
+#define KOKKOS_SCATTER_VIEW_HPP
+
+#include <Kokkos_Core.hpp>
+#include <utility>
+
+namespace Kokkos {
+namespace Experimental {
+
+//TODO: replace this enum with the Kokkos::Sum, etc. reducers used by parallel_reduce
+enum : int {
+  ScatterSum,
+};
+
+enum : int {
+  ScatterNonDuplicated = 0,
+  ScatterDuplicated    = 1
+};
+
+enum : int {
+  ScatterNonAtomic = 0,
+  ScatterAtomic    = 1
+};
+
+}} // Kokkos::Experimental
+
+namespace Kokkos {
+namespace Impl {
+namespace Experimental {
+
+template <typename ExecSpace>
+struct DefaultDuplication;
+
+template <typename ExecSpace, int duplication>
+struct DefaultContribution;
+
+#ifdef KOKKOS_ENABLE_SERIAL
+template <>
+struct DefaultDuplication<Kokkos::Serial> {
+  enum : int { value = Kokkos::Experimental::ScatterNonDuplicated };
+};
+template <>
+struct DefaultContribution<Kokkos::Serial, Kokkos::Experimental::ScatterNonDuplicated> {
+  enum : int { value = Kokkos::Experimental::ScatterNonAtomic };
+};
+template <>
+struct DefaultContribution<Kokkos::Serial, Kokkos::Experimental::ScatterDuplicated> {
+  enum : int { value = Kokkos::Experimental::ScatterNonAtomic };
+};
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+template <>
+struct DefaultDuplication<Kokkos::OpenMP> {
+  enum : int { value = Kokkos::Experimental::ScatterDuplicated };
+};
+template <>
+struct DefaultContribution<Kokkos::OpenMP, Kokkos::Experimental::ScatterNonDuplicated> {
+  enum : int { value = Kokkos::Experimental::ScatterAtomic };
+};
+template <>
+struct DefaultContribution<Kokkos::OpenMP, Kokkos::Experimental::ScatterDuplicated> {
+  enum : int { value = Kokkos::Experimental::ScatterNonAtomic };
+};
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+template <>
+struct DefaultDuplication<Kokkos::Threads> {
+  enum : int { value = Kokkos::Experimental::ScatterDuplicated };
+};
+template <>
+struct DefaultContribution<Kokkos::Threads, Kokkos::Experimental::ScatterNonDuplicated> {
+  enum : int { value = Kokkos::Experimental::ScatterAtomic };
+};
+template <>
+struct DefaultContribution<Kokkos::Threads, Kokkos::Experimental::ScatterDuplicated> {
+  enum : int { value = Kokkos::Experimental::ScatterNonAtomic };
+};
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+template <>
+struct DefaultDuplication<Kokkos::Cuda> {
+  enum : int { value = Kokkos::Experimental::ScatterNonDuplicated };
+};
+template <>
+struct DefaultContribution<Kokkos::Cuda, Kokkos::Experimental::ScatterNonDuplicated> {
+  enum : int { value = Kokkos::Experimental::ScatterAtomic };
+};
+template <>
+struct DefaultContribution<Kokkos::Cuda, Kokkos::Experimental::ScatterDuplicated> {
+  enum : int { value = Kokkos::Experimental::ScatterAtomic };
+};
+#endif
+
+/* ScatterValue is the object returned by the access operator() of ScatterAccess.
+   Much like the reference returned by an Atomic View, it wraps Kokkos::atomic_add
+   with convenient operator+=, etc. */
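+/* For example, for a double x, the atomic ScatterSum specialization behaves as
+     ScatterValue<double, ScatterSum, ScatterAtomic> v(x);
+     v += 1.0;   // performs Kokkos::atomic_add(&x, 1.0)
+   (enum values abbreviated; they live in Kokkos::Experimental). */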
+template <typename ValueType, int Op, int contribution>
+struct ScatterValue;
+
+template <typename ValueType>
+struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonAtomic> {
+  public:
+    KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) : value( value_in ) {}
+    KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) : value( other.value ) {}
+    KOKKOS_FORCEINLINE_FUNCTION void operator+=(ValueType const& rhs) {
+      value += rhs;
+    }
+    KOKKOS_FORCEINLINE_FUNCTION void operator-=(ValueType const& rhs) {
+      value -= rhs;
+    }
+  private:
+    ValueType& value;
+};
+
+template <typename ValueType>
+struct ScatterValue<ValueType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterAtomic> {
+  public:
+    KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) : value( value_in ) {}
+    KOKKOS_FORCEINLINE_FUNCTION void operator+=(ValueType const& rhs) {
+      Kokkos::atomic_add(&value, rhs);
+    }
+    KOKKOS_FORCEINLINE_FUNCTION void operator-=(ValueType const& rhs) {
+      Kokkos::atomic_add(&value, -rhs);
+    }
+  private:
+    ValueType& value;
+};
+
+/* DuplicatedDataType, given a View DataType, creates a new DataType
+   that has an extra runtime dimension which becomes the largest-stride dimension.
+   In the case of LayoutLeft, because of how the View DataType encodes dimensions,
+   it must convert any existing compile-time dimensions into runtime dimensions. */
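+/* For example, for a double-valued View DataType:
+     LayoutRight:  double*  -> double**      double[4] -> double*[4]
+     LayoutLeft:   double*  -> double**      double[4] -> double**
+   (the added runtime dimension holds one copy of the data per duplicate). */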
+template <typename T, typename Layout>
+struct DuplicatedDataType;
+
+template <typename T>
+struct DuplicatedDataType<T, Kokkos::LayoutRight> {
+  typedef T* value_type; // For LayoutRight, add a star all the way on the left
+};
+
+template <typename T, size_t N>
+struct DuplicatedDataType<T[N], Kokkos::LayoutRight> {
+  typedef typename DuplicatedDataType<T, Kokkos::LayoutRight>::value_type value_type[N];
+};
+
+template <typename T>
+struct DuplicatedDataType<T[], Kokkos::LayoutRight> {
+  typedef typename DuplicatedDataType<T, Kokkos::LayoutRight>::value_type value_type[];
+};
+
+template <typename T>
+struct DuplicatedDataType<T*, Kokkos::LayoutRight> {
+  typedef typename DuplicatedDataType<T, Kokkos::LayoutRight>::value_type* value_type;
+};
+
+template <typename T>
+struct DuplicatedDataType<T, Kokkos::LayoutLeft> {
+  typedef T* value_type;
+};
+
+template <typename T, size_t N>
+struct DuplicatedDataType<T[N], Kokkos::LayoutLeft> {
+  typedef typename DuplicatedDataType<T, Kokkos::LayoutLeft>::value_type* value_type;
+};
+
+template <typename T>
+struct DuplicatedDataType<T[], Kokkos::LayoutLeft> {
+  typedef typename DuplicatedDataType<T, Kokkos::LayoutLeft>::value_type* value_type;
+};
+
+template <typename T>
+struct DuplicatedDataType<T*, Kokkos::LayoutLeft> {
+  typedef typename DuplicatedDataType<T, Kokkos::LayoutLeft>::value_type* value_type;
+};
+
+/* Slice is responsible for placing the correct number of Kokkos::ALL
+   arguments on the correct side of the index in a call to subview() to get a
+   subview in which the given index selects the largest-stride dimension. */
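+/* For example, for a rank-3 View type V and instance v:
+     Slice<LayoutRight, 3, V>::get(v, i) == subview(v, i, Kokkos::ALL, Kokkos::ALL)
+     Slice<LayoutLeft,  3, V>::get(v, i) == subview(v, Kokkos::ALL, Kokkos::ALL, i) */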
+template <typename Layout, int rank, typename V, typename ... Args>
+struct Slice {
+  typedef Slice<Layout, rank - 1, V, Kokkos::Impl::ALL_t, Args...> next;
+  typedef typename next::value_type value_type;
+
+  static
+  value_type get(V const& src, const size_t i, Args ... args) {
+    return next::get(src, i, Kokkos::ALL, args...);
+  }
+};
+
+template <typename V, typename ... Args>
+struct Slice<Kokkos::LayoutRight, 1, V, Args...> {
+  typedef typename Kokkos::Impl::ViewMapping
+                          < void
+                          , V
+                          , const size_t
+                          , Args ...
+                          >::type value_type;
+  static
+  value_type get(V const& src, const size_t i, Args ... args) {
+    return Kokkos::subview(src, i, args...);
+  }
+};
+
+template <typename V, typename ... Args>
+struct Slice<Kokkos::LayoutLeft, 1, V, Args...> {
+  typedef typename Kokkos::Impl::ViewMapping
+                          < void
+                          , V
+                          , Args ...
+                          , const size_t
+                          >::type value_type;
+  static
+  value_type get(V const& src, const size_t i, Args ... args) {
+    return Kokkos::subview(src, args..., i);
+  }
+};
+
+template <typename ExecSpace, typename ValueType, int Op>
+struct ReduceDuplicates;
+
+template <typename ExecSpace, typename ValueType, int Op>
+struct ReduceDuplicatesBase {
+  typedef ReduceDuplicates<ExecSpace, ValueType, Op> Derived;
+  ValueType const* src;
+  ValueType* dst;
+  size_t stride;
+  size_t start;
+  size_t n;
+  ReduceDuplicatesBase(ValueType const* src_in, ValueType* dest_in, size_t stride_in, size_t start_in, size_t n_in, std::string const& name)
+    : src(src_in)
+    , dst(dest_in)
+    , stride(stride_in)
+    , start(start_in)
+    , n(n_in)
+  {
+#if defined(KOKKOS_ENABLE_PROFILING)
+    uint64_t kpID = 0;
+    if(Kokkos::Profiling::profileLibraryLoaded()) {
+      Kokkos::Profiling::beginParallelFor(std::string("reduce_") + name, 0, &kpID);
+    }
+#endif
+    typedef RangePolicy<ExecSpace, size_t> policy_type;
+    typedef Kokkos::Impl::ParallelFor<Derived, policy_type> closure_type;
+    const closure_type closure(*(static_cast<Derived*>(this)), policy_type(0, stride));
+    closure.execute();
+#if defined(KOKKOS_ENABLE_PROFILING)
+    if(Kokkos::Profiling::profileLibraryLoaded()) {
+      Kokkos::Profiling::endParallelFor(kpID);
+    }
+#endif
+  }
+};
+
+template <typename ExecSpace, typename ValueType>
+struct ReduceDuplicates<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> :
+  public ReduceDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum>
+{
+  typedef ReduceDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> Base;
+  ReduceDuplicates(ValueType const* src_in, ValueType* dst_in, size_t stride_in, size_t start_in, size_t n_in, std::string const& name):
+    Base(src_in, dst_in, stride_in, start_in, n_in, name)
+  {}
+  KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const {
+    for (size_t j = Base::start; j < Base::n; ++j) {
+      Base::dst[i] += Base::src[i + Base::stride * j];
+    }
+  }
+};
+
+template <typename ExecSpace, typename ValueType, int Op>
+struct ResetDuplicates;
+
+template <typename ExecSpace, typename ValueType, int Op>
+struct ResetDuplicatesBase {
+  typedef ResetDuplicates<ExecSpace, ValueType, Op> Derived;
+  ValueType* data;
+  ResetDuplicatesBase(ValueType* data_in, size_t size_in, std::string const& name)
+    : data(data_in)
+  {
+#if defined(KOKKOS_ENABLE_PROFILING)
+    uint64_t kpID = 0;
+    if(Kokkos::Profiling::profileLibraryLoaded()) {
+      Kokkos::Profiling::beginParallelFor(std::string("reset_") + name, 0, &kpID);
+    }
+#endif
+    typedef RangePolicy<ExecSpace, size_t> policy_type;
+    typedef Kokkos::Impl::ParallelFor<Derived, policy_type> closure_type;
+    const closure_type closure(*(static_cast<Derived*>(this)), policy_type(0, size_in));
+    closure.execute();
+#if defined(KOKKOS_ENABLE_PROFILING)
+    if(Kokkos::Profiling::profileLibraryLoaded()) {
+      Kokkos::Profiling::endParallelFor(kpID);
+    }
+#endif
+  }
+};
+
+template <typename ExecSpace, typename ValueType>
+struct ResetDuplicates<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> :
+  public ResetDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum>
+{
+  typedef ResetDuplicatesBase<ExecSpace, ValueType, Kokkos::Experimental::ScatterSum> Base;
+  ResetDuplicates(ValueType* data_in, size_t size_in, std::string const& name):
+    Base(data_in, size_in, name)
+  {}
+  KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const {
+    Base::data[i] = Kokkos::reduction_identity<ValueType>::sum();
+  }
+};
+
+}}} // Kokkos::Impl::Experimental
+
+namespace Kokkos {
+namespace Experimental {
+
+template <typename DataType
+         ,typename Layout = Kokkos::DefaultExecutionSpace::array_layout
+         ,typename ExecSpace = Kokkos::DefaultExecutionSpace
+         ,int Op = ScatterSum
+         ,int duplication = Kokkos::Impl::Experimental::DefaultDuplication<ExecSpace>::value
+         ,int contribution = Kokkos::Impl::Experimental::DefaultContribution<ExecSpace, duplication>::value
+         >
+class ScatterView;
+
+template <typename DataType
+         ,int Op
+         ,typename ExecSpace
+         ,typename Layout
+         ,int duplication
+         ,int contribution
+         ,int override_contribution
+         >
+class ScatterAccess;
+
+// non-duplicated implementation
+template <typename DataType
+         ,int Op
+         ,typename ExecSpace
+         ,typename Layout
+         ,int contribution
+         >
+class ScatterView<DataType
+                   ,Layout
+                   ,ExecSpace
+                   ,Op
+                   ,ScatterNonDuplicated
+                   ,contribution>
+{
+public:
+  typedef Kokkos::View<DataType, Layout, ExecSpace> original_view_type;
+  typedef typename original_view_type::value_type original_value_type;
+  typedef typename original_view_type::reference_type original_reference_type;
+  friend class ScatterAccess<DataType, Op, ExecSpace, Layout, ScatterNonDuplicated, contribution, ScatterNonAtomic>;
+  friend class ScatterAccess<DataType, Op, ExecSpace, Layout, ScatterNonDuplicated, contribution, ScatterAtomic>;
+
+  ScatterView()
+  {
+  }
+
+  template <typename RT, typename ... RP>
+  ScatterView(View<RT, RP...> const& original_view)
+  : internal_view(original_view)
+  {
+  }
+
+  template <typename ... Dims>
+  ScatterView(std::string const& name, Dims ... dims)
+  : internal_view(name, dims ...)
+  {
+  }
+
+  template <int override_contrib = contribution>
+  KOKKOS_FORCEINLINE_FUNCTION
+  ScatterAccess<DataType, Op, ExecSpace, Layout, ScatterNonDuplicated, contribution, override_contrib>
+  access() const {
+    return ScatterAccess<DataType, Op, ExecSpace, Layout, ScatterNonDuplicated, contribution, override_contrib>{*this};
+  }
+
+  original_view_type subview() const {
+    return internal_view;
+  }
+
+  template <typename DT, typename ... RP>
+  void contribute_into(View<DT, RP...> const& dest) const
+  {
+    typedef View<DT, RP...> dest_type;
+    static_assert(std::is_same<
+        typename dest_type::array_layout,
+        Layout>::value,
+        "ScatterView contribute destination has different layout");
+    static_assert(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
+        typename ExecSpace::memory_space,
+        typename dest_type::memory_space>::value,
+        "ScatterView contribute destination memory space not accessible");
+    if (dest.data() == internal_view.data()) return;
+    Kokkos::Impl::Experimental::ReduceDuplicates<ExecSpace, original_value_type, Op>(
+        internal_view.data(),
+        dest.data(),
+        0,
+        0,
+        1,
+        internal_view.label());
+  }
+
+  void reset() {
+    Kokkos::Impl::Experimental::ResetDuplicates<ExecSpace, original_value_type, Op>(
+        internal_view.data(),
+        internal_view.size(),
+        internal_view.label());
+  }
+  template <typename DT, typename ... RP>
+  void reset_except(View<DT, RP...> const& view) {
+    if (view.data() != internal_view.data()) reset();
+  }
+
+  void resize(const size_t n0 = 0,
+           const size_t n1 = 0,
+           const size_t n2 = 0,
+           const size_t n3 = 0,
+           const size_t n4 = 0,
+           const size_t n5 = 0,
+           const size_t n6 = 0,
+           const size_t n7 = 0) {
+    ::Kokkos::resize(internal_view,n0,n1,n2,n3,n4,n5,n6,n7);
+  }
+
+  void realloc(const size_t n0 = 0,
+           const size_t n1 = 0,
+           const size_t n2 = 0,
+           const size_t n3 = 0,
+           const size_t n4 = 0,
+           const size_t n5 = 0,
+           const size_t n6 = 0,
+           const size_t n7 = 0) {
+    ::Kokkos::realloc(internal_view,n0,n1,n2,n3,n4,n5,n6,n7);
+  }
+
+protected:
+  template <typename ... Args>
+  KOKKOS_FORCEINLINE_FUNCTION
+  original_reference_type at(Args ... args) const {
+    return internal_view(args...);
+  }
+private:
+  typedef original_view_type internal_view_type;
+  internal_view_type internal_view;
+};
+
+template <typename DataType
+         ,int Op
+         ,typename ExecSpace
+         ,typename Layout
+         ,int contribution
+         ,int override_contribution
+         >
+class ScatterAccess<DataType
+                   ,Op
+                   ,ExecSpace
+                   ,Layout
+                   ,ScatterNonDuplicated
+                   ,contribution
+                   ,override_contribution>
+{
+public:
+  typedef ScatterView<DataType, Layout, ExecSpace, Op, ScatterNonDuplicated, contribution> view_type;
+  typedef typename view_type::original_value_type original_value_type;
+  typedef Kokkos::Impl::Experimental::ScatterValue<
+      original_value_type, Op, override_contribution> value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  ScatterAccess(view_type const& view_in)
+    : view(view_in)
+  {
+  }
+
+  template <typename ... Args>
+  KOKKOS_FORCEINLINE_FUNCTION
+  value_type operator()(Args ... args) const {
+    return view.at(args...);
+  }
+
+  template <typename Arg>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<view_type::original_view_type::rank == 1 &&
+  std::is_integral<Arg>::value, value_type>::type
+  operator[](Arg arg) const {
+    return view.at(arg);
+  }
+
+private:
+  view_type const& view;
+};
+
+// duplicated implementation
+// LayoutLeft and LayoutRight are different enough that we'll just specialize each
+
+template <typename DataType
+         ,int Op
+         ,typename ExecSpace
+         ,int contribution
+         >
+class ScatterView<DataType
+                   ,Kokkos::LayoutRight
+                   ,ExecSpace
+                   ,Op
+                   ,ScatterDuplicated
+                   ,contribution>
+{
+public:
+  typedef Kokkos::View<DataType, Kokkos::LayoutRight, ExecSpace> original_view_type;
+  typedef typename original_view_type::value_type original_value_type;
+  typedef typename original_view_type::reference_type original_reference_type;
+  friend class ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutRight, ScatterDuplicated, contribution, ScatterNonAtomic>;
+  friend class ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutRight, ScatterDuplicated, contribution, ScatterAtomic>;
+  typedef typename Kokkos::Impl::Experimental::DuplicatedDataType<DataType, Kokkos::LayoutRight> data_type_info;
+  typedef typename data_type_info::value_type internal_data_type;
+  typedef Kokkos::View<internal_data_type, Kokkos::LayoutRight, ExecSpace> internal_view_type;
+
+  ScatterView()
+  {
+  }
+
+  template <typename RT, typename ... RP >
+  ScatterView(View<RT, RP...> const& original_view)
+  : unique_token()
+  , internal_view(Kokkos::ViewAllocateWithoutInitializing(
+                    std::string("duplicated_") + original_view.label()),
+                  unique_token.size(),
+                  original_view.extent(0),
+                  original_view.extent(1),
+                  original_view.extent(2),
+                  original_view.extent(3),
+                  original_view.extent(4),
+                  original_view.extent(5),
+                  original_view.extent(6))
+  {
+    reset();
+  }
+
+  template <typename ... Dims>
+  ScatterView(std::string const& name, Dims ... dims)
+  : internal_view(Kokkos::ViewAllocateWithoutInitializing(name), unique_token.size(), dims ...)
+  {
+    reset();
+  }
+
+  template <int override_contribution = contribution>
+  inline
+  ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutRight, ScatterDuplicated, contribution, override_contribution>
+  access() const {
+    return ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutRight, ScatterDuplicated, contribution, override_contribution>{*this};
+  }
+
+  typename Kokkos::Impl::Experimental::Slice<
+    Kokkos::LayoutRight, internal_view_type::rank, internal_view_type>::value_type
+  subview() const
+  {
+    return Kokkos::Impl::Experimental::Slice<
+      Kokkos::LayoutRight, internal_view_type::rank, internal_view_type>::get(internal_view, 0);
+  }
+
+  template <typename DT, typename ... RP>
+  void contribute_into(View<DT, RP...> const& dest) const
+  {
+    typedef View<DT, RP...> dest_type;
+    static_assert(std::is_same<
+        typename dest_type::array_layout,
+        Kokkos::LayoutRight>::value,
+        "ScatterView contribute destination has different layout");
+    static_assert(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
+        typename ExecSpace::memory_space,
+        typename dest_type::memory_space>::value,
+        "ScatterView contribute destination memory space not accessible");
+    bool is_equal = (dest.data() == internal_view.data());
+    size_t start = is_equal ? 1 : 0;
+    Kokkos::Impl::Experimental::ReduceDuplicates<ExecSpace, original_value_type, Op>(
+        internal_view.data(),
+        dest.data(),
+        internal_view.stride(0),
+        start,
+        internal_view.extent(0),
+        internal_view.label());
+  }
+
+  void reset() {
+    Kokkos::Impl::Experimental::ResetDuplicates<ExecSpace, original_value_type, Op>(
+        internal_view.data(),
+        internal_view.size(),
+        internal_view.label());
+  }
+  template <typename DT, typename ... RP>
+  void reset_except(View<DT, RP...> const& view) {
+    if (view.data() != internal_view.data()) {
+      reset();
+      return;
+    }
+    Kokkos::Impl::Experimental::ResetDuplicates<ExecSpace, original_value_type, Op>(
+        internal_view.data() + view.size(),
+        internal_view.size() - view.size(),
+        internal_view.label());
+  }
+
+  void resize(const size_t n0 = 0,
+           const size_t n1 = 0,
+           const size_t n2 = 0,
+           const size_t n3 = 0,
+           const size_t n4 = 0,
+           const size_t n5 = 0,
+           const size_t n6 = 0) {
+    ::Kokkos::resize(internal_view,unique_token.size(),n0,n1,n2,n3,n4,n5,n6);
+  }
+
+  void realloc(const size_t n0 = 0,
+           const size_t n1 = 0,
+           const size_t n2 = 0,
+           const size_t n3 = 0,
+           const size_t n4 = 0,
+           const size_t n5 = 0,
+           const size_t n6 = 0) {
+    ::Kokkos::realloc(internal_view,unique_token.size(),n0,n1,n2,n3,n4,n5,n6);
+  }
+
+protected:
+  template <typename ... Args>
+  KOKKOS_FORCEINLINE_FUNCTION
+  original_reference_type at(int rank, Args ... args) const {
+    return internal_view(rank, args...);
+  }
+
+protected:
+  typedef Kokkos::Experimental::UniqueToken<
+      ExecSpace, Kokkos::Experimental::UniqueTokenScope::Global> unique_token_type;
+
+  unique_token_type unique_token;
+  internal_view_type internal_view;
+};
+
+template <typename DataType
+         ,int Op
+         ,typename ExecSpace
+         ,int contribution
+         >
+class ScatterView<DataType
+                   ,Kokkos::LayoutLeft
+                   ,ExecSpace
+                   ,Op
+                   ,ScatterDuplicated
+                   ,contribution>
+{
+public:
+  typedef Kokkos::View<DataType, Kokkos::LayoutLeft, ExecSpace> original_view_type;
+  typedef typename original_view_type::value_type original_value_type;
+  typedef typename original_view_type::reference_type original_reference_type;
+  friend class ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutLeft, ScatterDuplicated, contribution, ScatterNonAtomic>;
+  friend class ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutLeft, ScatterDuplicated, contribution, ScatterAtomic>;
+  typedef typename Kokkos::Impl::Experimental::DuplicatedDataType<DataType, Kokkos::LayoutLeft> data_type_info;
+  typedef typename data_type_info::value_type internal_data_type;
+  typedef Kokkos::View<internal_data_type, Kokkos::LayoutLeft, ExecSpace> internal_view_type;
+
+  ScatterView()
+  {
+  }
+
+  template <typename RT, typename ... RP >
+  ScatterView(View<RT, RP...> const& original_view)
+  : unique_token()
+  {
+    size_t arg_N[8] = {
+      original_view.extent(0),
+      original_view.extent(1),
+      original_view.extent(2),
+      original_view.extent(3),
+      original_view.extent(4),
+      original_view.extent(5),
+      original_view.extent(6),
+      0
+    };
+    arg_N[internal_view_type::rank - 1] = unique_token.size();
+    internal_view = internal_view_type(
+        Kokkos::ViewAllocateWithoutInitializing(
+          std::string("duplicated_") + original_view.label()),
+        arg_N[0], arg_N[1], arg_N[2], arg_N[3],
+        arg_N[4], arg_N[5], arg_N[6], arg_N[7]);
+    reset();
+  }
+
+  template <typename ... Dims>
+  ScatterView(std::string const& name, Dims ... dims)
+  : internal_view(Kokkos::ViewAllocateWithoutInitializing(name), dims ..., unique_token.size())
+  {
+    reset();
+  }
+
+  template <int override_contribution = contribution>
+  inline
+  ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutLeft, ScatterDuplicated, contribution, override_contribution>
+  access() const {
+    return ScatterAccess<DataType, Op, ExecSpace, Kokkos::LayoutLeft, ScatterDuplicated, contribution, override_contribution>{*this};
+  }
+
+  typename Kokkos::Impl::Experimental::Slice<
+    Kokkos::LayoutLeft, internal_view_type::rank, internal_view_type>::value_type
+  subview() const
+  {
+    return Kokkos::Impl::Experimental::Slice<
+      Kokkos::LayoutLeft, internal_view_type::rank, internal_view_type>::get(internal_view, 0);
+  }
+
+  template <typename ... RP>
+  void contribute_into(View<DataType, RP...> const& dest) const
+  {
+    typedef View<DataType, RP...> dest_type;
+    static_assert(std::is_same<
+        typename dest_type::array_layout,
+        Kokkos::LayoutLeft>::value,
+        "ScatterView contribute destination has different layout");
+    static_assert(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
+        typename ExecSpace::memory_space,
+        typename dest_type::memory_space>::value,
+        "ScatterView contribute destination memory space not accessible");
+    auto extent = internal_view.extent(
+        internal_view_type::rank - 1);
+    bool is_equal = (dest.data() == internal_view.data());
+    size_t start = is_equal ? 1 : 0;
+    Kokkos::Impl::Experimental::ReduceDuplicates<ExecSpace, original_value_type, Op>(
+        internal_view.data(),
+        dest.data(),
+        internal_view.stride(internal_view_type::rank - 1),
+        start,
+        extent,
+        internal_view.label());
+  }
+
+  void reset() {
+    Kokkos::Impl::Experimental::ResetDuplicates<ExecSpace, original_value_type, Op>(
+        internal_view.data(),
+        internal_view.size(),
+        internal_view.label());
+  }
+  template <typename DT, typename ... RP>
+  void reset_except(View<DT, RP...> const& view) {
+    if (view.data() != internal_view.data()) {
+      reset();
+      return;
+    }
+    Kokkos::Impl::Experimental::ResetDuplicates<ExecSpace, original_value_type, Op>(
+        internal_view.data() + view.size(),
+        internal_view.size() - view.size(),
+        internal_view.label());
+  }
+
+  void resize(const size_t n0 = 0,
+           const size_t n1 = 0,
+           const size_t n2 = 0,
+           const size_t n3 = 0,
+           const size_t n4 = 0,
+           const size_t n5 = 0,
+           const size_t n6 = 0) {
+
+    size_t arg_N[8] = {n0,n1,n2,n3,n4,n5,n6,0};
+    const int i = internal_view.rank-1;
+    arg_N[i] = unique_token.size();
+
+    ::Kokkos::resize(internal_view,
+        arg_N[0], arg_N[1], arg_N[2], arg_N[3],
+        arg_N[4], arg_N[5], arg_N[6], arg_N[7]);
+  }
+
+  void realloc(const size_t n0 = 0,
+           const size_t n1 = 0,
+           const size_t n2 = 0,
+           const size_t n3 = 0,
+           const size_t n4 = 0,
+           const size_t n5 = 0,
+           const size_t n6 = 0) {
+
+    size_t arg_N[8] = {n0,n1,n2,n3,n4,n5,n6,0};
+    const int i = internal_view.rank-1;
+    arg_N[i] = unique_token.size();
+
+    ::Kokkos::realloc(internal_view,
+        arg_N[0], arg_N[1], arg_N[2], arg_N[3],
+        arg_N[4], arg_N[5], arg_N[6], arg_N[7]);
+  }
+
+protected:
+  template <typename ... Args>
+  inline original_reference_type at(int thread_id, Args ... args) const {
+    return internal_view(args..., thread_id);
+  }
+
+protected:
+  typedef Kokkos::Experimental::UniqueToken<
+      ExecSpace, Kokkos::Experimental::UniqueTokenScope::Global> unique_token_type;
+
+  unique_token_type unique_token;
+  internal_view_type internal_view;
+};
+
+
+/* This object has to be separate in order to store the thread ID, which cannot
+   be obtained until one is inside a parallel construct, and may be relatively
+   expensive to obtain at every contribution
+   (calls a non-inlined function, looks up a thread-local variable).
+   Due to the expense, it is sensible to query it at most once per parallel iterate
+   (ideally once per thread, but parallel_for doesn't expose that)
+   and then store it in a stack variable.
+   ScatterAccess serves as a non-const object on the stack which can store the thread ID */
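+/* The intended pattern inside a parallel body is therefore (names illustrative):
+     auto access = scatter.access();  // acquires the thread ID once
+     access(idx0) += val0;            // then contributes any number of times
+     access(idx1) += val1;
+*/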
+
+template <typename DataType
+         ,int Op
+         ,typename ExecSpace
+         ,typename Layout
+         ,int contribution
+         ,int override_contribution
+         >
+class ScatterAccess<DataType
+                   ,Op
+                   ,ExecSpace
+                   ,Layout
+                   ,ScatterDuplicated
+                   ,contribution
+                   ,override_contribution>
+{
+public:
+  typedef ScatterView<DataType, Layout, ExecSpace, Op, ScatterDuplicated, contribution> view_type;
+  typedef typename view_type::original_value_type original_value_type;
+  typedef Kokkos::Impl::Experimental::ScatterValue<
+      original_value_type, Op, override_contribution> value_type;
+
+  inline ScatterAccess(view_type const& view_in)
+    : view(view_in)
+    , thread_id(view_in.unique_token.acquire()) {
+  }
+
+  inline ~ScatterAccess() {
+    if (thread_id != ~thread_id_type(0)) view.unique_token.release(thread_id);
+  }
+
+  template <typename ... Args>
+  KOKKOS_FORCEINLINE_FUNCTION
+  value_type operator()(Args ... args) const {
+    return view.at(thread_id, args...);
+  }
+
+  template <typename Arg>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<view_type::original_view_type::rank == 1 &&
+  std::is_integral<Arg>::value, value_type>::type
+  operator[](Arg arg) const {
+    return view.at(thread_id, arg);
+  }
+
+private:
+
+  view_type const& view;
+
+  // simplify RAII by disallowing copies
+  ScatterAccess(ScatterAccess const& other) = delete;
+  ScatterAccess& operator=(ScatterAccess const& other) = delete;
+  ScatterAccess& operator=(ScatterAccess&& other) = delete;
+
+public:
+  // do need to allow moves though, for the common
+  // auto b = a.access();
+  // since that assignment turns into a move constructor call
+  inline ScatterAccess(ScatterAccess&& other)
+    : view(other.view)
+    , thread_id(other.thread_id)
+  {
+    other.thread_id = ~thread_id_type(0);
+  }
+
+private:
+
+  typedef typename view_type::unique_token_type unique_token_type;
+  typedef typename unique_token_type::size_type thread_id_type;
+  thread_id_type thread_id;
+};
+
+template <int Op = Kokkos::Experimental::ScatterSum,
+          int duplication = -1,
+          int contribution = -1,
+          typename RT, typename ... RP>
+ScatterView
+  < RT
+  , typename ViewTraits<RT, RP...>::array_layout
+  , typename ViewTraits<RT, RP...>::execution_space
+  , Op
+  /* just setting defaults if not specified... things got messy because the view type
+     does not come before the duplication/contribution settings in the
+     template parameter list */
+  , duplication == -1 ? Kokkos::Impl::Experimental::DefaultDuplication<typename ViewTraits<RT, RP...>::execution_space>::value : duplication
+  , contribution == -1 ?
+      Kokkos::Impl::Experimental::DefaultContribution<
+                        typename ViewTraits<RT, RP...>::execution_space,
+                        (duplication == -1 ?
+                           Kokkos::Impl::Experimental::DefaultDuplication<
+                             typename ViewTraits<RT, RP...>::execution_space
+                             >::value
+                                           : duplication
+                        )
+                        >::value
+                       : contribution
+  >
+create_scatter_view(View<RT, RP...> const& original_view) {
+  return original_view; // implicit ScatterView constructor call
+}
+
+}} // namespace Kokkos::Experimental
+
+namespace Kokkos {
+namespace Experimental {
+
+template <typename DT1, typename DT2, typename LY, typename ES,  int OP, int CT, int DP, typename ... VP>
+void
+contribute(View<DT1, VP...>& dest, Kokkos::Experimental::ScatterView<DT2, LY, ES, OP, CT, DP> const& src)
+{
+  src.contribute_into(dest);
+}
+
+}} // namespace Kokkos::Experimental
+
+namespace Kokkos {
+
+template <typename DT, typename LY, typename ES,  int OP, int CT, int DP, typename ... IS>
+void
+realloc(Kokkos::Experimental::ScatterView<DT, LY, ES, OP, CT, DP>& scatter_view, IS ... is)
+{
+  scatter_view.realloc(is ...);
+}
+
+template <typename DT, typename LY, typename ES,  int OP, int CT, int DP, typename ... IS>
+void
+resize(Kokkos::Experimental::ScatterView<DT, LY, ES, OP, CT, DP>& scatter_view, IS ... is)
+{
+  scatter_view.resize(is ...);
+}
+
+} // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..500bf8f424e9fd9ef243078147e46adecc8bea46
--- /dev/null
+++ b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
@@ -0,0 +1,472 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STATICCRSGRAPH_HPP
+#define KOKKOS_STATICCRSGRAPH_HPP
+
+#include <string>
+#include <vector>
+
+#include <Kokkos_Core.hpp>
+
+namespace Kokkos {
+
+namespace Impl {
+  template<class RowOffsetsType, class RowBlockOffsetsType>
+  struct StaticCrsGraphBalancerFunctor {
+    typedef typename RowOffsetsType::non_const_value_type int_type;
+    RowOffsetsType row_offsets;
+    RowBlockOffsetsType row_block_offsets;
+
+    int_type cost_per_row, num_blocks;
+
+    StaticCrsGraphBalancerFunctor(RowOffsetsType row_offsets_,
+                                  RowBlockOffsetsType row_block_offsets_,
+                                  int_type cost_per_row_, int_type num_blocks_):
+                                    row_offsets(row_offsets_),
+                                    row_block_offsets(row_block_offsets_),
+                                    cost_per_row(cost_per_row_),
+                                    num_blocks(num_blocks_){}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int_type& iRow) const {
+      const int_type num_rows = row_offsets.extent(0)-1;
+      const int_type num_entries = row_offsets(num_rows);
+      const int_type total_cost = num_entries + num_rows*cost_per_row;
+
+      const double cost_per_workset = 1.0*total_cost/num_blocks;
+
+      const int_type row_cost = row_offsets(iRow+1)-row_offsets(iRow) + cost_per_row;
+
+      int_type count = row_offsets(iRow+1) + cost_per_row*iRow;
+
+      if(iRow == num_rows-1) row_block_offsets(num_blocks) = num_rows;
+
+      if(true) {
+        int_type current_block = (count-row_cost-cost_per_row)/cost_per_workset;
+        int_type end_block = count/cost_per_workset;
+
+        // Handle some corner cases for the last two blocks.
+        if(current_block >= num_blocks-2) {
+          if((current_block == num_blocks-2) && (count >= (current_block + 1) * cost_per_workset)) {
+            int_type row = iRow;
+            int_type cc = count-row_cost-cost_per_row;
+            int_type block = cc/cost_per_workset;
+            while((block>0) && (block==current_block)) {
+              cc = row_offsets(row)+row*cost_per_row;
+              block = cc/cost_per_workset;
+              row--;
+            }
+            if((count-cc-row_cost-cost_per_row) < num_entries-row_offsets(iRow+1)) {
+              row_block_offsets(current_block+1) = iRow+1;
+            } else {
+              row_block_offsets(current_block+1) = iRow;
+            }
+          }
+        } else {
+          if((count >= (current_block + 1) * cost_per_workset) ||
+             (iRow+2 == row_offsets.extent(0))) {
+            if(end_block>current_block+1) {
+              int_type num_block = end_block-current_block;
+              row_block_offsets(current_block+1) = iRow;
+              for(int_type block = current_block+2; block <= end_block; block++)
+                if((block<current_block+2+(num_block-1)/2))
+                  row_block_offsets(block) = iRow;
+                else
+                  row_block_offsets(block) = iRow+1;
+            } else {
+              row_block_offsets(current_block+1) = iRow+1;
+            }
+          }
+        }
+
+      }
+    }
+  };
+}
+
+/// \class GraphRowViewConst
+/// \brief View of a row of a sparse graph.
+/// \tparam GraphType Sparse graph type, such as (but not limited to) StaticCrsGraph.
+///
+/// This class provides a generic view of a row of a sparse graph.
+/// We intended this class to view a row of a StaticCrsGraph, but
+/// GraphType need not necessarily be StaticCrsGraph.
+///
+/// The row view is suited for computational kernels like sparse
+/// matrix-vector multiply.  The view is always const; it does not
+/// allow modification of the graph.
+///
+/// Here is an example loop over the entries in the row:
+/// \code
+/// typedef typename GraphRowViewConst<GraphType>::ordinal_type ordinal_type;
+///
+/// GraphRowViewConst<GraphType> G_i = ...;
+/// const ordinal_type numEntries = G_i.length;
+/// for (ordinal_type k = 0; k < numEntries; ++k) {
+///   ordinal_type j = G_i.colidx (k);
+///   // ... do something with the column index j ...
+/// }
+/// \endcode
+///
+/// GraphType must provide the \c data_type
+/// typedef. In addition, it must make sense to use GraphRowViewConst to
+/// view a row of GraphType. In particular, column
+/// indices of a row must be accessible using the <tt>entries</tt>
+/// resp. <tt>colidx</tt> arrays given to the constructor of this
+/// class, with a constant <tt>stride</tt> between successive entries.
+/// The stride is one for the compressed sparse row storage format (as
+/// is used by CrsMatrix), but may be greater than one for other
+/// sparse matrix storage formats (e.g., ELLPACK or jagged diagonal).
+template<class GraphType>
+struct GraphRowViewConst {
+  //! The type of the column indices in the row.
+  typedef const typename GraphType::data_type ordinal_type;
+
+private:
+  //! Array of (local) column indices in the row.
+  ordinal_type* colidx_;
+  /// \brief Stride between successive entries in the row.
+  ///
+  /// For compressed sparse row (CSR) storage, this is always one.
+  /// This might be greater than one for storage formats like ELLPACK
+  /// or Jagged Diagonal.  Nevertheless, the stride can never be
+  /// greater than the number of rows or columns in the matrix.  Thus,
+  /// \c ordinal_type is the correct type.
+  const ordinal_type stride_;
+
+public:
+  /// \brief Constructor
+  ///
+  /// \param colidx_in [in] Array of the row's column indices.
+  /// \param stride [in] (Constant) stride between successive entries
+  ///   in the colidx array.
+  /// \param count [in] Number of entries in the row.
+  KOKKOS_INLINE_FUNCTION
+  GraphRowViewConst ( ordinal_type* const colidx_in,
+                      const ordinal_type& stride,
+                      const ordinal_type& count) :
+    colidx_ (colidx_in), stride_ (stride), length (count)
+  {}
+
+  /// \brief Constructor with offset into \c colidx array
+  ///
+  /// \param colidx_in [in] Array of the row's column indices.
+  /// \param stride [in] (Constant) stride between successive entries
+  ///   in the colidx array.
+  /// \param count [in] Number of entries in the row.
+  /// \param idx [in] Start offset into \c colidx array
+  ///
+  /// \tparam OffsetType The type of \c idx (see above).  Must be a
+  ///   built-in integer type.  This may differ from ordinal_type.
+  ///   For example, the matrix may have dimensions that fit in int,
+  ///   but a number of entries that does not fit in int.
+  template<class OffsetType>
+  KOKKOS_INLINE_FUNCTION
+  GraphRowViewConst ( const typename GraphType::entries_type& colidx_in,
+                      const ordinal_type& stride,
+                      const ordinal_type& count,
+                      const OffsetType& idx,
+                      const typename std::enable_if<std::is_integral<OffsetType>::value, int>::type& = 0) :
+    colidx_ (&colidx_in(idx)), stride_ (stride), length (count)
+  {}
+
+  /// \brief Number of entries in the row.
+  ///
+  /// This is a public const field rather than a public const method,
+  /// in order to avoid possible overhead of a method call if the
+  /// compiler is unable to inline that method call.
+  ///
+  /// We assume that rows contain no duplicate entries (i.e., entries
+  /// with the same column index).  Thus, a row may have at most as many
+  /// entries as the graph has columns.  This means that the correct type of
+  /// 'length' is ordinal_type.
+  const ordinal_type length;
+
+  /// \brief (Const) reference to the column index of entry i in this
+  ///   row of the sparse matrix.
+  ///
+  /// "Entry i" is not necessarily the entry with column index i, nor
+  /// does i necessarily correspond to the (local) row index.
+  KOKKOS_INLINE_FUNCTION
+  ordinal_type& colidx (const ordinal_type& i) const {
+    return colidx_[i*stride_];
+  }
+
+  /// \brief An alias for colidx
+  KOKKOS_INLINE_FUNCTION
+  ordinal_type& operator()(const ordinal_type& i) const {
+    return colidx(i);
+  }
+};
+
+
+/// \class StaticCrsGraph
+/// \brief Compressed row storage array.
+///
+/// \tparam DataType The type of stored entries.  If a StaticCrsGraph is
+///   used as the graph of a sparse matrix, then this is usually an
+///   integer type, the type of the column indices in the sparse
+///   matrix.
+///
+/// \tparam Arg1Type The second template parameter, corresponding
+///   either to the Device type (if there are no more template
+///   parameters) or to the Layout type (if there is at least one more
+///   template parameter).
+///
+/// \tparam Arg2Type The third template parameter, which if provided
+///   corresponds to the Device type.
+///
+/// \tparam SizeType The type of row offsets.  Usually the default
+///   parameter suffices.  However, setting a nondefault value is
+///   necessary in some cases, for example, if you want to have a
+///   sparse matrices with dimensions (and therefore column indices)
+///   that fit in \c int, but want to store more than <tt>INT_MAX</tt>
+///   entries in the sparse matrix.
+///
+/// A row has a range of entries:
+/// <ul>
+/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li>
+/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li>
+/// <li> <tt> entries( entry ,            i2 , i3 , ... ); </tt> </li>
+/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li>
+/// </ul>
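+///
+/// For example, a graph whose three rows contain the entries {0,1}, {2},
+/// and {} could be stored as <tt> row_map = {0, 2, 3, 3} </tt> and
+/// <tt> entries = {0, 1, 2} </tt>; row \c i then owns
+/// <tt> entries[ row_map[i] .. row_map[i+1]-1 ] </tt>.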
+template< class DataType,
+          class Arg1Type,
+          class Arg2Type = void,
+          typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void >::size_type>
+class StaticCrsGraph {
+private:
+  typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits;
+
+public:
+  typedef DataType                                            data_type;
+  typedef typename traits::array_layout                       array_layout;
+  typedef typename traits::execution_space                    execution_space;
+  typedef typename traits::device_type                        device_type;
+  typedef SizeType                                            size_type;
+
+  typedef StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type;
+  typedef StaticCrsGraph< DataType , array_layout , typename traits::host_mirror_space , SizeType > HostMirror;
+  typedef View< const size_type* , array_layout, device_type >  row_map_type;
+  typedef View<       DataType*  , array_layout, device_type >  entries_type;
+  typedef View< const size_type* , array_layout, device_type >  row_block_type;
+
+  entries_type entries;
+  row_map_type row_map;
+  row_block_type row_block_offsets;
+
+  //! Construct an empty view.
+  StaticCrsGraph () : entries(), row_map(), row_block_offsets() {}
+
+  //! Copy constructor (shallow copy).
+  StaticCrsGraph (const StaticCrsGraph& rhs) : entries (rhs.entries), row_map (rhs.row_map),
+                                               row_block_offsets(rhs.row_block_offsets)
+  {}
+
+  template<class EntriesType, class RowMapType>
+  StaticCrsGraph (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_),
+  row_block_offsets()
+  {}
+
+  /** \brief  Assign to a view of the rhs array.
+   *          If the old view is the last view
+   *          then allocated memory is deallocated.
+   */
+  StaticCrsGraph& operator= (const StaticCrsGraph& rhs) {
+    entries = rhs.entries;
+    row_map = rhs.row_map;
+    row_block_offsets = rhs.row_block_offsets;
+    return *this;
+  }
+
+  /**  \brief  Destroy this view of the array.
+   *           If the last view then allocated memory is deallocated.
+   */
+  ~StaticCrsGraph() {}
+
+  /**  \brief  Return number of rows in the graph
+   */
+  KOKKOS_INLINE_FUNCTION
+  size_type numRows() const {
+    return (row_map.extent(0) != 0) ?
+      row_map.extent(0) - static_cast<size_type> (1) :
+      static_cast<size_type> (0);
+  }
+
+  /// \brief Return a const view of row i of the graph.
+  ///
+  /// If row i does not belong to the graph, return an empty view.
+  ///
+  /// The returned object \c view implements the following interface:
+  /// <ul>
+  /// <li> \c view.length is the number of entries in the row </li>
+  /// <li> \c view.colidx(k) returns a const reference to the
+  ///      column index of the k-th entry in the row </li>
+  /// </ul>
+  /// k is not a column index; it just counts from 0 to
+  /// <tt>view.length - 1</tt>.
+  ///
+  /// Users should not rely on the return type of this method.  They
+  /// should instead assign to 'auto'.  That allows compile-time
+  /// polymorphism for different kinds of sparse matrix formats (e.g.,
+  /// ELLPACK or Jagged Diagonal) that we may wish to support in the
+  /// future.
+  KOKKOS_INLINE_FUNCTION
+  GraphRowViewConst<StaticCrsGraph> rowConst (const data_type i) const {
+    const size_type start = row_map(i);
+    // count is guaranteed to fit in ordinal_type, as long as no row
+    // has duplicate entries.
+    const data_type count = static_cast<data_type> (row_map(i+1) - start);
+
+    if (count == 0) {
+      return GraphRowViewConst<StaticCrsGraph> (NULL, 1, 0);
+    } else {
+      return GraphRowViewConst<StaticCrsGraph> (entries, 1, count, start);
+    }
+  }
+
+  /**  \brief  Create a row partitioning into a given number of blocks
+   *           balancing non-zeros + a fixed cost per row.
+   */
+  void create_block_partitioning(size_type num_blocks, size_type fix_cost_per_row = 4) {
+    View< size_type* , array_layout, device_type >
+      block_offsets("StaticCrsGraph::load_balance_offsets",num_blocks+1);
+
+    Impl::StaticCrsGraphBalancerFunctor<row_map_type,View< size_type* , array_layout, device_type > >
+      partitioner(row_map,block_offsets,fix_cost_per_row,num_blocks);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0,numRows()),partitioner);
+    Kokkos::fence();
+
+    row_block_offsets = block_offsets;
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< class StaticCrsGraphType , class InputSizeType >
+typename StaticCrsGraphType::staticcrsgraph_type
+create_staticcrsgraph( const std::string & label ,
+                 const std::vector< InputSizeType > & input );
+
+template< class StaticCrsGraphType , class InputSizeType >
+typename StaticCrsGraphType::staticcrsgraph_type
+create_staticcrsgraph( const std::string & label ,
+                 const std::vector< std::vector< InputSizeType > > & input );
+
+//----------------------------------------------------------------------------
+
+template< class DataType ,
+          class Arg1Type ,
+          class Arg2Type ,
+          typename SizeType >
+typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & input );
+
+template< class DataType ,
+          class Arg1Type ,
+          class Arg2Type ,
+          typename SizeType >
+typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & input );
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#include <impl/Kokkos_StaticCrsGraph_factory.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class GraphType >
+struct StaticCrsGraphMaximumEntry {
+
+  typedef typename GraphType::execution_space execution_space ;
+  typedef typename GraphType::data_type value_type ;
+
+  const typename GraphType::entries_type entries ;
+
+  StaticCrsGraphMaximumEntry( const GraphType & graph ) : entries( graph.entries ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned i , value_type & update ) const
+    { if ( update < entries(i) ) update = entries(i); }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & update ) const
+    { update = 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & update ,
+             volatile const value_type & input ) const
+    { if ( update < input ) update = input ; }
+};
+
+}
+
+template< class DataType, class Arg1Type, class Arg2Type, typename SizeType >
+DataType maximum_entry( const StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > & graph )
+{
+  typedef StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType> GraphType ;
+  typedef Impl::StaticCrsGraphMaximumEntry< GraphType > FunctorType ;
+
+  DataType result = 0 ;
+  Kokkos::parallel_reduce( graph.entries.extent(0),
+                           FunctorType(graph), result );
+  return result ;
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_STATICCRSGRAPH_HPP */
+
diff --git a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..64601e6b5989678f6dcf24ef704f9e0ca9a6452a
--- /dev/null
+++ b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
@@ -0,0 +1,850 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_UnorderedMap.hpp
+/// \brief Declaration and definition of Kokkos::UnorderedMap.
+///
+/// This header file declares and defines Kokkos::UnorderedMap and its
+/// related nonmember functions.
+
+#ifndef KOKKOS_UNORDERED_MAP_HPP
+#define KOKKOS_UNORDERED_MAP_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Functional.hpp>
+
+#include <Kokkos_Bitset.hpp>
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_UnorderedMap_impl.hpp>
+
+
+#include <iostream>
+
+#include <cstdint>
+#include <stdexcept>
+
+
+namespace Kokkos {
+
+enum { UnorderedMapInvalidIndex = ~0u };
+
+/// \brief Return value of UnorderedMap::insert().
+///
+/// Inserting an element into an UnorderedMap is not guaranteed to
+/// succeed.  There are three possible outcomes, reported by the
+/// accessors below:
+/// <ol>
+/// <li> failed(): The insert failed.  This usually
+///      means that the UnorderedMap ran out of space. </li>
+/// <li> success(): The insert succeeded, and the key
+///      did <i>not</i> exist in the table before. </li>
+/// <li> existing(): The insert succeeded, and the key
+///      <i>did</i> exist in the table before.  The new value was
+///      ignored and the old value was left in place. </li>
+/// </ol>
+
+class UnorderedMapInsertResult
+{
+private:
+  enum Status{
+     SUCCESS = 1u << 31
+   , EXISTING = 1u << 30
+   , FREED_EXISTING = 1u << 29
+   , LIST_LENGTH_MASK = ~(SUCCESS | EXISTING | FREED_EXISTING)
+  };
+
+public:
+  /// Did the map successfully insert the key/value pair
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool success() const { return (m_status & SUCCESS); }
+
+  /// Was the key already present in the map
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool existing() const { return (m_status & EXISTING); }
+
+  /// Did the map fail to insert the key due to insufficient capacity
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool failed() const { return m_index == UnorderedMapInvalidIndex; }
+
+  /// Did the map lose a race to insert a duplicate key/value pair,
+  /// claiming an index that then had to be released
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool freed_existing() const { return (m_status & FREED_EXISTING); }
+
+  /// How many iterations through the insert loop did it take before the
+  /// map returned
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint32_t list_position() const { return (m_status & LIST_LENGTH_MASK); }
+
+  /// Index where the key can be found as long as the insert did not fail
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint32_t index() const { return m_index; }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  UnorderedMapInsertResult()
+    : m_index(UnorderedMapInvalidIndex)
+    , m_status(0)
+  {}
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void increment_list_position()
+  {
+    m_status += (list_position() < LIST_LENGTH_MASK) ? 1u : 0u;
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void set_existing(uint32_t i, bool arg_freed_existing)
+  {
+    m_index = i;
+    m_status = EXISTING | (arg_freed_existing ? FREED_EXISTING : 0u) | list_position();
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void set_success(uint32_t i)
+  {
+    m_index = i;
+    m_status = SUCCESS | list_position();
+  }
+
+private:
+  uint32_t m_index;
+  uint32_t m_status;
+};
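+
+/* A minimal sketch of how an insert result is typically inspected inside a
+ * device kernel; the map, keys, and values names below are placeholders and
+ * are not part of this header:
+ *
+ *   Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {
+ *     Kokkos::UnorderedMapInsertResult r = map.insert(keys(i), values(i));
+ *     if      (r.success())  { }  // key was new; stored at r.index()
+ *     else if (r.existing()) { }  // key already present; old value kept
+ *     else if (r.failed())   { }  // out of capacity; rehash on the host and retry
+ *   });
+ */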
+
+/// \class UnorderedMap
+/// \brief Thread-safe, performance-portable lookup table.
+///
+/// This class provides a lookup table.  In terms of functionality,
+/// this class compares to std::unordered_map (new in C++11).
+/// "Unordered" means that keys are not stored in any particular
+/// order, unlike (for example) std::map.  "Thread-safe" means that
+/// lookups, insertion, and deletion are safe to call by multiple
+/// threads in parallel.  "Performance-portable" means that parallel
+/// performance of these operations is reasonable, on multiple
+/// hardware platforms.  Platforms on which performance has been
+/// tested include conventional Intel x86 multicore processors, Intel
+/// Xeon Phi ("MIC"), and NVIDIA GPUs.
+///
+/// Parallel performance portability entails design decisions that
+/// might differ from one's expectation for a sequential interface.
+/// This particularly affects insertion of single elements.  In an
+/// interface intended for sequential use, insertion might reallocate
+/// memory if the original allocation did not suffice to hold the new
+/// element.  In this class, insertion does <i>not</i> reallocate
+/// memory.  This means that it might fail.  insert() returns a result
+/// object which indicates whether the insert failed.  There are three
+/// possible outcomes:
+/// <ol>
+/// <li> failed(): The insert failed.  This usually
+///      means that the UnorderedMap ran out of space. </li>
+/// <li> success(): The insert succeeded, and the key
+///      did <i>not</i> exist in the table before. </li>
+/// <li> existing(): The insert succeeded, and the key
+///      <i>did</i> exist in the table before.  The new value was
+///      ignored and the old value was left in place. </li>
+/// </ol>
+///
+/// \tparam Key Type of keys of the lookup table.  If \c const, users
+///   are not allowed to add or remove keys, though they are allowed
+///   to change values.  In that case, the implementation may make
+///   optimizations specific to the <tt>Device</tt>.  For example, if
+///   <tt>Device</tt> is \c Cuda, it may use texture fetches to access
+///   keys.
+///
+/// \tparam Value Type of values stored in the lookup table.  You may use
+///   \c void here, in which case the table will be a set of keys.  If
+///   \c const, users are not allowed to change entries.
+///   In that case, the implementation may make
+///   optimizations specific to the \c Device, such as using texture
+///   fetches to access values.
+///
+/// \tparam Device The Kokkos Device type.
+///
+/// \tparam Hasher Definition of the hash function for instances of
+///   <tt>Key</tt>.  The default will calculate a bitwise hash.
+///
+/// \tparam EqualTo Definition of the equality function for instances of
+///   <tt>Key</tt>.  The default will do a bitwise equality comparison.
+///
+template <   typename Key
+           , typename Value
+           , typename Device = Kokkos::DefaultExecutionSpace
+           , typename Hasher = pod_hash<typename Impl::remove_const<Key>::type>
+           , typename EqualTo = pod_equal_to<typename Impl::remove_const<Key>::type>
+        >
+class UnorderedMap
+{
+private:
+  typedef typename ViewTraits<Key,Device,void,void>::host_mirror_space host_mirror_space ;
+public:
+  //! \name Public types and constants
+  //@{
+
+  //key_types
+  typedef Key declared_key_type;
+  typedef typename Impl::remove_const<declared_key_type>::type key_type;
+  typedef typename Impl::add_const<key_type>::type const_key_type;
+
+  //value_types
+  typedef Value declared_value_type;
+  typedef typename Impl::remove_const<declared_value_type>::type value_type;
+  typedef typename Impl::add_const<value_type>::type const_value_type;
+
+  typedef Device device_type;
+  typedef typename Device::execution_space execution_space;
+  typedef Hasher hasher_type;
+  typedef EqualTo  equal_to_type;
+  typedef uint32_t size_type;
+
+  //map_types
+  typedef UnorderedMap<declared_key_type,declared_value_type,device_type,hasher_type,equal_to_type> declared_map_type;
+  typedef UnorderedMap<key_type,value_type,device_type,hasher_type,equal_to_type>                   insertable_map_type;
+  typedef UnorderedMap<const_key_type,value_type,device_type,hasher_type,equal_to_type>             modifiable_map_type;
+  typedef UnorderedMap<const_key_type,const_value_type,device_type,hasher_type,equal_to_type>       const_map_type;
+
+  static const bool is_set = std::is_same<void,value_type>::value;
+  static const bool has_const_key = std::is_same<const_key_type,declared_key_type>::value;
+  static const bool has_const_value = is_set || std::is_same<const_value_type,declared_value_type>::value;
+
+  static const bool is_insertable_map = !has_const_key && (is_set || !has_const_value);
+  static const bool is_modifiable_map = has_const_key && !has_const_value;
+  static const bool is_const_map = has_const_key && has_const_value;
+
+
+  typedef UnorderedMapInsertResult insert_result;
+
+  typedef UnorderedMap<Key,Value,host_mirror_space,Hasher,EqualTo> HostMirror;
+
+  typedef Impl::UnorderedMapHistogram<const_map_type> histogram_type;
+
+  //@}
+
+private:
+  enum { invalid_index = ~static_cast<size_type>(0) };
+
+  typedef typename Impl::if_c< is_set, int, declared_value_type>::type impl_value_type;
+
+  typedef typename Impl::if_c<   is_insertable_map
+                               , View< key_type *, device_type>
+                               , View< const key_type *, device_type, MemoryTraits<RandomAccess> >
+                             >::type key_type_view;
+
+  typedef typename Impl::if_c<   is_insertable_map || is_modifiable_map
+                               , View< impl_value_type *, device_type>
+                               , View< const impl_value_type *, device_type, MemoryTraits<RandomAccess> >
+                             >::type value_type_view;
+
+  typedef typename Impl::if_c<   is_insertable_map
+                               , View< size_type *, device_type>
+                               , View< const size_type *, device_type, MemoryTraits<RandomAccess> >
+                             >::type size_type_view;
+
+  typedef typename Impl::if_c<   is_insertable_map
+                               , Bitset< execution_space >
+                               , ConstBitset< execution_space>
+                             >::type bitset_type;
+
+  enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 };
+  enum { num_scalars = 3 };
+  typedef View< int[num_scalars], LayoutLeft, device_type> scalars_view;
+
+public:
+  //! \name Public member functions
+  //@{
+
+  UnorderedMap()
+    : m_bounded_insert()
+    , m_hasher()
+    , m_equal_to()
+    , m_size()
+    , m_available_indexes()
+    , m_hash_lists()
+    , m_next_index()
+    , m_keys()
+    , m_values()
+    , m_scalars()
+  {}
+
+  /// \brief Constructor
+  ///
+  /// \param capacity_hint [in] Initial guess of how many unique keys will be inserted into the map
+  /// \param hasher [in] Hash function for \c Key instances.  The
+  ///   default value usually suffices.
+  /// \param equal_to [in] Equality function for \c Key instances.  The
+  ///   default value usually suffices.
+  UnorderedMap(  size_type capacity_hint, hasher_type hasher = hasher_type(), equal_to_type equal_to = equal_to_type() )
+    : m_bounded_insert(true)
+    , m_hasher(hasher)
+    , m_equal_to(equal_to)
+    , m_size()
+    , m_available_indexes(calculate_capacity(capacity_hint))
+    , m_hash_lists(ViewAllocateWithoutInitializing("UnorderedMap hash list"), Impl::find_hash_size(capacity()))
+    , m_next_index(ViewAllocateWithoutInitializing("UnorderedMap next index"), capacity()+1) // +1 so that the *_at functions can always return a valid reference
+    , m_keys("UnorderedMap keys",capacity()+1)
+    , m_values("UnorderedMap values",(is_set? 1 : capacity()+1))
+    , m_scalars("UnorderedMap scalars")
+  {
+    if (!is_insertable_map) {
+      throw std::runtime_error("Cannot construct a non-insertable (i.e. const key_type) unordered_map");
+    }
+
+    Kokkos::deep_copy(m_hash_lists, invalid_index);
+    Kokkos::deep_copy(m_next_index, invalid_index);
+  }
+
+  void reset_failed_insert_flag()
+  {
+    reset_flag(failed_insert_idx);
+  }
+
+  histogram_type get_histogram()
+  {
+    return histogram_type(*this);
+  }
+
+  //! Clear all entries in the table.
+  void clear()
+  {
+    m_bounded_insert = true;
+
+    if (capacity() == 0) return;
+
+    m_available_indexes.clear();
+
+    Kokkos::deep_copy(m_hash_lists, invalid_index);
+    Kokkos::deep_copy(m_next_index, invalid_index);
+    {
+      const key_type tmp = key_type();
+      Kokkos::deep_copy(m_keys,tmp);
+    }
+    if (is_set){
+      const impl_value_type tmp = impl_value_type();
+      Kokkos::deep_copy(m_values,tmp);
+    }
+    {
+      Kokkos::deep_copy(m_scalars, 0);
+    }
+  }
+
+  /// \brief Change the capacity of the map
+  ///
+  /// If there are no failed inserts, the current size of the map is
+  /// used as a lower bound for the requested capacity.
+  /// If the map is not empty, has no failed inserts, and the
+  /// capacity changes, then the current entries are copied into the
+  /// resized / rehashed map.
+  ///
+  /// This is <i>not</i> a device function; it may <i>not</i> be
+  /// called in a parallel kernel.
+  bool rehash(size_type requested_capacity = 0)
+  {
+    const bool bounded_insert = (capacity() == 0) || (size() == 0u);
+    return rehash(requested_capacity, bounded_insert );
+  }
+
+  bool rehash(size_type requested_capacity, bool bounded_insert)
+  {
+    if(!is_insertable_map) return false;
+
+    const size_type curr_size = size();
+    requested_capacity = (requested_capacity < curr_size) ? curr_size : requested_capacity;
+
+    insertable_map_type tmp(requested_capacity, m_hasher, m_equal_to);
+
+    if (curr_size) {
+      tmp.m_bounded_insert = false;
+      Impl::UnorderedMapRehash<insertable_map_type> f(tmp,*this);
+      f.apply();
+    }
+    tmp.m_bounded_insert = bounded_insert;
+
+    *this = tmp;
+
+    return true;
+  }
+
+  /// \brief The number of entries in the table.
+  ///
+  /// This method has undefined behavior when erasable() is true.
+  ///
+  /// Note that this is not a device function; it cannot be called in
+  /// a parallel kernel.  The value is not stored as a variable; it
+  /// must be computed.
+  size_type size() const
+  {
+    if( capacity() == 0u ) return 0u;
+    if (modified()) {
+      m_size = m_available_indexes.count();
+      reset_flag(modified_idx);
+    }
+    return m_size;
+  }
+
+  /// \brief Whether any insert() call has failed since the last reset.
+  ///
+  /// This is <i>not</i> a device function; it may <i>not</i> be
+  /// called in a parallel kernel.  The flag lives on the device and
+  /// must be copied back to the host.
+  bool failed_insert() const
+  {
+    return get_flag(failed_insert_idx);
+  }
+
+  bool erasable() const
+  {
+    return is_insertable_map ? get_flag(erasable_idx) : false;
+  }
+
+  bool begin_erase()
+  {
+    bool result = !erasable();
+    if (is_insertable_map && result) {
+      execution_space::fence();
+      set_flag(erasable_idx);
+      execution_space::fence();
+    }
+    return result;
+  }
+
+  bool end_erase()
+  {
+    bool result = erasable();
+    if (is_insertable_map && result) {
+      execution_space::fence();
+      Impl::UnorderedMapErase<declared_map_type> f(*this);
+      f.apply();
+      execution_space::fence();
+      reset_flag(erasable_idx);
+    }
+    return result;
+  }
+
+  /// \brief The maximum number of entries that the table can hold.
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type capacity() const
+  { return m_available_indexes.size(); }
+
+  /// \brief The number of hash table "buckets."
+  ///
+  /// This is different than the number of entries that the table can
+  /// hold.  Each key hashes to an index in [0, hash_capacity() - 1].
+  /// That index can hold zero or more entries.  This class decides
+  /// what hash_capacity() should be, given the user's upper bound on
+  /// the number of entries the table must be able to hold.
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.
+  KOKKOS_INLINE_FUNCTION
+  size_type hash_capacity() const
+  { return m_hash_lists.extent(0); }
+
+  //---------------------------------------------------------------------------
+  //---------------------------------------------------------------------------
+
+
+  /// \brief Attempt to insert the given key/value pair.
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.  As discussed in the class documentation, it need not
+  /// succeed.  The return value tells you if it did.
+  ///
+  /// \param k [in] The key to attempt to insert.
+  /// \param v [in] The corresponding value to attempt to insert.  If
+  ///   using this class as a set (with Value = void), then you need not
+  ///   provide this value.
+  KOKKOS_INLINE_FUNCTION
+  insert_result insert(key_type const& k, impl_value_type const&v = impl_value_type()) const
+  {
+    insert_result result;
+
+    if ( !is_insertable_map || capacity() == 0u || m_scalars((int)erasable_idx) ) {
+      return result;
+    }
+
+    if ( !m_scalars((int)modified_idx) ) {
+      m_scalars((int)modified_idx) = true;
+    }
+
+    int volatile & failed_insert_ref = m_scalars((int)failed_insert_idx) ;
+
+    const size_type hash_value = m_hasher(k);
+    const size_type hash_list = hash_value % m_hash_lists.extent(0);
+
+    size_type * curr_ptr   = & m_hash_lists[ hash_list ];
+    size_type new_index    = invalid_index ;
+
+    // Use a floating-point intermediate so the hash_list * capacity() product cannot overflow
+    size_type index_hint = static_cast<size_type>( (static_cast<double>(hash_list) * capacity()) / m_hash_lists.extent(0));
+
+    size_type find_attempts = 0;
+
+    enum : unsigned { bounded_find_attempts = 32u };
+    const size_type max_attempts = (m_bounded_insert && (bounded_find_attempts < m_available_indexes.max_hint()) ) ?
+                                    bounded_find_attempts :
+                                    m_available_indexes.max_hint();
+
+    bool not_done = true ;
+
+#if defined( __MIC__ )
+      #pragma noprefetch
+#endif
+    while ( not_done ) {
+
+      // Continue searching the unordered list for this key,
+      // list will only be appended during insert phase.
+      // Need volatile_load as other threads may be appending.
+      size_type curr = volatile_load(curr_ptr);
+
+      KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
+#if defined( __MIC__ )
+      #pragma noprefetch
+#endif
+      while ( curr != invalid_index && ! m_equal_to( volatile_load(&m_keys[curr]), k) ) {
+        result.increment_list_position();
+        index_hint = curr;
+        curr_ptr = &m_next_index[curr];
+        curr = volatile_load(curr_ptr);
+        KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
+      }
+
+      //------------------------------------------------------------
+      // If key already present then return that index.
+      if ( curr != invalid_index ) {
+
+        const bool free_existing = new_index != invalid_index;
+        if ( free_existing ) {
+          // Previously claimed an unused entry that was not inserted.
+          // Release this unused entry immediately.
+          if (!m_available_indexes.reset(new_index) ) {
+            printf("Unable to free existing\n");
+          }
+
+        }
+
+        result.set_existing(curr, free_existing);
+        not_done = false ;
+      }
+      //------------------------------------------------------------
+      // Key is not currently in the map.
+      // If the thread has claimed an entry try to insert now.
+      else {
+
+        //------------------------------------------------------------
+        // If have not already claimed an unused entry then do so now.
+        if (new_index == invalid_index) {
+
+          bool found = false;
+          // use the hash_list as the flag for the search direction
+          Kokkos::tie(found, index_hint) = m_available_indexes.find_any_unset_near( index_hint, hash_list );
+
+          // If no unused index was found and the attempt budget is exhausted,
+          // record the failed insert; otherwise try to atomically claim the index.
+          if ( !found && ++find_attempts >= max_attempts ) {
+            failed_insert_ref = true;
+            not_done = false ;
+          }
+          else if (m_available_indexes.set(index_hint) ) {
+            new_index = index_hint;
+            // Set key and value
+            KOKKOS_NONTEMPORAL_PREFETCH_STORE(&m_keys[new_index]);
+            m_keys[new_index] = k ;
+
+            if (!is_set) {
+              KOKKOS_NONTEMPORAL_PREFETCH_STORE(&m_values[new_index]);
+              m_values[new_index] = v ;
+            }
+
+            // Do not proceed until key and value are updated in global memory
+            memory_fence();
+          }
+        }
+        else if (failed_insert_ref) {
+          not_done = false;
+        }
+
+        // Attempt to append claimed entry into the list.
+        // Another thread may also be trying to append the same list so protect with atomic.
+        if ( new_index != invalid_index &&
+             curr ==  atomic_compare_exchange(curr_ptr, static_cast<size_type>(invalid_index), new_index) ) {
+          // Succeeded in appending
+          result.set_success(new_index);
+          not_done = false ;
+        }
+      }
+    } // while ( not_done )
+
+    return result ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool erase(key_type const& k) const
+  {
+    bool result = false;
+
+    if(is_insertable_map && 0u < capacity() && m_scalars((int)erasable_idx)) {
+
+      if ( ! m_scalars((int)modified_idx) ) {
+        m_scalars((int)modified_idx) = true;
+      }
+
+      size_type index = find(k);
+      if (valid_at(index)) {
+        m_available_indexes.reset(index);
+        result = true;
+      }
+    }
+
+    return result;
+  }
+
+  /// \brief Find the given key \c k, if it exists in the table.
+  ///
+  /// \return If the key exists in the table, the index of the
+  ///   value corresponding to that key; otherwise, an invalid index.
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.
+  KOKKOS_INLINE_FUNCTION
+  size_type find( const key_type & k) const
+  {
+    size_type curr = 0u < capacity() ? m_hash_lists( m_hasher(k) % m_hash_lists.extent(0) ) : invalid_index ;
+
+    KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
+    while (curr != invalid_index && !m_equal_to( m_keys[curr], k) ) {
+      KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
+      curr = m_next_index[curr];
+    }
+
+    return curr;
+  }
+
+  /// \brief Does the key exist in the map
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.
+  KOKKOS_INLINE_FUNCTION
+  bool exists( const key_type & k) const
+  {
+    return valid_at(find(k));
+  }
+
+
+  /// \brief Get the value with \c i as its direct index.
+  ///
+  /// \param i [in] Index directly into the array of entries.
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.
+  ///
+  /// 'const value_type' via Cuda texture fetch must return by value.
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::if_c< (is_set || has_const_value), impl_value_type, impl_value_type &>::type
+  value_at(size_type i) const
+  {
+    return m_values[ is_set ? 0 : (i < capacity() ? i : capacity()) ];
+  }
+
+  /// \brief Get the key with \c i as its direct index.
+  ///
+  /// \param i [in] Index directly into the array of entries.
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.
+  KOKKOS_FORCEINLINE_FUNCTION
+  key_type key_at(size_type i) const
+  {
+    return m_keys[ i < capacity() ? i : capacity() ];
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool valid_at(size_type i) const
+  {
+    return m_available_indexes.test(i);
+  }
+
+  template <typename SKey, typename SValue>
+  UnorderedMap( UnorderedMap<SKey,SValue,Device,Hasher,EqualTo> const& src,
+                typename Impl::enable_if< Impl::UnorderedMapCanAssign<declared_key_type,declared_value_type,SKey,SValue>::value,int>::type = 0
+              )
+    : m_bounded_insert(src.m_bounded_insert)
+    , m_hasher(src.m_hasher)
+    , m_equal_to(src.m_equal_to)
+    , m_size(src.m_size)
+    , m_available_indexes(src.m_available_indexes)
+    , m_hash_lists(src.m_hash_lists)
+    , m_next_index(src.m_next_index)
+    , m_keys(src.m_keys)
+    , m_values(src.m_values)
+    , m_scalars(src.m_scalars)
+  {}
+
+
+  template <typename SKey, typename SValue>
+  typename Impl::enable_if< Impl::UnorderedMapCanAssign<declared_key_type,declared_value_type,SKey,SValue>::value
+                           ,declared_map_type & >::type
+  operator=( UnorderedMap<SKey,SValue,Device,Hasher,EqualTo> const& src)
+  {
+    m_bounded_insert = src.m_bounded_insert;
+    m_hasher = src.m_hasher;
+    m_equal_to = src.m_equal_to;
+    m_size = src.m_size;
+    m_available_indexes = src.m_available_indexes;
+    m_hash_lists = src.m_hash_lists;
+    m_next_index = src.m_next_index;
+    m_keys = src.m_keys;
+    m_values = src.m_values;
+    m_scalars = src.m_scalars;
+    return *this;
+  }
+
+  template <typename SKey, typename SValue, typename SDevice>
+  typename Impl::enable_if< std::is_same< typename Impl::remove_const<SKey>::type, key_type>::value &&
+                            std::is_same< typename Impl::remove_const<SValue>::type, value_type>::value
+                          >::type
+  create_copy_view( UnorderedMap<SKey, SValue, SDevice, Hasher,EqualTo> const& src)
+  {
+    if (m_hash_lists.data() != src.m_hash_lists.data()) {
+
+      insertable_map_type tmp;
+
+      tmp.m_bounded_insert = src.m_bounded_insert;
+      tmp.m_hasher = src.m_hasher;
+      tmp.m_equal_to = src.m_equal_to;
+      tmp.m_size = src.size();
+      tmp.m_available_indexes = bitset_type( src.capacity() );
+      tmp.m_hash_lists        = size_type_view( ViewAllocateWithoutInitializing("UnorderedMap hash list"), src.m_hash_lists.extent(0) );
+      tmp.m_next_index        = size_type_view( ViewAllocateWithoutInitializing("UnorderedMap next index"), src.m_next_index.extent(0) );
+      tmp.m_keys              = key_type_view( ViewAllocateWithoutInitializing("UnorderedMap keys"), src.m_keys.extent(0) );
+      tmp.m_values            = value_type_view( ViewAllocateWithoutInitializing("UnorderedMap values"), src.m_values.extent(0) );
+      tmp.m_scalars           = scalars_view("UnorderedMap scalars");
+
+      Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes);
+
+      typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, typename SDevice::memory_space > raw_deep_copy;
+
+      raw_deep_copy(tmp.m_hash_lists.data(), src.m_hash_lists.data(), sizeof(size_type)*src.m_hash_lists.extent(0));
+      raw_deep_copy(tmp.m_next_index.data(), src.m_next_index.data(), sizeof(size_type)*src.m_next_index.extent(0));
+      raw_deep_copy(tmp.m_keys.data(), src.m_keys.data(), sizeof(key_type)*src.m_keys.extent(0));
+      if (!is_set) {
+        raw_deep_copy(tmp.m_values.data(), src.m_values.data(), sizeof(impl_value_type)*src.m_values.extent(0));
+      }
+      raw_deep_copy(tmp.m_scalars.data(), src.m_scalars.data(), sizeof(int)*num_scalars );
+
+      *this = tmp;
+    }
+  }
+
+  //@}
+private: // private member functions
+
+  bool modified() const
+  {
+    return get_flag(modified_idx);
+  }
+
+  void set_flag(int flag) const
+  {
+    typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy;
+    const int true_ = true;
+    raw_deep_copy(m_scalars.data() + flag, &true_, sizeof(int));
+  }
+
+  void reset_flag(int flag) const
+  {
+    typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy;
+    const int false_ = false;
+    raw_deep_copy(m_scalars.data() + flag, &false_, sizeof(int));
+  }
+
+  bool get_flag(int flag) const
+  {
+    typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > raw_deep_copy;
+    int result = false;
+    raw_deep_copy(&result, m_scalars.data() + flag, sizeof(int));
+    return result;
+  }
+
+  static uint32_t calculate_capacity(uint32_t capacity_hint)
+  {
+    // increase by roughly 17% (a factor of 7/6) and round up to the nearest multiple of 128
+    return capacity_hint ? ((static_cast<uint32_t>(7ull*capacity_hint/6u) + 127u)/128u)*128u : 128u;
+  }
+
+private: // private members
+  bool              m_bounded_insert;
+  hasher_type       m_hasher;
+  equal_to_type     m_equal_to;
+  mutable size_type m_size;
+  bitset_type       m_available_indexes;
+  size_type_view    m_hash_lists;
+  size_type_view    m_next_index;
+  key_type_view     m_keys;
+  value_type_view   m_values;
+  scalars_view      m_scalars;
+
+  template <typename KKey, typename VValue, typename DDevice, typename HHash, typename EEqualTo>
+  friend class UnorderedMap;
+
+  template <typename UMap>
+  friend struct Impl::UnorderedMapErase;
+
+  template <typename UMap>
+  friend struct Impl::UnorderedMapHistogram;
+
+  template <typename UMap>
+  friend struct Impl::UnorderedMapPrint;
+};
+
+// Specialization of deep_copy for two UnorderedMap objects.
+template <  typename DKey, typename DT, typename DDevice
+          , typename SKey, typename ST, typename SDevice
+          , typename Hasher, typename EqualTo >
+inline void deep_copy(         UnorderedMap<DKey, DT, DDevice, Hasher, EqualTo> & dst
+                       , const UnorderedMap<SKey, ST, SDevice, Hasher, EqualTo> & src )
+{
+  dst.create_copy_view(src);
+}
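+
+/* A minimal host-side workflow sketch, assuming a default device execution
+ * space; all names and sizes below are illustrative only:
+ *
+ *   typedef Kokkos::UnorderedMap<int, double> map_type;
+ *   map_type map(10000);                       // capacity hint
+ *
+ *   Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {
+ *     map.insert(i, 1.0 * i);                  // device-side; may fail if the map is full
+ *   });
+ *
+ *   if (map.failed_insert()) {
+ *     map.rehash(2 * map.capacity());          // grow on the host, then re-run the inserts
+ *   }
+ *
+ *   map_type::HostMirror host_map;
+ *   Kokkos::deep_copy(host_map, map);          // uses the specialization above
+ */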
+
+
+} // namespace Kokkos
+
+#endif //KOKKOS_UNORDERED_MAP_HPP
+
diff --git a/packages/kokkos/containers/src/Kokkos_Vector.hpp b/packages/kokkos/containers/src/Kokkos_Vector.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..08eee5b17ada6c22b2cbe63a9a9f493f67a464b2
--- /dev/null
+++ b/packages/kokkos/containers/src/Kokkos_Vector.hpp
@@ -0,0 +1,292 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VECTOR_HPP
+#define KOKKOS_VECTOR_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_DualView.hpp>
+
+/* Drop-in replacement for std::vector based on Kokkos::DualView.
+ * Most member functions only work on the host (they will not compile
+ * if called from a device kernel).
+ */
+namespace Kokkos {
+
+template< class Scalar, class Arg1Type = void>
+class vector : public DualView<Scalar*,LayoutLeft,Arg1Type> {
+
+public:
+  typedef Scalar value_type;
+  typedef Scalar* pointer;
+  typedef const Scalar* const_pointer;
+  typedef Scalar& reference;
+  typedef const Scalar& const_reference;
+  typedef Scalar* iterator;
+  typedef const Scalar* const_iterator;
+
+private:
+  size_t _size;
+  typedef size_t size_type;
+  float _extra_storage;
+  typedef DualView<Scalar*,LayoutLeft,Arg1Type> DV;
+
+
+public:
+#ifdef KOKKOS_ENABLE_CUDA_UVM
+  KOKKOS_INLINE_FUNCTION reference operator() (int i) const {return DV::h_view(i);};
+  KOKKOS_INLINE_FUNCTION reference operator[] (int i) const {return DV::h_view(i);};
+#else
+  inline reference operator() (int i) const {return DV::h_view(i);};
+  inline reference operator[] (int i) const {return DV::h_view(i);};
+#endif
+
+  /* Member functions which behave like std::vector functions */
+
+  vector():DV() {
+    _size = 0;
+    _extra_storage = 1.1;
+    DV::modified_host() = 1;
+  }
+
+
+  vector(int n, Scalar val=Scalar()):DualView<Scalar*,LayoutLeft,Arg1Type>("Vector",size_t(n*(1.1))) {
+    _size = n;
+    _extra_storage = 1.1;
+    DV::modified_host() = 1;
+
+    assign(n,val);
+  }
+
+
+  void resize(size_t n) {
+    if(n>=capacity())
+      DV::resize(size_t (n*_extra_storage));
+    _size = n;
+  }
+
+  void resize(size_t n, const Scalar& val) {
+    assign(n,val);
+  }
+
+  void assign (size_t n, const Scalar& val) {
+
+    /* Resize if necessary (behaviour of std::vector) */
+
+    if(n>capacity())
+      DV::resize(size_t (n*_extra_storage));
+    _size = n;
+
+    /* Assign value either on host or on device */
+
+    if( DV::modified_host() >= DV::modified_device() ) {
+      set_functor_host f(DV::h_view,val);
+      parallel_for(n,f);
+      DV::t_host::execution_space::fence();
+      DV::modified_host()++;
+    } else {
+      set_functor f(DV::d_view,val);
+      parallel_for(n,f);
+      DV::t_dev::execution_space::fence();
+      DV::modified_device()++;
+    }
+  }
+
+  void reserve(size_t n) {
+    DV::resize(size_t (n*_extra_storage));
+  }
+
+  void push_back(Scalar val) {
+    DV::modified_host()++;
+    if(_size == capacity()) {
+      size_t new_size = _size*_extra_storage;
+      if(new_size == _size) new_size++;
+      DV::resize(new_size);
+    }
+
+    DV::h_view(_size) = val;
+    _size++;
+
+  }
+
+  void pop_back() {
+    _size--;
+  }
+
+  void clear() {
+    _size = 0;
+  }
+
+  size_type size() const {return _size;}
+  size_type max_size() const {return 2000000000;}
+  size_type capacity() const {return DV::capacity();}
+  bool empty() const {return _size==0;}
+
+  iterator begin() const {return &DV::h_view(0);}
+
+  iterator end() const {return &DV::h_view(_size);}
+
+  reference front() {return DV::h_view(0);}
+
+  reference back() {return DV::h_view(_size - 1);}
+
+  const_reference front() const {return DV::h_view(0);}
+
+  const_reference back() const {return DV::h_view(_size - 1);}
+
+  /* std:: algorithms which normally work with iterators, here implemented as member functions */
+
+  size_t
+  lower_bound (const size_t& start,
+               const size_t& theEnd,
+               const Scalar& comp_val) const
+  {
+    int lower = start; // FIXME (mfh 24 Apr 2014) narrowing conversion
+    int upper = _size > theEnd? theEnd : _size-1; // FIXME (mfh 24 Apr 2014) narrowing conversion
+    if (upper <= lower) {
+      return theEnd;
+    }
+
+    Scalar lower_val = DV::h_view(lower);
+    Scalar upper_val = DV::h_view(upper);
+    size_t idx = (upper+lower)/2;
+    Scalar val = DV::h_view(idx);
+    if(val>upper_val) return upper;
+    if(val<lower_val) return start;
+
+    while(upper>lower) {
+      if(comp_val>val) {
+        lower = ++idx;
+      } else {
+        upper = idx;
+      }
+      idx = (upper+lower)/2;
+      val = DV::h_view(idx);
+    }
+    return idx;
+  }
+
+  bool is_sorted() {
+    for(size_t i=0; i+1<_size; i++) {
+      if(DV::h_view(i)>DV::h_view(i+1)) return false;
+    }
+    return true;
+  }
+
+  iterator find(Scalar val) const {
+    if(_size == 0) return end();
+
+    int upper,lower,current;
+    current = _size/2;
+    upper = _size-1;
+    lower = 0;
+
+    if((val<DV::h_view(0)) || (val>DV::h_view(_size-1)) ) return end();
+
+    while(upper>lower)
+    {
+      if(val>DV::h_view(current)) lower = current+1;
+      else upper = current;
+      current = (upper+lower)/2;
+    }
+
+    if(val==DV::h_view(current)) return &DV::h_view(current);
+    else return end();
+  }
+
+  /* Additional functions for data management */
+
+  void device_to_host(){
+    deep_copy(DV::h_view,DV::d_view);
+  }
+  void host_to_device() const {
+    deep_copy(DV::d_view,DV::h_view);
+  }
+
+  void on_host() {
+    DV::modified_host() = DV::modified_device() + 1;
+  }
+  void on_device() {
+    DV::modified_device() = DV::modified_host() + 1;
+  }
+
+  void set_overallocation(float extra) {
+    _extra_storage = 1.0 + extra;
+  }
+
+
+public:
+  struct set_functor {
+    typedef typename DV::t_dev::execution_space execution_space;
+    typename DV::t_dev _data;
+    Scalar _val;
+
+    set_functor(typename DV::t_dev data, Scalar val) :
+      _data(data),_val(val) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int &i) const {
+      _data(i) = _val;
+    }
+  };
+
+  struct set_functor_host {
+    typedef typename DV::t_host::execution_space execution_space;
+    typename DV::t_host _data;
+    Scalar _val;
+
+    set_functor_host(typename DV::t_host data, Scalar val) :
+      _data(data),_val(val) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int &i) const {
+      _data(i) = _val;
+    }
+  };
+
+};
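+
+/* A minimal usage sketch, assuming a default execution space with a
+ * host-accessible mirror; names below are illustrative only:
+ *
+ *   Kokkos::vector<double> v(100, 0.0);  // 100 entries initialized on the host
+ *   v.push_back(3.14);                   // host-only; may reallocate
+ *   v.host_to_device();                  // sync the device view before kernels use it
+ *   // ... run kernels that read v.d_view ...
+ *   v.device_to_host();                  // sync back before reading on the host
+ *   double last = v.back();
+ */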
+
+
+}
+#endif
+
diff --git a/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7ea2e102cec501bd88a3d872c280c84380aedce0
--- /dev/null
+++ b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
@@ -0,0 +1,109 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BITSET_IMPL_HPP
+#define KOKKOS_BITSET_IMPL_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_BitOps.hpp>
+#include <cstdint>
+
+#include <cstdio>
+#include <climits>
+#include <iostream>
+#include <iomanip>
+
+namespace Kokkos {
+namespace Impl {
+
+KOKKOS_FORCEINLINE_FUNCTION
+unsigned rotate_right( unsigned i, int r )
+{
+  enum { size = static_cast<int>( sizeof(unsigned) * CHAR_BIT ) };
+  return r ? ( ( i >> r ) | ( i << ( size - r ) ) ) : i ;
+}
+
+template < typename Bitset >
+struct BitsetCount
+{
+  typedef Bitset                                                  bitset_type;
+  typedef typename bitset_type::execution_space::execution_space  execution_space;
+  typedef typename bitset_type::size_type                         size_type;
+  typedef size_type                                               value_type;
+
+  bitset_type m_bitset;
+
+  BitsetCount( bitset_type const& bitset )
+    : m_bitset(bitset)
+  {}
+
+  size_type apply() const
+  {
+    size_type count = 0u;
+    parallel_reduce( m_bitset.m_blocks.extent(0), *this, count );
+    return count;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & count ) const
+  {
+    count = 0u;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & count, const volatile size_type & incr ) const
+  {
+    count += incr;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i, value_type & count ) const
+  {
+    count += bit_count( m_bitset.m_blocks[i] );
+  }
+};
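+
+/* A sketch of how this reduction is intended to be driven (for example from
+ * Bitset<Device>::count()); the bitset name is a placeholder:
+ *
+ *   BitsetCount< Kokkos::Bitset<Device> > count_functor(bitset);
+ *   size_type num_set = count_functor.apply();  // parallel_reduce over the blocks
+ */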
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif // KOKKOS_BITSET_IMPL_HPP
diff --git a/packages/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b17dcce4b487aa371bfa1a311f562d7ad402c24
--- /dev/null
+++ b/packages/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
@@ -0,0 +1,196 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_FUNCTIONAL_IMPL_HPP
+#define KOKKOS_FUNCTIONAL_IMPL_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <cstdint>
+
+namespace Kokkos { namespace Impl {
+
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+KOKKOS_FORCEINLINE_FUNCTION
+uint32_t getblock32 ( const uint8_t * p, int i )
+{
+// Assemble the 32-bit block byte-by-byte to avoid aliasing violations,
+// which could otherwise cause errors with forced inlining.
+  return    ((uint32_t)p[i*4+0])
+          | ((uint32_t)p[i*4+1] << 8)
+          | ((uint32_t)p[i*4+2] << 16)
+          | ((uint32_t)p[i*4+3] << 24);
+}
+
+KOKKOS_FORCEINLINE_FUNCTION
+uint32_t rotl32 ( uint32_t x, int8_t r )
+{ return (x << r) | (x >> (32 - r)); }
+
+KOKKOS_FORCEINLINE_FUNCTION
+uint32_t fmix32 ( uint32_t h )
+{
+  h ^= h >> 16;
+  h *= 0x85ebca6b;
+  h ^= h >> 13;
+  h *= 0xc2b2ae35;
+  h ^= h >> 16;
+
+  return h;
+}
+
+KOKKOS_INLINE_FUNCTION
+uint32_t MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed )
+{
+  const uint8_t * data = (const uint8_t*)key;
+  const int nblocks = len / 4;
+
+  uint32_t h1 = seed;
+
+  const uint32_t c1 = 0xcc9e2d51;
+  const uint32_t c2 = 0x1b873593;
+
+  //----------
+  // body
+
+  for(int i=0; i<nblocks; ++i)
+  {
+    uint32_t k1 = getblock32(data,i);
+
+    k1 *= c1;
+    k1 = rotl32(k1,15);
+    k1 *= c2;
+
+    h1 ^= k1;
+    h1 = rotl32(h1,13);
+    h1 = h1*5+0xe6546b64;
+  }
+
+  //----------
+  // tail
+
+  const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
+
+  uint32_t k1 = 0;
+
+  switch(len & 3)
+  {
+  case 3: k1 ^= tail[2] << 16;
+  case 2: k1 ^= tail[1] << 8;
+  case 1: k1 ^= tail[0];
+          k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1;
+  };
+
+  //----------
+  // finalization
+
+  h1 ^= len;
+
+  h1 = fmix32(h1);
+
+  return h1;
+}
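+
+/* A minimal sketch of hashing a POD key with this routine; the key and seed
+ * are arbitrary:
+ *
+ *   int key = 42;
+ *   uint32_t h = MurmurHash3_x86_32(&key, sizeof(key), 0u);
+ *   // the default bitwise hashers in Kokkos_Functional.hpp are built on this routine
+ */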
+
+
+#if defined( __GNUC__ ) /* GNU C   */ || \
+    defined( __GNUG__ ) /* GNU C++ */ || \
+    defined( __clang__ )
+
+#define KOKKOS_IMPL_MAY_ALIAS __attribute__((__may_alias__))
+
+#else
+
+#define KOKKOS_IMPL_MAY_ALIAS
+
+#endif
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+bool bitwise_equal(T const * const a_ptr, T const * const b_ptr)
+{
+  typedef uint64_t KOKKOS_IMPL_MAY_ALIAS T64;
+  typedef uint32_t KOKKOS_IMPL_MAY_ALIAS T32;
+  typedef uint16_t KOKKOS_IMPL_MAY_ALIAS T16;
+  typedef uint8_t  KOKKOS_IMPL_MAY_ALIAS T8;
+
+  enum {
+    NUM_8  = sizeof(T),
+    NUM_16 = NUM_8 / 2,
+    NUM_32 = NUM_8 / 4,
+    NUM_64 = NUM_8 / 8
+  };
+
+  union {
+    T   const * const ptr;
+    T64 const * const ptr64;
+    T32 const * const ptr32;
+    T16 const * const ptr16;
+    T8  const * const ptr8;
+  } a = {a_ptr}, b = {b_ptr};
+
+  bool result = true;
+
+  for (int i=0; i < NUM_64; ++i) {
+    result = result && a.ptr64[i] == b.ptr64[i];
+  }
+
+  if ( NUM_64*2 < NUM_32 ) {
+    result = result && a.ptr32[NUM_64*2] == b.ptr32[NUM_64*2];
+  }
+
+  if ( NUM_32*2 < NUM_16 ) {
+    result = result && a.ptr16[NUM_32*2] == b.ptr16[NUM_32*2];
+  }
+
+  if ( NUM_16*2 < NUM_8 ) {
+    result = result && a.ptr8[NUM_16*2] == b.ptr8[NUM_16*2];
+  }
+
+  return result;
+}
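+
+/* A sketch of comparing two POD keys byte-by-byte; the struct is illustrative
+ * and presumably mirrors how the default bitwise equality functor uses this:
+ *
+ *   struct Coord { int x, y; };
+ *   Coord a = {1, 2}, b = {1, 2};
+ *   bool same = bitwise_equal(&a, &b);  // true: all bytes compare equal
+ */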
+
+
+
+#undef KOKKOS_IMPL_MAY_ALIAS
+
+}} // namespace Kokkos::Impl
+
+#endif //KOKKOS_FUNCTIONAL_IMPL_HPP
+
diff --git a/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp b/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..91fbaaf0113e51883b63bf569d31649f22f42ee5
--- /dev/null
+++ b/packages/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp
@@ -0,0 +1,210 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP
+#define KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
+inline
+typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view ,
+                    typename Impl::enable_if< ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 )
+{
+  return view ;
+}
+
+template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
+inline
+typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view )
+{
+  // Force copy:
+  //typedef Impl::ViewAssignment< Impl::ViewDefault > alloc ; // unused
+  typedef StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >  staticcrsgraph_type ;
+
+  typename staticcrsgraph_type::HostMirror               tmp ;
+  typename staticcrsgraph_type::row_map_type::HostMirror tmp_row_map = create_mirror( view.row_map);
+  typename staticcrsgraph_type::row_block_type::HostMirror tmp_row_block_offsets = create_mirror( view.row_block_offsets);
+
+  // Allocation to match:
+  tmp.row_map = tmp_row_map ; // Assignment of 'const' from 'non-const'
+  tmp.entries = create_mirror( view.entries );
+  tmp.row_block_offsets = tmp_row_block_offsets ; // Assignment of 'const' from 'non-const'
+
+  // Deep copy:
+  deep_copy( tmp_row_map , view.row_map );
+  deep_copy( tmp.entries , view.entries );
+  deep_copy( tmp_row_block_offsets , view.row_block_offsets );
+
+  return tmp ;
+}
+
+template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
+inline
+typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view ,
+                    typename Impl::enable_if< ! ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 )
+{
+  return create_mirror( view );
+}
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< class StaticCrsGraphType , class InputSizeType >
+inline
+typename StaticCrsGraphType::staticcrsgraph_type
+create_staticcrsgraph( const std::string & label ,
+                 const std::vector< InputSizeType > & input )
+{
+  typedef StaticCrsGraphType                  output_type ;
+  //typedef std::vector< InputSizeType >  input_type ; // unused
+
+  typedef typename output_type::entries_type   entries_type ;
+
+  typedef View< typename output_type::size_type [] ,
+                typename output_type::array_layout ,
+                typename output_type::execution_space > work_type ;
+
+  output_type output ;
+
+  // Create the row map:
+
+  const size_t length = input.size();
+
+  {
+    work_type row_work( "tmp" , length + 1 );
+
+    typename work_type::HostMirror row_work_host =
+      create_mirror_view( row_work );
+
+    size_t sum = 0 ;
+    row_work_host[0] = 0 ;
+    for ( size_t i = 0 ; i < length ; ++i ) {
+      row_work_host[i+1] = sum += input[i];
+    }
+
+    deep_copy( row_work , row_work_host );
+
+    output.entries   = entries_type( label , sum );
+    output.row_map   = row_work ;
+  }
+
+  return output ;
+}
+
+//----------------------------------------------------------------------------
+
+template< class StaticCrsGraphType , class InputSizeType >
+inline
+typename StaticCrsGraphType::staticcrsgraph_type
+create_staticcrsgraph( const std::string & label ,
+                 const std::vector< std::vector< InputSizeType > > & input )
+{
+  typedef StaticCrsGraphType                  output_type ;
+  typedef typename output_type::entries_type  entries_type ;
+
+  static_assert( entries_type::rank == 1
+               , "Graph entries view must be rank one" );
+
+  typedef View< typename output_type::size_type [] ,
+                typename output_type::array_layout ,
+                typename output_type::execution_space > work_type ;
+
+  output_type output ;
+
+    // Create the row map:
+
+  const size_t length = input.size();
+
+  {
+    work_type row_work( "tmp" , length + 1 );
+
+    typename work_type::HostMirror row_work_host =
+      create_mirror_view( row_work );
+
+    size_t sum = 0 ;
+    row_work_host[0] = 0 ;
+    for ( size_t i = 0 ; i < length ; ++i ) {
+      row_work_host[i+1] = sum += input[i].size();
+    }
+
+    deep_copy( row_work , row_work_host );
+
+    output.entries   = entries_type( label , sum );
+    output.row_map   = row_work ;
+  }
+
+  // Fill in the entries:
+  {
+    typename entries_type::HostMirror host_entries =
+      create_mirror_view( output.entries );
+
+    size_t sum = 0 ;
+    for ( size_t i = 0 ; i < length ; ++i ) {
+      for ( size_t j = 0 ; j < input[i].size() ; ++j , ++sum ) {
+        host_entries( sum ) = input[i][j] ;
+      }
+    }
+
+    deep_copy( output.entries , host_entries );
+  }
+
+  return output ;
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP */
+
diff --git a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..88ca200daf3004fb80d08c19a304ac8e7dfef9e7
--- /dev/null
+++ b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
@@ -0,0 +1,101 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_UnorderedMap.hpp>
+
+namespace Kokkos { namespace Impl {
+
+uint32_t find_hash_size(uint32_t size)
+{
+  if (size == 0u) return 0u;
+
+  // these primes try to preserve randomness of hash
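+  // The loop below returns the smallest tabulated prime >= size (for example,
+  // find_hash_size(1000) yields 1543); requests larger than the table fall
+  // back to the largest listed prime.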
+  static const uint32_t primes [] = {
+        3, 7, 13, 23, 53, 97, 193, 389, 769, 1543
+      , 2237, 2423, 2617, 2797, 2999, 3167, 3359, 3539
+      , 3727, 3911, 4441 , 4787 , 5119 , 5471 , 5801 , 6143 , 6521 , 6827
+      , 7177 , 7517 , 7853 , 8887 , 9587 , 10243 , 10937 , 11617 , 12289
+      , 12967 , 13649 , 14341 , 15013 , 15727
+      , 17749 , 19121 , 20479 , 21859 , 23209 , 24593 , 25939 , 27329
+      , 28669 , 30047 , 31469 , 35507 , 38231 , 40961 , 43711 , 46439
+      , 49157 , 51893 , 54617 , 57347 , 60077 , 62801 , 70583 , 75619
+      , 80669 , 85703 , 90749 , 95783 , 100823 , 105871 , 110909 , 115963
+      , 120997 , 126031 , 141157 , 151237 , 161323 , 171401 , 181499 , 191579
+      , 201653 , 211741 , 221813 , 231893 , 241979 , 252079
+      , 282311 , 302483 , 322649 , 342803 , 362969 , 383143 , 403301 , 423457
+      , 443629 , 463787 , 483953 , 504121 , 564617 , 604949 , 645313 , 685609
+      , 725939 , 766273 , 806609 , 846931 , 887261 , 927587 , 967919 , 1008239
+      , 1123477 , 1198397 , 1273289 , 1348177 , 1423067 , 1497983 , 1572869
+      , 1647761 , 1722667 , 1797581 , 1872461 , 1947359 , 2022253
+      , 2246953 , 2396759 , 2546543 , 2696363 , 2846161 , 2995973 , 3145739
+      , 3295541 , 3445357 , 3595117 , 3744941 , 3894707 , 4044503
+      , 4493921 , 4793501 , 5093089 , 5392679 , 5692279 , 5991883 , 6291469
+      , 6591059 , 6890641 , 7190243 , 7489829 , 7789447 , 8089033
+      , 8987807 , 9586981 , 10186177 , 10785371 , 11384539 , 11983729
+      , 12582917 , 13182109 , 13781291 , 14380469 , 14979667 , 15578861
+      , 16178053 , 17895707 , 19014187 , 20132683 , 21251141 , 22369661
+      , 23488103 , 24606583 , 25725083 , 26843549 , 27962027 , 29080529
+      , 30198989 , 31317469 , 32435981 , 35791397 , 38028379 , 40265327
+      , 42502283 , 44739259 , 46976221 , 49213237 , 51450131 , 53687099
+      , 55924061 , 58161041 , 60397993 , 62634959 , 64871921
+      , 71582857 , 76056727 , 80530643 , 85004567 , 89478503 , 93952427
+      , 98426347 , 102900263 , 107374217 , 111848111 , 116322053 , 120795971
+      , 125269877 , 129743807 , 143165587 , 152113427 , 161061283 , 170009141
+      , 178956983 , 187904819 , 196852693 , 205800547 , 214748383 , 223696237
+      , 232644089 , 241591943 , 250539763 , 259487603 , 268435399
+  };
+
+  const uint32_t num_primes = sizeof(primes)/sizeof(uint32_t);
+
+  uint32_t hsize = primes[num_primes-1] ;
+  for (uint32_t i = 0; i < num_primes; ++i) {
+    if (size <= primes[i]) {
+      hsize = primes[i];
+      break;
+    }
+  }
+  return hsize;
+}
+
+}} // namespace Kokkos::Impl
+
diff --git a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f97c1fc0468f7049b03770749ae678270d812e1d
--- /dev/null
+++ b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
@@ -0,0 +1,298 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_UNORDERED_MAP_IMPL_HPP
+#define KOKKOS_UNORDERED_MAP_IMPL_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <cstdint>
+
+#include <cstdio>
+#include <climits>
+#include <iostream>
+#include <iomanip>
+
+namespace Kokkos { namespace Impl {
+
+uint32_t find_hash_size( uint32_t size );
+
+template <typename Map>
+struct UnorderedMapRehash
+{
+  typedef Map map_type;
+  typedef typename map_type::const_map_type const_map_type;
+  typedef typename map_type::execution_space execution_space;
+  typedef typename map_type::size_type size_type;
+
+  map_type       m_dst;
+  const_map_type m_src;
+
+  UnorderedMapRehash( map_type const& dst, const_map_type const& src)
+    : m_dst(dst), m_src(src)
+  {}
+
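+  // Re-insert every valid (key, value) pair of the source map into the
+  // destination map, one source slot per thread (see operator() below).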
+  void apply() const
+  {
+    parallel_for(m_src.capacity(), *this);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i) const
+  {
+    if ( m_src.valid_at(i) )
+      m_dst.insert(m_src.key_at(i), m_src.value_at(i));
+  }
+
+};
+
+template <typename UMap>
+struct UnorderedMapErase
+{
+  typedef UMap map_type;
+  typedef typename map_type::execution_space execution_space;
+  typedef typename map_type::size_type size_type;
+  typedef typename map_type::key_type key_type;
+  typedef typename map_type::impl_value_type value_type;
+
+  map_type m_map;
+
+  UnorderedMapErase( map_type const& map)
+    : m_map(map)
+  {}
+
+  void apply() const
+  {
+    parallel_for(m_map.m_hash_lists.extent(0), *this);
+  }
+
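+  // Each thread compacts one hash bucket: erased (invalid) nodes are unlinked
+  // from the bucket's singly linked list and their key/value slots are reset
+  // to default-constructed values.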
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i ) const
+  {
+    const size_type invalid_index = map_type::invalid_index;
+
+    size_type curr = m_map.m_hash_lists(i);
+    size_type next = invalid_index;
+
+    // remove erased head of the linked-list
+    while (curr != invalid_index && !m_map.valid_at(curr)) {
+      next = m_map.m_next_index[curr];
+      m_map.m_next_index[curr] = invalid_index;
+      m_map.m_keys[curr] = key_type();
+      if (m_map.is_set) m_map.m_values[curr] = value_type();
+      curr = next;
+      m_map.m_hash_lists(i) = next;
+    }
+
+    // if the list is non-empty and the head is valid
+    if (curr != invalid_index && m_map.valid_at(curr) ) {
+      size_type prev = curr;
+      curr = m_map.m_next_index[prev];
+
+      while (curr != invalid_index) {
+        next = m_map.m_next_index[curr];
+        if (m_map.valid_at(curr)) {
+          prev = curr;
+        }
+        else {
+          // remove curr from list
+          m_map.m_next_index[prev] = next;
+          m_map.m_next_index[curr] = invalid_index;
+          m_map.m_keys[curr] = key_type();
+          if (map_type::is_set) m_map.m_values[curr] = value_type();
+        }
+        curr = next;
+      }
+    }
+  }
+};
+
+template <typename UMap>
+struct UnorderedMapHistogram
+{
+  typedef UMap map_type;
+  typedef typename map_type::execution_space execution_space;
+  typedef typename map_type::size_type size_type;
+
+  typedef View<int[100], execution_space> histogram_view;
+  typedef typename histogram_view::HostMirror host_histogram_view;
+
+  map_type m_map;
+  histogram_view m_length;
+  histogram_view m_distance;
+  histogram_view m_block_distance;
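+  // 100-bin histograms gathered per hash bucket: list length, index span of a
+  // bucket's entries, and that span measured in 32-entry blocks (a rough
+  // locality measure); values of 99 or more are clamped into the last bin.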
+
+  UnorderedMapHistogram( map_type const& map)
+    : m_map(map)
+    , m_length("UnorderedMap Histogram")
+    , m_distance("UnorderedMap Histogram")
+    , m_block_distance("UnorderedMap Histogram")
+  {}
+
+  void calculate()
+  {
+    parallel_for(m_map.m_hash_lists.extent(0), *this);
+  }
+
+  void clear()
+  {
+    Kokkos::deep_copy(m_length, 0);
+    Kokkos::deep_copy(m_distance, 0);
+    Kokkos::deep_copy(m_block_distance, 0);
+  }
+
+  void print_length(std::ostream &out)
+  {
+    host_histogram_view host_copy = create_mirror_view(m_length);
+    Kokkos::deep_copy(host_copy, m_length);
+
+    for (int i=0, size = host_copy.extent(0); i<size; ++i)
+    {
+      out << host_copy[i] << " , ";
+    }
+    out << "\b\b\b   " << std::endl;
+  }
+
+  void print_distance(std::ostream &out)
+  {
+    host_histogram_view host_copy = create_mirror_view(m_distance);
+    Kokkos::deep_copy(host_copy, m_distance);
+
+    for (int i=0, size = host_copy.extent(0); i<size; ++i)
+    {
+      out << host_copy[i] << " , ";
+    }
+    out << "\b\b\b   " << std::endl;
+  }
+
+  void print_block_distance(std::ostream &out)
+  {
+    host_histogram_view host_copy = create_mirror_view(m_block_distance);
+    Kokkos::deep_copy(host_copy, m_block_distance);
+
+    for (int i=0, size = host_copy.extent(0); i<size; ++i)
+    {
+      out << host_copy[i] << " , ";
+    }
+    out << "\b\b\b   " << std::endl;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i ) const
+  {
+    const size_type invalid_index = map_type::invalid_index;
+
+    uint32_t length = 0;
+    size_type min_index = ~0u, max_index = 0;
+    for (size_type curr = m_map.m_hash_lists(i); curr != invalid_index; curr = m_map.m_next_index[curr]) {
+      ++length;
+      min_index = (curr < min_index) ? curr : min_index;
+      max_index = (max_index < curr) ? curr : max_index;
+    }
+
+    size_type distance = (0u < length) ? max_index - min_index : 0u;
+    size_type blocks = (0u < length) ? max_index/32u - min_index/32u : 0u;
+
+    // normalize data
+    length   = length   < 100u ? length   : 99u;
+    distance = distance < 100u ? distance : 99u;
+    blocks   = blocks   < 100u ? blocks   : 99u;
+
+    if (0u < length)
+    {
+      atomic_fetch_add( &m_length(length), 1);
+      atomic_fetch_add( &m_distance(distance), 1);
+      atomic_fetch_add( &m_block_distance(blocks), 1);
+    }
+  }
+};
+
+template <typename UMap>
+struct UnorderedMapPrint
+{
+  typedef UMap map_type;
+  typedef typename map_type::execution_space execution_space;
+  typedef typename map_type::size_type size_type;
+
+  map_type m_map;
+
+  UnorderedMapPrint( map_type const& map)
+    : m_map(map)
+  {}
+
+  void apply()
+  {
+    parallel_for(m_map.m_hash_lists.extent(0), *this);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i ) const
+  {
+    const size_type invalid_index = map_type::invalid_index;
+
+    uint32_t list = m_map.m_hash_lists(i);
+    for (size_type curr = list, ii=0; curr != invalid_index; curr = m_map.m_next_index[curr], ++ii) {
+      printf("%d[%d]: %d->%d\n", list, ii, m_map.key_at(curr), m_map.value_at(curr));
+    }
+  }
+};
+
+template <typename DKey, typename DValue, typename SKey, typename SValue>
+struct UnorderedMapCanAssign : public false_ {};
+
+template <typename Key, typename Value>
+struct UnorderedMapCanAssign<Key,Value,Key,Value> : public true_ {};
+
+template <typename Key, typename Value>
+struct UnorderedMapCanAssign<const Key,Value,Key,Value> : public true_ {};
+
+template <typename Key, typename Value>
+struct UnorderedMapCanAssign<const Key,const Value,Key,Value> : public true_ {};
+
+template <typename Key, typename Value>
+struct UnorderedMapCanAssign<const Key,const Value,const Key,Value> : public true_ {};
+
+
+}} //Kokkos::Impl
+
+#endif // KOKKOS_UNORDERED_MAP_IMPL_HPP
+
diff --git a/packages/kokkos/containers/unit_tests/CMakeLists.txt b/packages/kokkos/containers/unit_tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1162d2a6ba00e5c5a7727990e6359392310b0add
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/CMakeLists.txt
@@ -0,0 +1,57 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
+
+IF(NOT KOKKOS_HAS_TRILINOS)
+  IF(KOKKOS_SEPARATE_LIBS)
+    set(TEST_LINK_TARGETS kokkoscore)
+  ELSE()
+    set(TEST_LINK_TARGETS kokkos)
+  ENDIF()
+ENDIF()
+
+IF(Kokkos_ENABLE_Pthread)
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_Threads
+  SOURCES TestThreads.cpp UnitTestMain.cpp
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+  )
+ENDIF()
+
+IF(Kokkos_ENABLE_Serial)
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_Serial
+  SOURCES TestSerial.cpp UnitTestMain.cpp
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+  )
+ENDIF()
+
+IF(Kokkos_ENABLE_OpenMP)
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_OpenMP
+  SOURCES TestOpenMP.cpp UnitTestMain.cpp
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+  )
+ENDIF()
+
+IF(Kokkos_ENABLE_Cuda)
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_Cuda
+  SOURCES TestCuda.cpp UnitTestMain.cpp
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+  )
+ENDIF()
+
diff --git a/packages/kokkos/containers/unit_tests/Makefile b/packages/kokkos/containers/unit_tests/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..2b6861f6d7ff905d1789a70fb8903be7787cf4ba
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/Makefile
@@ -0,0 +1,101 @@
+KOKKOS_PATH = ../..
+
+GTEST_PATH = ../../TPL/gtest
+
+vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests
+
+default: build_all
+	echo "End Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
+else
+  CXX = g++
+endif
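+
+# Example invocation (a sketch; the device list and resulting test targets
+# depend on how Kokkos was configured via Makefile.kokkos):
+#   make KOKKOS_DEVICES=OpenMP test-openmp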
+
+CXXFLAGS = -O3
+LINK ?= $(CXX)
+LDFLAGS ?=
+override LDFLAGS += -lpthread
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests
+
+TEST_TARGETS =
+TARGETS =
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosContainers_UnitTest_Cuda
+	TEST_TARGETS += test-cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+	OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosContainers_UnitTest_ROCm
+	TEST_TARGETS += test-rocm
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosContainers_UnitTest_Threads
+	TEST_TARGETS += test-threads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	OBJ_OPENMP = TestOpenMP.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosContainers_UnitTest_OpenMP
+	TEST_TARGETS += test-openmp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+	OBJ_SERIAL = TestSerial.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosContainers_UnitTest_Serial
+	TEST_TARGETS += test-serial
+endif
+
+KokkosContainers_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_Cuda
+
+KokkosContainers_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_ROCm
+
+KokkosContainers_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_Threads
+
+KokkosContainers_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_OpenMP
+
+KokkosContainers_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_Serial
+
+test-cuda: KokkosContainers_UnitTest_Cuda
+	./KokkosContainers_UnitTest_Cuda
+
+test-rocm: KokkosContainers_UnitTest_ROCm
+	./KokkosContainers_UnitTest_ROCm
+
+test-threads: KokkosContainers_UnitTest_Threads
+	./KokkosContainers_UnitTest_Threads
+
+test-openmp: KokkosContainers_UnitTest_OpenMP
+	./KokkosContainers_UnitTest_OpenMP
+
+test-serial: KokkosContainers_UnitTest_Serial
+	./KokkosContainers_UnitTest_Serial
+
+build_all: $(TARGETS)
+
+test: $(TEST_TARGETS)
+
+clean: kokkos-clean
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
+gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
diff --git a/packages/kokkos/containers/unit_tests/TestBitset.hpp b/packages/kokkos/containers/unit_tests/TestBitset.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bd83835250ad937a31e56c2ca0111bde4d0852b7
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestBitset.hpp
@@ -0,0 +1,285 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_BITSET_HPP
+#define KOKKOS_TEST_BITSET_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+
+
+namespace Test {
+
+namespace Impl {
+
+template <typename Bitset, bool Set>
+struct TestBitset
+{
+  typedef Bitset bitset_type;
+  typedef typename bitset_type::execution_space execution_space;
+  typedef uint32_t value_type;
+
+  bitset_type m_bitset;
+
+  TestBitset( bitset_type const& bitset)
+    : m_bitset(bitset)
+  {}
+
+  unsigned testit(unsigned collisions)
+  {
+    execution_space::fence();
+
+    unsigned count = 0;
+    Kokkos::parallel_reduce( m_bitset.size()*collisions, *this, count);
+    return count;
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const { v = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, const volatile value_type & src ) const
+  { dst += src; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(uint32_t i, value_type & v) const
+  {
+    i = i % m_bitset.size();
+    if (Set) {
+      if (m_bitset.set(i)) {
+        if (m_bitset.test(i)) ++v;
+      }
+    }
+    else {
+      if (m_bitset.reset(i)) {
+        if (!m_bitset.test(i)) ++v;
+      }
+    }
+  }
+
+};
+
+template <typename Bitset>
+struct TestBitsetTest
+{
+  typedef Bitset bitset_type;
+  typedef typename bitset_type::execution_space execution_space;
+  typedef uint32_t value_type;
+
+  bitset_type m_bitset;
+
+  TestBitsetTest( bitset_type const& bitset)
+    : m_bitset(bitset)
+  {}
+
+  unsigned testit()
+  {
+    execution_space::fence();
+
+    unsigned count = 0;
+    Kokkos::parallel_reduce( m_bitset.size(), *this, count);
+    return count;
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const { v = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, const volatile value_type & src ) const
+  { dst += src; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(uint32_t i, value_type & v) const
+  {
+    if (m_bitset.test( i )) ++v;
+  }
+};
+
+template <typename Bitset, bool Set>
+struct TestBitsetAny
+{
+  typedef Bitset bitset_type;
+  typedef typename bitset_type::execution_space execution_space;
+  typedef uint32_t value_type;
+
+  bitset_type m_bitset;
+
+  TestBitsetAny( bitset_type const& bitset)
+    : m_bitset(bitset)
+  {}
+
+  unsigned testit()
+  {
+    execution_space::fence();
+
+    unsigned count = 0;
+    Kokkos::parallel_reduce( m_bitset.size(), *this, count);
+    return count;
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const { v = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, const volatile value_type & src ) const
+  { dst += src; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(uint32_t i, value_type & v) const
+  {
+    bool result = false;
+    unsigned attempts = 0;
+    uint32_t hint = (i >> 4) << 4;
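+    // find_any_{set,unset}_near is assumed here to return a (found, index)
+    // pair, which Kokkos::tie unpacks into result and hint.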
+    while (attempts < m_bitset.max_hint()) {
+      if (Set) {
+        Kokkos::tie(result, hint) = m_bitset.find_any_unset_near(hint, i);
+        if (result && m_bitset.set(hint)) {
+          ++v;
+          break;
+        }
+        else if (!result) {
+          ++attempts;
+        }
+      }
+      else {
+        Kokkos::tie(result, hint) = m_bitset.find_any_set_near(hint, i);
+        if (result && m_bitset.reset(hint)) {
+          ++v;
+          break;
+        }
+        else if (!result) {
+          ++attempts;
+        }
+      }
+    }
+  }
+
+};
+} // namespace Impl
+
+
+
+template <typename Device>
+void test_bitset()
+{
+  typedef Kokkos::Bitset< Device > bitset_type;
+  typedef Kokkos::ConstBitset< Device > const_bitset_type;
+
+  //unsigned test_sizes[] = { 0u, 1000u, 1u<<14, 1u<<16, 10000001 };
+  unsigned test_sizes[] = { 1000u, 1u<<14, 1u<<16, 10000001 };
+
+  for (int i=0, end = sizeof(test_sizes)/sizeof(unsigned); i<end; ++i) {
+
+    //std::cout << "Bitset " << test_sizes[i] << std::endl;
+
+    bitset_type bitset(test_sizes[i]);
+
+    //std::cout << "  Check initial count " << std::endl;
+    // nothing should be set
+    {
+      Impl::TestBitsetTest< bitset_type > f(bitset);
+      uint32_t count = f.testit();
+      EXPECT_EQ(0u, count);
+      EXPECT_EQ(count, bitset.count());
+    }
+
+    //std::cout << "  Check set() " << std::endl;
+    bitset.set();
+    // everything should be set
+    {
+      Impl::TestBitsetTest< const_bitset_type > f(bitset);
+      uint32_t count = f.testit();
+      EXPECT_EQ(bitset.size(), count);
+      EXPECT_EQ(count, bitset.count());
+    }
+
+    //std::cout << "  Check reset() " << std::endl;
+    bitset.reset();
+    EXPECT_EQ(0u, bitset.count());
+
+    //std::cout << "  Check set(i) " << std::endl;
+    // test setting bits
+    {
+      Impl::TestBitset< bitset_type, true > f(bitset);
+      uint32_t count = f.testit(10u);
+      EXPECT_EQ( bitset.size(), bitset.count());
+      EXPECT_EQ( bitset.size(), count );
+    }
+
+    //std::cout << "  Check reset(i) " << std::endl;
+    // test resetting bits
+    {
+      Impl::TestBitset< bitset_type, false > f(bitset);
+      uint32_t count = f.testit(10u);
+      EXPECT_EQ( bitset.size(), count);
+      EXPECT_EQ( 0u, bitset.count() );
+    }
+
+
+    //std::cout << "  Check find_any_set(i) " << std::endl;
+    // test setting any bits
+    {
+      Impl::TestBitsetAny< bitset_type, true > f(bitset);
+      uint32_t count = f.testit();
+      EXPECT_EQ( bitset.size(), bitset.count());
+      EXPECT_EQ( bitset.size(), count );
+    }
+
+    //std::cout << "  Check find_any_unset(i) " << std::endl;
+    // test resetting any bits
+    {
+      Impl::TestBitsetAny< bitset_type, false > f(bitset);
+      uint32_t count = f.testit();
+      EXPECT_EQ( bitset.size(), count);
+      EXPECT_EQ( 0u, bitset.count() );
+    }
+
+  }
+
+}
+
+} // namespace Test
+
+#endif //KOKKOS_TEST_BITSET_HPP
+
diff --git a/packages/kokkos/containers/unit_tests/TestCuda.cpp b/packages/kokkos/containers/unit_tests/TestCuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..83498daef57724ddb433bad8f08a6c90bf4282cf
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestCuda.cpp
@@ -0,0 +1,256 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <iostream>
+#include <iomanip>
+#include <cstdint>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_Bitset.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+#include <Kokkos_Vector.hpp>
+
+#include <TestBitset.hpp>
+#include <TestUnorderedMap.hpp>
+#include <TestStaticCrsGraph.hpp>
+#include <TestVector.hpp>
+#include <TestDualView.hpp>
+#include <TestDynamicView.hpp>
+#include <TestScatterView.hpp>
+
+#include <Kokkos_DynRankView.hpp>
+#include <TestDynViewAPI.hpp>
+
+#include <Kokkos_ErrorReporter.hpp>
+#include <TestErrorReporter.hpp>
+
+#include <TestViewCtorPropEmbeddedDim.hpp>
+
+//----------------------------------------------------------------------------
+
+
+
+namespace Test {
+
+class cuda : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+  }
+  static void TearDownTestCase()
+  {
+  }
+};
+
+TEST_F( cuda , dyn_view_api) {
+  TestDynViewAPI< double , Kokkos::Cuda >();
+}
+
+TEST_F( cuda, viewctorprop_embedded_dim ) {
+  TestViewCtorProp_EmbeddedDim< Kokkos::Cuda >::test_vcpt( 2, 3 );
+}
+
+TEST_F( cuda , staticcrsgraph )
+{
+  TestStaticCrsGraph::run_test_graph< Kokkos::Cuda >();
+  TestStaticCrsGraph::run_test_graph2< Kokkos::Cuda >();
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Cuda >(1, 0);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Cuda >(1, 1000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Cuda >(1, 10000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Cuda >(1, 100000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Cuda >(3, 0);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Cuda >(3, 1000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Cuda >(3, 10000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Cuda >(3, 100000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Cuda >(75, 0);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Cuda >(75, 1000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Cuda >(75, 10000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Cuda >(75, 100000);
+}
+
+
+void cuda_test_insert_close(  uint32_t num_nodes
+                            , uint32_t num_inserts
+                            , uint32_t num_duplicates
+                           )
+{
+  test_insert< Kokkos::Cuda >( num_nodes, num_inserts, num_duplicates, true);
+}
+
+void cuda_test_insert_far(  uint32_t num_nodes
+                          , uint32_t num_inserts
+                          , uint32_t num_duplicates
+                         )
+{
+  test_insert< Kokkos::Cuda >( num_nodes, num_inserts, num_duplicates, false);
+}
+
+void cuda_test_failed_insert(  uint32_t num_nodes )
+{
+  test_failed_insert< Kokkos::Cuda >( num_nodes );
+}
+
+void cuda_test_deep_copy(  uint32_t num_nodes )
+{
+  test_deep_copy< Kokkos::Cuda >( num_nodes );
+}
+
+void cuda_test_vector_combinations(unsigned int size)
+{
+  test_vector_combinations<int,Kokkos::Cuda>(size);
+}
+
+void cuda_test_dualview_combinations(unsigned int size)
+{
+  test_dualview_combinations<int,Kokkos::Cuda>(size);
+}
+
+void cuda_test_bitset()
+{
+  test_bitset<Kokkos::Cuda>();
+}
+
+
+
+/*TEST_F( cuda, bitset )
+{
+  cuda_test_bitset();
+}*/
+
+#define CUDA_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat )                                \
+  TEST_F( cuda, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) {   \
+    for (int i=0; i<repeat; ++i)                                                                                \
+      cuda_test_insert_##name(num_nodes,num_inserts,num_duplicates);                                            \
+  }
+
+#define CUDA_FAILED_INSERT_TEST( num_nodes, repeat )                           \
+  TEST_F( cuda, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      cuda_test_failed_insert(num_nodes);                                      \
+  }
+
+#define CUDA_ASSIGNMENT_TEST( num_nodes, repeat )                                \
+  TEST_F( cuda, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) {  \
+    for (int i=0; i<repeat; ++i)                                                 \
+      cuda_test_assignment_operators(num_nodes);                                 \
+  }
+
+#define CUDA_DEEP_COPY( num_nodes, repeat )                             \
+  TEST_F( cuda, UnorderedMap_deep_copy##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      cuda_test_deep_copy(num_nodes);                     \
+  }
+
+#define CUDA_VECTOR_COMBINE_TEST( size )                             \
+  TEST_F( cuda, vector_combination##size##x) {       \
+      cuda_test_vector_combinations(size);                     \
+  }
+
+#define CUDA_DUALVIEW_COMBINE_TEST( size )                             \
+  TEST_F( cuda, dualview_combination##size##x) {       \
+      cuda_test_dualview_combinations(size);                     \
+  }
+
+#define CUDA_SCATTERVIEW_TEST( size )             \
+  TEST_F( cuda, scatterview_##size##x) {                      \
+    test_scatter_view<Kokkos::Cuda>(size);               \
+  }
+
+CUDA_DUALVIEW_COMBINE_TEST( 10 )
+CUDA_VECTOR_COMBINE_TEST( 10 )
+CUDA_VECTOR_COMBINE_TEST( 3057 )
+
+CUDA_SCATTERVIEW_TEST( 10 )
+
+CUDA_SCATTERVIEW_TEST( 1000000 )
+
+CUDA_INSERT_TEST(close,               100000, 90000, 100, 500)
+CUDA_INSERT_TEST(far,                 100000, 90000, 100, 500)
+CUDA_DEEP_COPY( 10000, 1 )
+CUDA_FAILED_INSERT_TEST( 10000, 1000 )
+
+
+#undef CUDA_INSERT_TEST
+#undef CUDA_FAILED_INSERT_TEST
+#undef CUDA_ASSIGNMENT_TEST
+#undef CUDA_DEEP_COPY
+#undef CUDA_VECTOR_COMBINE_TEST
+#undef CUDA_DUALVIEW_COMBINE_TEST
+
+
+TEST_F( cuda , dynamic_view )
+{
+  typedef TestDynamicView< double , Kokkos::CudaUVMSpace >
+    TestDynView ;
+
+  for ( int i = 0 ; i < 10 ; ++i ) {
+    TestDynView::run( 100000 + 100 * i );
+  }
+}
+
+
+#if defined(KOKKOS_CLASS_LAMBDA)
+TEST_F(cuda, ErrorReporterViaLambda)
+{
+  TestErrorReporter<ErrorReporterDriverUseLambda<Kokkos::Cuda>>();
+}
+#endif
+
+TEST_F(cuda, ErrorReporter)
+{
+  TestErrorReporter<ErrorReporterDriver<Kokkos::Cuda>>();
+}
+
+}
+
+#else
+void KOKKOS_CONTAINERS_UNIT_TESTS_TESTCUDA_PREVENT_EMPTY_LINK_ERROR() {}
+#endif  /* #ifdef KOKKOS_ENABLE_CUDA */
+
diff --git a/packages/kokkos/containers/unit_tests/TestDualView.hpp b/packages/kokkos/containers/unit_tests/TestDualView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..613a096b65f1853cff3bdcf6c49f90d3ae33e99d
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestDualView.hpp
@@ -0,0 +1,122 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_DUALVIEW_HPP
+#define KOKKOS_TEST_DUALVIEW_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+#include <impl/Kokkos_Timer.hpp>
+
+namespace Test {
+
+namespace Impl {
+
+  template <typename Scalar, class Device>
+  struct test_dualview_combinations
+  {
+    typedef test_dualview_combinations<Scalar,Device> self_type;
+
+    typedef Scalar scalar_type;
+    typedef Device execution_space;
+
+    Scalar reference;
+    Scalar result;
+
+    template <typename ViewType>
+    Scalar run_me(unsigned int n,unsigned int m){
+      if(n<10) n = 10;
+      if(m<3) m = 3;
+      ViewType a("A",n,m);
+
+      Kokkos::deep_copy( a.d_view , 1 );
+
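+      // d_view was just filled on the device; mark it modified and sync so
+      // the host mirror h_view sees the ones before it is written below.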
+      a.template modify<typename ViewType::execution_space>();
+      a.template sync<typename ViewType::host_mirror_space>();
+
+      a.h_view(5,1) = 3;
+      a.h_view(6,1) = 4;
+      a.h_view(7,2) = 5;
+      a.template modify<typename ViewType::host_mirror_space>();
+      ViewType b = Kokkos::subview(a,std::pair<unsigned int, unsigned int>(6,9),std::pair<unsigned int, unsigned int>(0,1));
+      a.template sync<typename ViewType::execution_space>();
+      b.template modify<typename ViewType::execution_space>();
+
+      Kokkos::deep_copy( b.d_view , 2 );
+
+      a.template sync<typename ViewType::host_mirror_space>();
+      Scalar count = 0;
+      for(unsigned int i = 0; i<a.d_view.extent(0); i++)
+        for(unsigned int j = 0; j<a.d_view.extent(1); j++)
+          count += a.h_view(i,j);
+      return count -  a.d_view.extent(0)*a.d_view.extent(1)-2-4-3*2;
+    }
+
+
+    test_dualview_combinations(unsigned int size)
+    {
+      result = run_me< Kokkos::DualView<Scalar**,Kokkos::LayoutLeft,Device> >(size,3);
+    }
+
+   };
+
+} // namespace Impl
+
+
+
+
+template <typename Scalar, typename Device>
+void test_dualview_combinations(unsigned int size)
+{
+  Impl::test_dualview_combinations<Scalar,Device> test(size);
+  ASSERT_EQ( test.result,0);
+
+}
+
+
+} // namespace Test
+
+#endif //KOKKOS_TEST_DUALVIEW_HPP
+
diff --git a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e0a89a0abd1c47d6c669b6a9645ba51b90cb5831
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp
@@ -0,0 +1,1559 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+/*--------------------------------------------------------------------------*/
+
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< class T , class ... P >
+size_t allocation_count( const Kokkos::DynRankView<T,P...> & view )
+{
+  const size_t card  = view.size();
+  const size_t alloc = view.span();
+
+  return card <= alloc ? alloc : 0 ;
+}
+
+/*--------------------------------------------------------------------------*/
+
+template< typename T, class DeviceType>
+struct TestViewOperator
+{
+  typedef DeviceType  execution_space ;
+
+  static const unsigned N = 100 ;
+  static const unsigned D = 3 ;
+
+  typedef Kokkos::DynRankView< T , execution_space > view_type ;
+
+  const view_type v1 ;
+  const view_type v2 ;
+
+  TestViewOperator()
+    : v1( "v1" , N , D )
+    , v2( "v2" , N , D )
+    {}
+
+  static void testit()
+  {
+    Kokkos::parallel_for( N , TestViewOperator() );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned i ) const
+  {
+    const unsigned X = 0 ;
+    const unsigned Y = 1 ;
+    const unsigned Z = 2 ;
+
+    v2(i,X) = v1(i,X);
+    v2(i,Y) = v1(i,Y);
+    v2(i,Z) = v1(i,Z);
+  }
+};
+
+/*--------------------------------------------------------------------------*/
+
+template< class DataType ,
+          class DeviceType ,
+          unsigned Rank >
+struct TestViewOperator_LeftAndRight ;
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 7 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight(unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, unsigned N5, unsigned N6 )
+    : left(  "left" , N0, N1, N2, N3, N4, N5, N6 )
+    , right( "right" , N0, N1, N2, N3, N4, N5, N6 )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit(unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, unsigned N5, unsigned N6 )
+  {
+    TestViewOperator_LeftAndRight driver(N0, N1, N2, N3, N4, N5, N6 );
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+
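+    // For LayoutLeft the leftmost index varies fastest, so walking the view in
+    // that order must produce strictly increasing offsets that stay inside the
+    // allocation; the second loop nest checks LayoutRight symmetrically.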
+    offset = -1 ;
+    for ( unsigned i6 = 0 ; i6 < unsigned(left.extent(6)) ; ++i6 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(left.extent(5)) ; ++i5 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(left.extent(4)) ; ++i4 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(left.extent(3)) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.extent(2)) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.extent(1)) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.extent(0)) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3, i4, i5, i6 ) -
+                     & left(  0,  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+    }
+
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.extent(0)) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.extent(1)) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(right.extent(2)) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(right.extent(3)) ; ++i3 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(right.extent(4)) ; ++i4 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(right.extent(5)) ; ++i5 )
+    for ( unsigned i6 = 0 ; i6 < unsigned(right.extent(6)) ; ++i6 )
+    {
+      const long j = & right( i0, i1, i2, i3, i4, i5, i6 ) -
+                     & right(  0,  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 6 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight(unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, unsigned N5 )
+    : left(  "left" , N0, N1, N2, N3, N4, N5 )
+    , right( "right" , N0, N1, N2, N3, N4, N5 )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit(unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, unsigned N5)
+  {
+    TestViewOperator_LeftAndRight driver (N0, N1, N2, N3, N4, N5);
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+
+    offset = -1 ;
+    for ( unsigned i5 = 0 ; i5 < unsigned(left.extent(5)) ; ++i5 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(left.extent(4)) ; ++i4 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(left.extent(3)) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.extent(2)) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.extent(1)) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.extent(0)) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3, i4, i5 ) -
+                     & left(  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+    }
+
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.extent(0)) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.extent(1)) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(right.extent(2)) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(right.extent(3)) ; ++i3 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(right.extent(4)) ; ++i4 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(right.extent(5)) ; ++i5 )
+    {
+      const long j = & right( i0, i1, i2, i3, i4, i5 ) -
+                     & right(  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 5 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
+
+  left_view    left ;
+  right_view   right ;
+  stride_view  left_stride ;
+  stride_view  right_stride ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight(unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4 )
+    : left(  "left" , N0, N1, N2, N3, N4 )
+    , right( "right" , N0, N1, N2, N3, N4 )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit(unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4)
+  {
+    TestViewOperator_LeftAndRight driver(N0, N1, N2, N3, N4);
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+
+    offset = -1 ;
+    for ( unsigned i4 = 0 ; i4 < unsigned(left.extent(4)) ; ++i4 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(left.extent(3)) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.extent(2)) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.extent(1)) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.extent(0)) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3, i4 ) -
+                     & left(  0,  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+
+      if ( & left( i0, i1, i2, i3, i4 ) !=
+           & left_stride( i0, i1, i2, i3, i4 ) ) { update |= 4 ; }
+    }
+
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.extent(0)) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.extent(1)) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(right.extent(2)) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(right.extent(3)) ; ++i3 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(right.extent(4)) ; ++i4 )
+    {
+      const long j = & right( i0, i1, i2, i3, i4 ) -
+                     & right(  0,  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+
+      if ( & right( i0, i1, i2, i3, i4 ) !=
+           & right_stride( i0, i1, i2, i3, i4 ) ) { update |= 8 ; }
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 4 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight(unsigned N0, unsigned N1, unsigned N2, unsigned N3)
+    : left(  "left" , N0, N1, N2, N3 )
+    , right( "right" , N0, N1, N2, N3 )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit(unsigned N0, unsigned N1, unsigned N2, unsigned N3)
+  {
+    TestViewOperator_LeftAndRight driver (N0, N1, N2, N3);
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+
+    offset = -1 ;
+    for ( unsigned i3 = 0 ; i3 < unsigned(left.extent(3)) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.extent(2)) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.extent(1)) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.extent(0)) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3 ) -
+                     & left(  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+    }
+
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.extent(0)) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.extent(1)) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(right.extent(2)) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(right.extent(3)) ; ++i3 )
+    {
+      const long j = & right( i0, i1, i2, i3 ) -
+                     & right(  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 3 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
+
+  left_view    left ;
+  right_view   right ;
+  stride_view  left_stride ;
+  stride_view  right_stride ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight(unsigned N0, unsigned N1, unsigned N2)
+    : left(  std::string("left") , N0, N1, N2 )
+    , right( std::string("right") , N0, N1, N2 )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit(unsigned N0, unsigned N1, unsigned N2)
+  {
+    TestViewOperator_LeftAndRight driver (N0, N1, N2);
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+
+    offset = -1 ;
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.extent(2)) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.extent(1)) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.extent(0)) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2 ) -
+                     & left(  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+
+      if ( & left(i0,i1,i2) != & left_stride(i0,i1,i2) ) { update |= 4 ; }
+    }
+
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.extent(0)) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.extent(1)) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(right.extent(2)) ; ++i2 )
+    {
+      const long j = & right( i0, i1, i2 ) -
+                     & right(  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+
+      if ( & right(i0,i1,i2) != & right_stride(i0,i1,i2) ) { update |= 8 ; }
+    }
+
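+    // Indexing with extra trailing zero arguments must address the same
+    // element as the rank-3 access; bit 3 flags a mismatch.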
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.extent(0)) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.extent(1)) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.extent(2)) ; ++i2 )
+    {
+      if ( & left(i0,i1,i2)  != & left(i0,i1,i2,0,0,0,0) )  { update |= 3 ; }
+      if ( & right(i0,i1,i2) != & right(i0,i1,i2,0,0,0,0) ) { update |= 3 ; }
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 2 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight(unsigned N0, unsigned N1)
+    : left(  "left" , N0, N1 )
+    , right( "right" , N0, N1 )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit(unsigned N0, unsigned N1)
+  {
+    TestViewOperator_LeftAndRight driver(N0, N1);
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+
+    offset = -1 ;
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.extent(1)) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.extent(0)) ; ++i0 )
+    {
+      const long j = & left( i0, i1 ) -
+                     & left(  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+    }
+
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.extent(0)) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.extent(1)) ; ++i1 )
+    {
+      const long j = & right( i0, i1 ) -
+                     & right(  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+    }
+
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.extent(0)) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.extent(1)) ; ++i1 )
+    {
+      if ( & left(i0,i1)  != & left(i0,i1,0,0,0,0,0) )  { update |= 3 ; }
+      if ( & right(i0,i1) != & right(i0,i1,0,0,0,0,0) ) { update |= 3 ; }
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 1 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  typedef Kokkos::
+    DynRankView< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
+
+  left_view    left ;
+  right_view   right ;
+  stride_view  left_stride ;
+  stride_view  right_stride ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight(unsigned N0)
+    : left(  "left" , N0 )
+    , right( "right" , N0 )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit(unsigned N0)
+  {
+    TestViewOperator_LeftAndRight driver (N0) ;
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.extent(0)) ; ++i0 )
+    {
+      if ( & left(i0)  != & left(i0,0,0,0,0,0,0) )  { update |= 3 ; }
+      if ( & right(i0) != & right(i0,0,0,0,0,0,0) ) { update |= 3 ; }
+      if ( & left(i0)  != & left_stride(i0) ) { update |= 4 ; }
+      if ( & right(i0) != & right_stride(i0) ) { update |= 8 ; }
+    }
+  }
+};
+
+/*--------------------------------------------------------------------------*/
+
+template< typename T, class DeviceType >
+class TestDynViewAPI
+{
+public:
+  typedef DeviceType        device ;
+
+  enum { N0 = 1000 ,
+         N1 = 3 ,
+         N2 = 5 ,
+         N3 = 7 };
+
+  typedef Kokkos::DynRankView< T , device > dView0 ;
+  typedef Kokkos::DynRankView< const T , device > const_dView0 ;
+
+  typedef Kokkos::DynRankView< T, device, Kokkos::MemoryUnmanaged > dView0_unmanaged ;
+  typedef typename dView0::host_mirror_space host_drv_space ;
+
+  typedef Kokkos::View< T , device >        View0 ;
+  typedef Kokkos::View< T* , device >       View1 ;
+  typedef Kokkos::View< T******* , device > View7 ;
+
+  typedef typename View0::host_mirror_space  host_view_space ;
+
+  TestDynViewAPI()
+  {
+    run_test_resize_realloc();
+    run_test_mirror();
+    run_test_scalar();
+    run_test();
+    run_test_const();
+    run_test_subview();
+    run_test_subview_strided();
+    run_test_vector();
+
+    TestViewOperator< T , device >::testit();
+    TestViewOperator_LeftAndRight< int , device , 7 >::testit(2,3,4,2,3,4,2); 
+    TestViewOperator_LeftAndRight< int , device , 6 >::testit(2,3,4,2,3,4); 
+    TestViewOperator_LeftAndRight< int , device , 5 >::testit(2,3,4,2,3);
+    TestViewOperator_LeftAndRight< int , device , 4 >::testit(2,3,4,2);
+    TestViewOperator_LeftAndRight< int , device , 3 >::testit(2,3,4);
+    TestViewOperator_LeftAndRight< int , device , 2 >::testit(2,3);
+    TestViewOperator_LeftAndRight< int , device , 1 >::testit(2);
+  }
+
+  static void run_test_resize_realloc()
+  {
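+    // Both resize and realloc can change the rank of a DynRankView; extents
+    // beyond the current rank report 1.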
+    dView0 drv0("drv0", 10, 20, 30);
+    ASSERT_EQ( drv0.rank(), 3);
+
+    Kokkos::resize(drv0, 5, 10);
+    ASSERT_EQ( drv0.rank(), 2);
+    ASSERT_EQ( drv0.extent(0), 5);
+    ASSERT_EQ( drv0.extent(1), 10);
+    ASSERT_EQ( drv0.extent(2), 1);
+
+    Kokkos::realloc(drv0, 10, 20);
+    ASSERT_EQ( drv0.rank(), 2);
+    ASSERT_EQ( drv0.extent(0), 10);
+    ASSERT_EQ( drv0.extent(1), 20);
+    ASSERT_EQ( drv0.extent(2), 1);
+
+  }
+
+  static void run_test_mirror()
+  {
+    typedef Kokkos::DynRankView< int , host_drv_space > view_type ;
+    typedef typename view_type::HostMirror mirror_type ;
+    view_type a("a");
+    mirror_type am = Kokkos::create_mirror_view(a);
+    mirror_type ax = Kokkos::create_mirror(a);
+    ASSERT_EQ( & a() , & am() );
+    ASSERT_EQ( a.rank() , am.rank() );
+    ASSERT_EQ( ax.rank() , am.rank() );
+
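+    // create_mirror always allocates a fresh copy, while create_mirror_view
+    // may alias the original when the memory spaces match; the pointer
+    // comparisons below check both behaviors.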
+    if (Kokkos::HostSpace::execution_space::is_initialized() )
+    {
+      Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
+      auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
+      auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
+
+      int equal_ptr_h_h2  = (a_h.data() ==a_h2.data())?1:0;
+      int equal_ptr_h_d   = (a_h.data() ==a_d. data())?1:0;
+      int equal_ptr_h2_d  = (a_h2.data()==a_d. data())?1:0;
+
+      ASSERT_EQ(equal_ptr_h_h2,0);
+      ASSERT_EQ(equal_ptr_h_d ,0);
+      ASSERT_EQ(equal_ptr_h2_d,0);
+  
+      ASSERT_EQ(a_h.extent(0),a_h2.extent(0));
+      ASSERT_EQ(a_h.extent(0),a_d .extent(0));
+
+      ASSERT_EQ(a_h.rank(),a_h2.rank());
+      ASSERT_EQ(a_h.rank(),a_d.rank());
+    }
+    if (Kokkos::HostSpace::execution_space::is_initialized() )
+    {
+      Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
+      auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
+      auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
+
+      int equal_ptr_h_h2  = (a_h.data() ==a_h2.data())?1:0;
+      int equal_ptr_h_d   = (a_h.data() ==a_d. data())?1:0;
+      int equal_ptr_h2_d  = (a_h2.data()==a_d. data())?1:0;
+
+      ASSERT_EQ(equal_ptr_h_h2,0);
+      ASSERT_EQ(equal_ptr_h_d ,0);
+      ASSERT_EQ(equal_ptr_h2_d,0);
+  
+      ASSERT_EQ(a_h.extent(0),a_h2.extent(0));
+      ASSERT_EQ(a_h.extent(0),a_d .extent(0));
+
+      ASSERT_EQ(a_h.rank(),a_h2.rank());
+      ASSERT_EQ(a_h.rank(),a_d.rank());
+    }
+
+    if (Kokkos::HostSpace::execution_space::is_initialized() )
+    {
+      Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
+      auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
+      auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
+
+      int equal_ptr_h_h2  = a_h.data() ==a_h2.data()?1:0;
+      int equal_ptr_h_d   = a_h.data() ==a_d. data()?1:0;
+      int equal_ptr_h2_d  = a_h2.data()==a_d. data()?1:0;
+
+      int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
+      ASSERT_EQ(equal_ptr_h_h2,1);
+      ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
+      ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
+
+      ASSERT_EQ(a_h.extent(0),a_h2.extent(0));
+      ASSERT_EQ(a_h.extent(0),a_d .extent(0));
+
+      ASSERT_EQ(a_h.rank(),a_h2.rank());
+      ASSERT_EQ(a_h.rank(),a_d.rank());
+    }
+    if (Kokkos::HostSpace::execution_space::is_initialized() )
+    {
+      Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
+      auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
+      auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
+
+      int equal_ptr_h_h2  = a_h.data() ==a_h2.data()?1:0;
+      int equal_ptr_h_d   = a_h.data() ==a_d. data()?1:0;
+      int equal_ptr_h2_d  = a_h2.data()==a_d. data()?1:0;
+
+      int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
+      ASSERT_EQ(equal_ptr_h_h2,1);
+      ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
+      ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
+
+  
+      ASSERT_EQ(a_h.extent(0),a_h2.extent(0));
+      ASSERT_EQ(a_h.extent(0),a_d .extent(0));
+
+      ASSERT_EQ(a_h.rank(),a_h2.rank());
+      ASSERT_EQ(a_h.rank(),a_d.rank());
+    }
+    if (Kokkos::HostSpace::execution_space::is_initialized() )
+    {
+      typedef Kokkos::DynRankView< int , Kokkos::LayoutStride , Kokkos::HostSpace > view_stride_type ;
+      unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
+      view_stride_type a_h( "a" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
+      auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
+      auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
+
+      int equal_ptr_h_h2  = a_h.data() ==a_h2.data()?1:0;
+      int equal_ptr_h_d   = a_h.data() ==a_d. data()?1:0;
+      int equal_ptr_h2_d  = a_h2.data()==a_d. data()?1:0;
+
+      int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
+      ASSERT_EQ(equal_ptr_h_h2,1);
+      ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
+      ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
+
+      ASSERT_EQ(a_h.extent(0),a_h2.extent(0));
+      ASSERT_EQ(a_h.extent(0),a_d .extent(0));
+
+      ASSERT_EQ(a_h.rank(),a_h2.rank());
+      ASSERT_EQ(a_h.rank(),a_d.rank());
+    }
+  }
+
+  static void run_test_scalar()
+  {
+    typedef typename dView0::HostMirror  hView0 ; //HostMirror of DynRankView is a DynRankView
+
+    dView0 dx , dy ;
+    hView0 hx , hy ;
+
+    dx = dView0( "dx" );
+    dy = dView0( "dy" );
+
+    hx = Kokkos::create_mirror( dx );
+    hy = Kokkos::create_mirror( dy );
+
+    hx() = 1 ;
+
+    Kokkos::deep_copy( dx , hx );
+    Kokkos::deep_copy( dy , dx );
+    Kokkos::deep_copy( hy , dy );
+
+    ASSERT_EQ( hx(), hy() );
+    ASSERT_EQ( dx.rank() , hx.rank() );
+    ASSERT_EQ( dy.rank() , hy.rank() );
+
+  //View - DynRankView Interoperability tests
+  // deep_copy DynRankView to View
+    View0 vx("vx");
+    Kokkos::deep_copy( vx , dx );
+    ASSERT_EQ( rank(dx) , rank(vx) );
+
+    View0 vy("vy");
+    Kokkos::deep_copy( vy , dy );
+    ASSERT_EQ( rank(dy) , rank(vy) );
+
+  // deep_copy View to DynRankView 
+    dView0 dxx("dxx");
+    Kokkos::deep_copy( dxx , vx );
+    ASSERT_EQ( rank(dxx) , rank(vx) );
+
+
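+    // ConstDownCast converts the DynRankView to its underlying rank-7 View,
+    // so extents can be compared directly.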
+    View7 vcast = dx.ConstDownCast();
+    ASSERT_EQ( dx.extent(0) , vcast.extent(0) );
+    ASSERT_EQ( dx.extent(1) , vcast.extent(1) );
+    ASSERT_EQ( dx.extent(2) , vcast.extent(2) );
+    ASSERT_EQ( dx.extent(3) , vcast.extent(3) );
+    ASSERT_EQ( dx.extent(4) , vcast.extent(4) );
+
+    View7 vcast1( dy.ConstDownCast() );
+    ASSERT_EQ( dy.extent(0) , vcast1.extent(0) );
+    ASSERT_EQ( dy.extent(1) , vcast1.extent(1) );
+    ASSERT_EQ( dy.extent(2) , vcast1.extent(2) );
+    ASSERT_EQ( dy.extent(3) , vcast1.extent(3) );
+    ASSERT_EQ( dy.extent(4) , vcast1.extent(4) );
+
+  //View - DynRankView Interoperability tests
+  // copy View to DynRankView
+    dView0 dfromvx( vx );
+    auto hmx = Kokkos::create_mirror_view(dfromvx) ;
+    Kokkos::deep_copy(hmx , dfromvx);
+    auto hvx = Kokkos::create_mirror_view(vx) ;
+    Kokkos::deep_copy(hvx , vx);
+    ASSERT_EQ( rank(hvx) , rank(hmx) );
+    ASSERT_EQ( hvx.extent(0) , hmx.extent(0) );
+    ASSERT_EQ( hvx.extent(1) , hmx.extent(1) );
+
+  // copy-assign View to DynRankView
+    dView0 dfromvy = vy ;
+    auto hmy = Kokkos::create_mirror_view(dfromvy) ;
+    Kokkos::deep_copy(hmy , dfromvy);
+    auto hvy = Kokkos::create_mirror_view(vy) ;
+    Kokkos::deep_copy(hvy , vy);
+    ASSERT_EQ( rank(hvy) , rank(hmy) );
+    ASSERT_EQ( hvy.extent(0) , hmy.extent(0) );
+    ASSERT_EQ( hvy.extent(1) , hmy.extent(1) );
+
+
+    View7 vtest1("vtest1",2,2,2,2,2,2,2);
+    dView0 dfromv1( vtest1 );
+    ASSERT_EQ( dfromv1.rank() , vtest1.Rank );
+    ASSERT_EQ( dfromv1.extent(0) , vtest1.extent(0) );
+    ASSERT_EQ( dfromv1.extent(1) , vtest1.extent(1) );
+    ASSERT_EQ( dfromv1.use_count() , vtest1.use_count() );
+
+    dView0 dfromv2( vcast );
+    ASSERT_EQ( dfromv2.rank() , vcast.Rank );
+    ASSERT_EQ( dfromv2.extent(0) , vcast.extent(0) );
+    ASSERT_EQ( dfromv2.extent(1) , vcast.extent(1) );
+    ASSERT_EQ( dfromv2.use_count() , vcast.use_count() );
+
+    dView0 dfromv3 = vcast1;
+    ASSERT_EQ( dfromv3.rank() , vcast1.Rank );
+    ASSERT_EQ( dfromv3.extent(0) , vcast1.extent(0) );
+    ASSERT_EQ( dfromv3.extent(1) , vcast1.extent(1) );
+    ASSERT_EQ( dfromv3.use_count() , vcast1.use_count() );
+  }
+
+  static void run_test()
+  {
+    // mfh 14 Feb 2014: This test doesn't otherwise create instances of
+    // these types.  In order to avoid "declared but unused typedef"
+    // warnings, we declare empty instances of these types, with the
+    // usual "(void)" marker to avoid compiler warnings for unused
+    // variables.
+
+    typedef typename dView0::HostMirror  hView0 ;
+
+    {
+      hView0 thing;
+      (void) thing;
+    }
+
+    dView0 d_uninitialized(Kokkos::ViewAllocateWithoutInitializing("uninit"),10,20);
+    ASSERT_TRUE( d_uninitialized.data() != nullptr );
+    ASSERT_EQ( d_uninitialized.rank() , 2 );
+    ASSERT_EQ( d_uninitialized.extent(0) , 10 );
+    ASSERT_EQ( d_uninitialized.extent(1) , 20 );
+    ASSERT_EQ( d_uninitialized.extent(2) , 1  );
+
+    dView0 dx , dy , dz ;
+    hView0 hx , hy , hz ;
+
+    ASSERT_TRUE( Kokkos::is_dyn_rank_view<dView0>::value );
+    ASSERT_FALSE( Kokkos::is_dyn_rank_view< Kokkos::View<double> >::value );
+
+    ASSERT_TRUE( dx.ptr_on_device() == 0 ); //Okay with UVM
+    ASSERT_TRUE( dy.ptr_on_device() == 0 );  //Okay with UVM
+    ASSERT_TRUE( dz.ptr_on_device() == 0 ); //Okay with UVM
+    ASSERT_TRUE( hx.ptr_on_device() == 0 );
+    ASSERT_TRUE( hy.ptr_on_device() == 0 );
+    ASSERT_TRUE( hz.ptr_on_device() == 0 );
+    ASSERT_EQ( dx.extent(0) , 0u ); //Okay with UVM
+    ASSERT_EQ( dy.extent(0) , 0u ); //Okay with UVM
+    ASSERT_EQ( dz.extent(0) , 0u ); //Okay with UVM
+    ASSERT_EQ( hx.extent(0) , 0u );
+    ASSERT_EQ( hy.extent(0) , 0u );
+    ASSERT_EQ( hz.extent(0) , 0u );
+    ASSERT_EQ( dx.rank() , 0u ); //Okay with UVM
+    ASSERT_EQ( hx.rank() , 0u );
+
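+    // Assigning freshly constructed views raises the dynamic rank from 0 to 3
+    // (and to 4 below).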
+    dx = dView0( "dx" , N1 , N2 , N3 );
+    dy = dView0( "dy" , N1 , N2 , N3 );
+
+    hx = hView0( "hx" , N1 , N2 , N3 );
+    hy = hView0( "hy" , N1 , N2 , N3 );
+
+    ASSERT_EQ( dx.extent(0) , unsigned(N1) ); //Okay with UVM
+    ASSERT_EQ( dy.extent(0) , unsigned(N1) ); //Okay with UVM
+    ASSERT_EQ( hx.extent(0) , unsigned(N1) );
+    ASSERT_EQ( hy.extent(0) , unsigned(N1) );
+    ASSERT_EQ( dx.rank() , 3 ); //Okay with UVM
+    ASSERT_EQ( hx.rank() , 3 );
+
+    dx = dView0( "dx" , N0 , N1 , N2 , N3 );
+    dy = dView0( "dy" , N0 , N1 , N2 , N3 );
+    hx = hView0( "hx" , N0 , N1 , N2 , N3 );
+    hy = hView0( "hy" , N0 , N1 , N2 , N3 );
+
+    ASSERT_EQ( dx.extent(0) , unsigned(N0) );
+    ASSERT_EQ( dy.extent(0) , unsigned(N0) );
+    ASSERT_EQ( hx.extent(0) , unsigned(N0) );
+    ASSERT_EQ( hy.extent(0) , unsigned(N0) );
+    ASSERT_EQ( dx.rank() , 4 );
+    ASSERT_EQ( dy.rank() , 4 );
+    ASSERT_EQ( hx.rank() , 4 );
+    ASSERT_EQ( hy.rank() , 4 );
+
+    ASSERT_EQ( dx.use_count() , size_t(1) );
+
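+    // Unmanaged views do not participate in reference counting, so the
+    // use_count stays unchanged.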
+    dView0_unmanaged unmanaged_dx = dx;
+    ASSERT_EQ( dx.use_count() , size_t(1) );
+
+
+    dView0_unmanaged unmanaged_from_ptr_dx = dView0_unmanaged(dx.ptr_on_device(),
+                                                              dx.extent(0),
+                                                              dx.extent(1),
+                                                              dx.extent(2),
+                                                              dx.extent(3));
+
+
+    {
+      // Destruction of this view should be harmless
+      const_dView0 unmanaged_from_ptr_const_dx( dx.ptr_on_device() ,
+                                                dx.extent(0) ,
+                                                dx.extent(1) ,
+                                                dx.extent(2) ,
+                                                dx.extent(3) );
+    }
+
+    const_dView0 const_dx = dx ;
+    ASSERT_EQ( dx.use_count() , size_t(2) );
+
+    {
+      const_dView0 const_dx2;
+      const_dx2 = const_dx;
+      ASSERT_EQ( dx.use_count() , size_t(3) );
+
+      const_dx2 = dy;
+      ASSERT_EQ( dx.use_count() , size_t(2) );
+
+      const_dView0 const_dx3(dx);
+      ASSERT_EQ( dx.use_count() , size_t(3) );
+      
+      dView0_unmanaged dx4_unmanaged(dx);
+      ASSERT_EQ( dx.use_count() , size_t(3) );
+    }
+
+    ASSERT_EQ( dx.use_count() , size_t(2) );
+
+
+    ASSERT_FALSE( dx.ptr_on_device() == 0 );
+    ASSERT_FALSE( const_dx.ptr_on_device() == 0 );
+    ASSERT_FALSE( unmanaged_dx.ptr_on_device() == 0 );
+    ASSERT_FALSE( unmanaged_from_ptr_dx.ptr_on_device() == 0 );
+    ASSERT_FALSE( dy.ptr_on_device() == 0 );
+    ASSERT_NE( dx , dy );
+
+    ASSERT_EQ( dx.extent(0) , unsigned(N0) );
+    ASSERT_EQ( dx.extent(1) , unsigned(N1) );
+    ASSERT_EQ( dx.extent(2) , unsigned(N2) );
+    ASSERT_EQ( dx.extent(3) , unsigned(N3) );
+
+    ASSERT_EQ( dy.extent(0) , unsigned(N0) );
+    ASSERT_EQ( dy.extent(1) , unsigned(N1) );
+    ASSERT_EQ( dy.extent(2) , unsigned(N2) );
+    ASSERT_EQ( dy.extent(3) , unsigned(N3) );
+
+    ASSERT_EQ( unmanaged_from_ptr_dx.capacity(),unsigned(N0)*unsigned(N1)*unsigned(N2)*unsigned(N3) );
+
+    hx = Kokkos::create_mirror( dx );
+    hy = Kokkos::create_mirror( dy );
+
+    ASSERT_EQ( hx.rank() , dx.rank() );
+    ASSERT_EQ( hy.rank() , dy.rank() );
+
+    ASSERT_EQ( hx.extent(0) , unsigned(N0) );
+    ASSERT_EQ( hx.extent(1) , unsigned(N1) );
+    ASSERT_EQ( hx.extent(2) , unsigned(N2) );
+    ASSERT_EQ( hx.extent(3) , unsigned(N3) );
+
+    ASSERT_EQ( hy.extent(0) , unsigned(N0) );
+    ASSERT_EQ( hy.extent(1) , unsigned(N1) );
+    ASSERT_EQ( hy.extent(2) , unsigned(N2) );
+    ASSERT_EQ( hy.extent(3) , unsigned(N3) );
+
+    // T v1 = hx() ;    // Generates compile error as intended
+    // T v2 = hx(0,0) ; // Generates compile error as intended
+    // hx(0,0) = v2 ;   // Generates compile error as intended
+
+#if 0 /* Asynchronous deep copies not implemented for dynamic rank view */
+    // Testing with asynchronous deep copy with respect to device
+    {
+      size_t count = 0 ;
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < hx.extent(1) ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < hx.extent(2) ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < hx.extent(3) ; ++i3 ) {
+        hx(ip,i1,i2,i3) = ++count ;
+      }}}}
+
+
+      Kokkos::deep_copy(typename hView0::execution_space(), dx , hx );
+      Kokkos::deep_copy(typename hView0::execution_space(), dy , dx );
+      Kokkos::deep_copy(typename hView0::execution_space(), hy , dy );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
+      }}}}
+
+      Kokkos::deep_copy(typename hView0::execution_space(), dx , T(0) );
+      Kokkos::deep_copy(typename hView0::execution_space(), hx , dx );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
+      }}}}
+    }
+
+    // Testing with asynchronous deep copy with respect to host
+    {
+      size_t count = 0 ;
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < hx.extent(1) ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < hx.extent(2) ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < hx.extent(3) ; ++i3 ) {
+        hx(ip,i1,i2,i3) = ++count ;
+      }}}}
+
+      Kokkos::deep_copy(typename dView0::execution_space(), dx , hx );
+      Kokkos::deep_copy(typename dView0::execution_space(), dy , dx );
+      Kokkos::deep_copy(typename dView0::execution_space(), hy , dy );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
+      }}}}
+
+      Kokkos::deep_copy(typename dView0::execution_space(), dx , T(0) );
+      Kokkos::deep_copy(typename dView0::execution_space(), hx , dx );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
+      }}}}
+    }
+#endif
+
+    // Testing with synchronous deep copy
+    {
+      size_t count = 0 ;
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < hx.extent(1) ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < hx.extent(2) ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < hx.extent(3) ; ++i3 ) {
+        hx(ip,i1,i2,i3) = ++count ;
+      }}}}
+
+      Kokkos::deep_copy( dx , hx );
+      Kokkos::deep_copy( dy , dx );
+      Kokkos::deep_copy( hy , dy );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
+      }}}}
+
+      Kokkos::deep_copy( dx , T(0) );
+      Kokkos::deep_copy( hx , dx );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
+      }}}}
+//    ASSERT_EQ( hx(0,0,0,0,0,0,0,0) , T(0) ); //Test rank8 op behaves properly - if implemented
+    }
+
+    dz = dx ; ASSERT_EQ( dx, dz); ASSERT_NE( dy, dz);
+    dz = dy ; ASSERT_EQ( dy, dz); ASSERT_NE( dx, dz);
+
+    dx = dView0();
+    ASSERT_TRUE( dx.ptr_on_device() == 0 );
+    ASSERT_FALSE( dy.ptr_on_device() == 0 );
+    ASSERT_FALSE( dz.ptr_on_device() == 0 );
+    dy = dView0();
+    ASSERT_TRUE( dx.ptr_on_device() == 0 );
+    ASSERT_TRUE( dy.ptr_on_device() == 0 );
+    ASSERT_FALSE( dz.ptr_on_device() == 0 );
+    dz = dView0();
+    ASSERT_TRUE( dx.ptr_on_device() == 0 );
+    ASSERT_TRUE( dy.ptr_on_device() == 0 );
+    ASSERT_TRUE( dz.ptr_on_device() == 0 );
+
+  //View - DynRankView Interoperability tests
+    // deep_copy from view to dynrankview
+    const int testdim = 4;
+    dView0 dxx("dxx",testdim);
+    View1  vxx("vxx",testdim);
+    auto hvxx = Kokkos::create_mirror_view(vxx); 
+    for (int i = 0; i < testdim; ++i)
+      { hvxx(i) = i; }
+    Kokkos::deep_copy(vxx,hvxx);
+    Kokkos::deep_copy(dxx,vxx);
+    auto hdxx = Kokkos::create_mirror_view(dxx);
+    Kokkos::deep_copy(hdxx,dxx);
+    for (int i = 0; i < testdim; ++i)
+      { ASSERT_EQ( hvxx(i) , hdxx(i) ); }
+
+    ASSERT_EQ( rank(hdxx) , rank(hvxx) );
+    ASSERT_EQ( hdxx.extent(0) , testdim );
+    ASSERT_EQ( hdxx.extent(0) , hvxx.extent(0) );
+
+    // deep_copy from dynrankview to view
+    View1 vdxx("vdxx",testdim);
+    auto hvdxx = Kokkos::create_mirror_view(vdxx);
+    Kokkos::deep_copy(hvdxx , hdxx);
+    ASSERT_EQ( rank(hdxx) , rank(hvdxx) );
+    ASSERT_EQ( hvdxx.extent(0) , testdim );
+    ASSERT_EQ( hdxx.extent(0) , hvdxx.extent(0) );
+    for (int i = 0; i < testdim; ++i)
+      { ASSERT_EQ( hvxx(i) , hvdxx(i) ); }
+  }
+
+  typedef T DataType ;
+
+  static void
+  check_auto_conversion_to_const(
+     const Kokkos::DynRankView< const DataType , device > & arg_const ,
+     const Kokkos::DynRankView< DataType , device > & arg )
+  {
+    ASSERT_TRUE( arg_const == arg );
+  }
+
+  static void run_test_const()
+  {
+    typedef Kokkos::DynRankView< DataType , device > typeX ;
+    typedef Kokkos::DynRankView< const DataType , device > const_typeX ;
+    typedef Kokkos::DynRankView< const DataType , device , Kokkos::MemoryRandomAccess > const_typeR ;
+    typeX x( "X", 2 );
+    const_typeX xc = x ;
+    const_typeR xr = x ;
+
+    ASSERT_TRUE( xc == x );
+    ASSERT_TRUE( x == xc );
+
+    // For CUDA, the constant random-access View does not return an lvalue
+    // reference because it retrieves through the texture cache, so querying
+    // the underlying pointer is not allowed.
+#if defined(KOKKOS_ENABLE_CUDA)
+    if ( ! std::is_same< typename device::execution_space , Kokkos::Cuda >::value )
+#endif
+    {
+      ASSERT_TRUE( x.ptr_on_device() == xr.ptr_on_device() );
+    }
+
+    // typeX xf = xc ; // setting non-const from const must not compile
+
+    check_auto_conversion_to_const( x , x );
+  }
+
+
+  static void run_test_subview()
+  {
+    typedef Kokkos::DynRankView< const T , device > cdView ;
+    typedef Kokkos::DynRankView< T , device > dView ;
+  // LayoutStride is required for all DynRankViews returned by subdynrankview
+    typedef Kokkos::DynRankView< T , Kokkos::LayoutStride , device > sdView ; 
+
+    dView0 d0( "d0" );
+    cdView s0 = d0 ;
+
+  //  N0 = 1000, N1 = 3, N2 = 5, N3 = 7
+    unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
+    sdView d7( "d7" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
+    ASSERT_EQ( d7.rank() , 7 );
+
+    sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ); 
+    ASSERT_EQ( ds0.rank() , 0 );
+
+//Basic test - ALL
+    sdView dsALL = Kokkos::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() ); 
+    ASSERT_EQ( dsALL.rank() , 7 );
+
+//  Send a value to the final rank, returning a rank-6 subview
+    sdView dsm1 = Kokkos::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , 1 );
+    ASSERT_EQ( dsm1.rank() , 6 );
+
+//  Send a std::pair as argument to a rank
+    sdView dssp = Kokkos::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , std::pair<unsigned,unsigned>(1,2) );
+    ASSERT_EQ( dssp.rank() , 7 );
+
+//  Send a Kokkos::pair as argument to a rank; take default layout as input
+    dView0 dd0("dd0" , N0 , N1 , N2 , 2 , 2 , 2 , 2 ); //default layout
+    ASSERT_EQ( dd0.rank() , 7 );
+    sdView dtkp = Kokkos::subdynrankview( dd0 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
+    ASSERT_EQ( dtkp.rank() , 7 );
+
+// Return rank 7 subview, taking a pair as one argument, layout stride input
+    sdView ds7 = Kokkos::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
+    ASSERT_EQ( ds7.rank() , 7 );
+
+// Default Layout DynRankView
+    dView dv6("dv6" , N0 , N1 , N2 , N3 , 2 , 2 );
+    ASSERT_EQ( dv6.rank() , 6 );
+
+// DynRankView with LayoutRight
+    typedef Kokkos::DynRankView< T , Kokkos::LayoutRight , device > drView ;
+    drView dr5( "dr5" , N0 , N1 , N2 , 2 , 2 );
+    ASSERT_EQ( dr5.rank() , 5 );
+
+// LayoutStride but arranged as LayoutRight
+  // NOTE: unused arg_layout dimensions must be set to ~size_t(0) so that 
+  //  rank deduction can properly take place
+    unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
+    Kokkos::LayoutStride ls = Kokkos::LayoutStride::order_dimensions(5, order5, dimen5);
+    ls.dimension[5] = ~size_t(0);
+    ls.dimension[6] = ~size_t(0);
+    ls.dimension[7] = ~size_t(0);
+    sdView d5("d5", ls);
+    ASSERT_EQ( d5.rank() , 5 );
+
+//  LayoutStride arranged as LayoutRight - commented out as an example that fails the unit test
+//    unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
+//    sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order5, dimen5) );
+//
+//  Fails the following unit test:
+//    ASSERT_EQ( d5.rank() , dr5.rank() );
+//
+//  Explanation: In the construction of the Kokkos::LayoutStride above, since
+//   the remaining dimensions are not specified, they default to values of 0
+//   rather than ~size_t(0).
+//  When passed to the DynRankView constructor, the default dimensions (of 0)
+//   are counted toward the dynamic rank, yielding an incorrect value
+//   (i.e. rank 7 rather than 5).
+
+// Check LayoutRight dr5 and LayoutStride d5 dimensions agree (as they should) 
+    ASSERT_EQ( d5.extent(0) , dr5.extent(0) );
+    ASSERT_EQ( d5.extent(1) , dr5.extent(1) );
+    ASSERT_EQ( d5.extent(2) , dr5.extent(2) );
+    ASSERT_EQ( d5.extent(3) , dr5.extent(3) );
+    ASSERT_EQ( d5.extent(4) , dr5.extent(4) );
+    ASSERT_EQ( d5.extent(5) , dr5.extent(5) );
+    ASSERT_EQ( d5.rank() , dr5.rank() );
+
+// Rank 5 subview of rank 5 dynamic rank view, layout stride input
+    sdView ds5 = Kokkos::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
+    ASSERT_EQ( ds5.rank() , 5 );
+
+// Pass in extra ALL arguments beyond the rank of the DynRankView.
+// This behavior is allowed: the extra ALL arguments are ignored when
+//  src.rank() < the number of arguments, but be careful!
+    sdView ds5plus = Kokkos::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) , Kokkos::ALL() );
+
+    ASSERT_EQ( ds5.rank() , ds5plus.rank() );
+    ASSERT_EQ( ds5.extent(0) , ds5plus.extent(0) );
+    ASSERT_EQ( ds5.extent(4) , ds5plus.extent(4) );
+    ASSERT_EQ( ds5.extent(5) , ds5plus.extent(5) );
+
+#if ! defined( KOKKOS_ENABLE_CUDA ) || defined ( KOKKOS_ENABLE_CUDA_UVM )
+    ASSERT_EQ( & ds5(1,1,1,1,0) - & ds5plus(1,1,1,1,0) , 0 );
+    ASSERT_EQ( & ds5(1,1,1,1,0,0) - & ds5plus(1,1,1,1,0,0) , 0 );  // passing an argument for a rank beyond the view's rank is allowed iff it is 0.
+#endif
+
+// Similar test to rank 5 above, but create rank 4 subview
+// Check that the rank contracts (ds4 and ds4plus) and that subdynrankview can accept extra args (ds4plus)
+    sdView ds4 = Kokkos::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , 0 );
+    sdView ds4plus = Kokkos::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , 0 , Kokkos::ALL() );
+
+    ASSERT_EQ( ds4.rank() , ds4plus.rank() );
+    ASSERT_EQ( ds4.rank() , 4 );
+    ASSERT_EQ( ds4.extent(0) , ds4plus.extent(0) );
+    ASSERT_EQ( ds4.extent(4) , ds4plus.extent(4) );
+    ASSERT_EQ( ds4.extent(5) , ds4plus.extent(5) );
+  }
+
+  static void run_test_subview_strided()
+  {
+    typedef Kokkos::DynRankView < int , Kokkos::LayoutLeft , host_drv_space > drview_left ;
+    typedef Kokkos::DynRankView < int , Kokkos::LayoutRight , host_drv_space > drview_right ;
+    typedef Kokkos::DynRankView < int , Kokkos::LayoutStride , host_drv_space > drview_stride ;
+
+    drview_left  xl2( "xl2", 100 , 200 );
+    drview_right xr2( "xr2", 100 , 200 );
+    drview_stride yl1 = Kokkos::subdynrankview( xl2 , 0 , Kokkos::ALL() );
+    drview_stride yl2 = Kokkos::subdynrankview( xl2 , 1 , Kokkos::ALL() );
+    drview_stride ys1 = Kokkos::subdynrankview( xr2 , 0 , Kokkos::ALL() );
+    drview_stride ys2 = Kokkos::subdynrankview( xr2 , 1 , Kokkos::ALL() );
+    drview_stride yr1 = Kokkos::subdynrankview( xr2 , 0 , Kokkos::ALL() );
+    drview_stride yr2 = Kokkos::subdynrankview( xr2 , 1 , Kokkos::ALL() );
+
+    ASSERT_EQ( yl1.extent(0) , xl2.extent(1) );
+    ASSERT_EQ( yl2.extent(0) , xl2.extent(1) );
+
+    ASSERT_EQ( yr1.extent(0) , xr2.extent(1) );
+    ASSERT_EQ( yr2.extent(0) , xr2.extent(1) );
+
+    ASSERT_EQ( & yl1(0) - & xl2(0,0) , 0 );
+    ASSERT_EQ( & yl2(0) - & xl2(1,0) , 0 );
+    ASSERT_EQ( & yr1(0) - & xr2(0,0) , 0 );
+    ASSERT_EQ( & yr2(0) - & xr2(1,0) , 0 );
+
+
+    drview_left  xl4( "xl4", 10 , 20 , 30 , 40 );
+    drview_right xr4( "xr4", 10 , 20 , 30 , 40 );
+
+    // Test that subview can be used in place of subdynrankview
+    drview_stride yl4 = Kokkos::subview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
+    drview_stride yr4 = Kokkos::subview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
+
+    ASSERT_EQ( yl4.extent(0) , xl4.extent(1) );
+    ASSERT_EQ( yl4.extent(1) , xl4.extent(3) );
+    ASSERT_EQ( yr4.extent(0) , xr4.extent(1) );
+    ASSERT_EQ( yr4.extent(1) , xr4.extent(3) );
+    ASSERT_EQ( yl4.rank() , 2);
+    ASSERT_EQ( yr4.rank() , 2);
+
+    ASSERT_EQ( & yl4(4,4) - & xl4(1,4,2,4) , 0 );
+    ASSERT_EQ( & yr4(4,4) - & xr4(1,4,2,4) , 0 );
+  }
+
+  static void run_test_vector()
+  {
+    static const unsigned Length = 1000 , Count = 8 ;
+
+    typedef typename Kokkos::DynRankView< T , Kokkos::LayoutLeft , host_drv_space > multivector_type ; 
+
+    typedef typename Kokkos::DynRankView< T , Kokkos::LayoutRight , host_drv_space > multivector_right_type ;
+
+    multivector_type mv = multivector_type( "mv" , Length , Count );
+    multivector_right_type mv_right = multivector_right_type( "mv" , Length , Count );
+
+    typedef typename Kokkos::DynRankView< T , Kokkos::LayoutStride , host_drv_space > svector_type ;
+    typedef typename Kokkos::DynRankView< T , Kokkos::LayoutStride , host_drv_space > smultivector_type ;
+    typedef typename Kokkos::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_right_type ; 
+    typedef typename Kokkos::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_type ;
+    typedef typename Kokkos::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_smultivector_type ;
+
+    svector_type v1 = Kokkos::subdynrankview( mv , Kokkos::ALL() , 0 );
+    svector_type v2 = Kokkos::subdynrankview( mv , Kokkos::ALL() , 1 );
+    svector_type v3 = Kokkos::subdynrankview( mv , Kokkos::ALL() , 2 );
+
+    svector_type rv1 = Kokkos::subdynrankview( mv_right , 0 , Kokkos::ALL() );
+    svector_type rv2 = Kokkos::subdynrankview( mv_right , 1 , Kokkos::ALL() );
+    svector_type rv3 = Kokkos::subdynrankview( mv_right , 2 , Kokkos::ALL() );
+
+    smultivector_type mv1 = Kokkos::subdynrankview( mv , std::make_pair( 1 , 998 ) ,
+                                                 std::make_pair( 2 , 5 ) );
+
+    smultivector_type mvr1 =
+      Kokkos::subdynrankview( mv_right ,
+                       std::make_pair( 1 , 998 ) ,
+                       std::make_pair( 2 , 5 ) );
+
+    const_svector_type cv1 = Kokkos::subdynrankview( mv , Kokkos::ALL(), 0 );
+    const_svector_type cv2 = Kokkos::subdynrankview( mv , Kokkos::ALL(), 1 );
+    const_svector_type cv3 = Kokkos::subdynrankview( mv , Kokkos::ALL(), 2 );
+
+    svector_type vr1 = Kokkos::subdynrankview( mv , Kokkos::ALL() , 0 );
+    svector_type vr2 = Kokkos::subdynrankview( mv , Kokkos::ALL() , 1 );
+    svector_type vr3 = Kokkos::subdynrankview( mv , Kokkos::ALL() , 2 );
+
+    const_svector_right_type cvr1 = Kokkos::subdynrankview( mv , Kokkos::ALL() , 0 );
+    const_svector_right_type cvr2 = Kokkos::subdynrankview( mv , Kokkos::ALL() , 1 );
+    const_svector_right_type cvr3 = Kokkos::subdynrankview( mv , Kokkos::ALL() , 2 );
+
+
+    ASSERT_TRUE( & v1[0] == & v1(0) );
+    ASSERT_TRUE( & v1[0] == & mv(0,0) );
+    ASSERT_TRUE( & v2[0] == & mv(0,1) );
+    ASSERT_TRUE( & v3[0] == & mv(0,2) );
+
+    ASSERT_TRUE( & cv1[0] == & mv(0,0) );
+    ASSERT_TRUE( & cv2[0] == & mv(0,1) );
+    ASSERT_TRUE( & cv3[0] == & mv(0,2) );
+
+    ASSERT_TRUE( & vr1[0] == & mv(0,0) );
+    ASSERT_TRUE( & vr2[0] == & mv(0,1) );
+    ASSERT_TRUE( & vr3[0] == & mv(0,2) );
+
+    ASSERT_TRUE( & cvr1[0] == & mv(0,0) );
+    ASSERT_TRUE( & cvr2[0] == & mv(0,1) );
+    ASSERT_TRUE( & cvr3[0] == & mv(0,2) );
+
+
+    ASSERT_TRUE( & mv1(0,0) == & mv( 1 , 2 ) );
+    ASSERT_TRUE( & mv1(1,1) == & mv( 2 , 3 ) );
+    ASSERT_TRUE( & mv1(3,2) == & mv( 4 , 4 ) );
+    ASSERT_TRUE( & mvr1(0,0) == & mv_right( 1 , 2 ) );
+    ASSERT_TRUE( & mvr1(1,1) == & mv_right( 2 , 3 ) );
+    ASSERT_TRUE( & mvr1(3,2) == & mv_right( 4 , 4 ) );
+
+    const_svector_type c_cv1( v1 );
+    typename svector_type::const_type c_cv2( v2 );
+    typename const_svector_type::const_type c_ccv2( v2 );
+
+
+    const_smultivector_type cmv( mv );
+    typename smultivector_type::const_type cmvX( cmv );
+    typename const_smultivector_type::const_type ccmvX( cmv );
+  }
+};
+
+} // namespace Test
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..852e3cc232ae193d1014e565c7b372ced50708b1
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp
@@ -0,0 +1,246 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_DYNAMICVIEW_HPP
+#define KOKKOS_TEST_DYNAMICVIEW_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_DynamicView.hpp>
+#include <impl/Kokkos_Timer.hpp>
+
+namespace Test {
+
+template< typename Scalar , class Space >
+struct TestDynamicView
+{
+  typedef typename Space::execution_space  execution_space ;
+  typedef typename Space::memory_space     memory_space ;
+
+  typedef Kokkos::Experimental::DynamicView<Scalar*,Space> view_type;
+
+  typedef double value_type;
+
+  static void run( unsigned arg_total_size )
+  {
+    // Test: Create DynamicView, initialize size (via resize), run through parallel_for to set values, check values (via parallel_reduce); resize values and repeat
+    //   Case 1: min_chunk_size is a power of 2
+    {
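+      // The first constructor argument is the minimum chunk size (a power of
+      // two here); the second is an upper bound on the total size.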
+      view_type da("da", 1024, arg_total_size );
+      ASSERT_EQ( da.size(), 0 );
+      // Init
+      unsigned da_size = arg_total_size / 8;
+      da.resize_serial(da_size);
+      ASSERT_EQ( da.size(), da_size );
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+      Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, da_size), KOKKOS_LAMBDA ( const int i )
+          {
+          da(i) = Scalar(i);
+          }
+          );
+
+      value_type result_sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space>(0, da_size), KOKKOS_LAMBDA ( const int i, value_type& partial_sum )
+          {
+          partial_sum += (value_type)da(i);
+          }
+          , result_sum
+          );
+
+      ASSERT_EQ(result_sum, (value_type)( da_size * (da_size - 1) / 2 ) );
+#endif
+#endif
+
+      // add 3x more entries i.e. 4x larger than previous size
+      // the first 1/4 should remain the same
+      unsigned da_resize = arg_total_size / 2;
+      da.resize_serial(da_resize);
+      ASSERT_EQ( da.size(), da_resize );
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+      Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(da_size, da_resize), KOKKOS_LAMBDA ( const int i )
+          {
+          da(i) = Scalar(i);
+          }
+          );
+
+      value_type new_result_sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space>(da_size, da_resize), KOKKOS_LAMBDA ( const int i, value_type& partial_sum )
+          {
+          partial_sum += (value_type)da(i);
+          }
+          , new_result_sum
+          );
+
+      ASSERT_EQ(new_result_sum+result_sum, (value_type)( da_resize * (da_resize - 1) / 2 ) );
+#endif
+#endif
+    } // end scope
+
+    // Test: Create DynamicView, initialize size (via resize), run through parallel_for to set values, check values (via parallel_reduce); resize values and repeat
+    //   Case 2: min_chunk_size is NOT a power of 2
+    {
+      view_type da("da", 1023, arg_total_size );
+      ASSERT_EQ( da.size(), 0 );
+      // Init
+      unsigned da_size = arg_total_size / 8;
+      da.resize_serial(da_size);
+      ASSERT_EQ( da.size(), da_size );
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+      Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, da_size), KOKKOS_LAMBDA ( const int i )
+          {
+          da(i) = Scalar(i);
+          }
+          );
+
+      value_type result_sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space>(0, da_size), KOKKOS_LAMBDA ( const int i, value_type& partial_sum )
+          {
+          partial_sum += (value_type)da(i);
+          }
+          , result_sum
+          );
+
+      ASSERT_EQ(result_sum, (value_type)( da_size * (da_size - 1) / 2 ) );
+#endif
+#endif
+
+      // add 3x more entries i.e. 4x larger than previous size
+      // the first 1/4 should remain the same
+      unsigned da_resize = arg_total_size / 2;
+      da.resize_serial(da_resize);
+      ASSERT_EQ( da.size(), da_resize );
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+      Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(da_size, da_resize), KOKKOS_LAMBDA ( const int i )
+          {
+          da(i) = Scalar(i);
+          }
+          );
+
+      value_type new_result_sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space>(da_size, da_resize), KOKKOS_LAMBDA ( const int i, value_type& partial_sum )
+          {
+          partial_sum += (value_type)da(i);
+          }
+          , new_result_sum
+          );
+
+      ASSERT_EQ(new_result_sum+result_sum, (value_type)( da_resize * (da_resize - 1) / 2 ) );
+#endif
+#endif
+    } // end scope
+
+    // Test: Create DynamicView, initialize size (via resize), run through parallel_for to set values, check values (via parallel_reduce); resize values and repeat
+    //   Case 3: resize reduces the size
+    {
+      view_type da("da", 1023, arg_total_size );
+      ASSERT_EQ( da.size(), 0 );
+      // Init
+      unsigned da_size = arg_total_size / 2;
+      da.resize_serial(da_size);
+      ASSERT_EQ( da.size(), da_size );
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+      Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, da_size), KOKKOS_LAMBDA ( const int i )
+          {
+          da(i) = Scalar(i);
+          }
+          );
+
+      value_type result_sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space>(0, da_size), KOKKOS_LAMBDA ( const int i, value_type& partial_sum )
+          {
+          partial_sum += (value_type)da(i);
+          }
+          , result_sum
+          );
+
+      ASSERT_EQ(result_sum, (value_type)( da_size * (da_size - 1) / 2 ) );
+#endif
+#endif
+
+      // remove the final 3/4 entries i.e. first 1/4 remain
+      unsigned da_resize = arg_total_size / 8;
+      da.resize_serial(da_resize);
+      ASSERT_EQ( da.size(), da_resize );
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+      Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, da_resize), KOKKOS_LAMBDA ( const int i )
+          {
+          da(i) = Scalar(i);
+          }
+          );
+
+      value_type new_result_sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space>(0, da_resize), KOKKOS_LAMBDA ( const int i, value_type& partial_sum )
+          {
+          partial_sum += (value_type)da(i);
+          }
+          , new_result_sum
+          );
+
+      ASSERT_EQ(new_result_sum, (value_type)( da_resize * (da_resize - 1) / 2 ) );
+#endif
+#endif
+    } // end scope
+
+  }
+};
+
+} // namespace Test
+
+#endif /* #ifndef KOKKOS_TEST_DYNAMICVIEW_HPP */
+
diff --git a/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp b/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2b2cfdfc9a0ffff2468b233e4a7c0b6d406cbf89
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestErrorReporter.hpp
@@ -0,0 +1,228 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_EXPERIMENTAL_ERROR_REPORTER_HPP
+#define KOKKOS_TEST_EXPERIMENTAL_ERROR_REPORTER_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+// Just save the data in the report.  Informative text goes in operator<<(..).
+template <typename DataType1, typename DataType2, typename DataType3>
+struct ThreeValReport
+{
+  DataType1 m_data1;
+  DataType2 m_data2;
+  DataType3 m_data3;
+
+};
+
+template <typename DataType1, typename DataType2, typename DataType3>
+std::ostream &operator<<(std::ostream & os, const ThreeValReport<DataType1, DataType2, DataType3> &val)
+{
+  return os << "{" << val.m_data1 << " " << val.m_data2 << " " << val.m_data3 << "}";
+}
+
+template<typename ReportType>
+void checkReportersAndReportsAgree(const std::vector<int> &reporters,
+                                   const std::vector<ReportType> &reports)
+{
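+  // Only odd work indices report, and each report's first field echoes the
+  // reporter's work index.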
+  for (size_t i = 0; i < reports.size(); ++i) {
+    EXPECT_EQ(1, reporters[i] % 2);
+    EXPECT_EQ(reporters[i], reports[i].m_data1);
+  }
+}
+
+
+template <typename DeviceType>
+struct ErrorReporterDriverBase {
+
+  typedef ThreeValReport<int, int, double>                                      report_type;
+  typedef Kokkos::Experimental::ErrorReporter<report_type, DeviceType>  error_reporter_type;
+  error_reporter_type m_errorReporter;
+
+  ErrorReporterDriverBase(int reporter_capacity, int test_size)
+    : m_errorReporter(reporter_capacity)  {  }
+
+  KOKKOS_INLINE_FUNCTION bool error_condition(const int work_idx) const { return (work_idx % 2 != 0); }
+
+  void check_expectations(int reporter_capacity, int test_size)
+  {
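+    // The odd-indexed half of the work items attempt to report; only up to
+    // reporter_capacity of those attempts can actually be recorded.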
+    int num_reported = m_errorReporter.getNumReports();
+    int num_attempts = m_errorReporter.getNumReportAttempts();
+
+    int expected_num_reports = std::min(reporter_capacity, test_size / 2);
+    EXPECT_EQ(expected_num_reports, num_reported);
+    EXPECT_EQ(test_size / 2, num_attempts);
+
+    bool expect_full = (reporter_capacity <= (test_size / 2));
+    bool reported_full = m_errorReporter.full();
+    EXPECT_EQ(expect_full, reported_full);
+  }
+};
+
+template <typename ErrorReporterDriverType>
+void TestErrorReporter()
+{
+  typedef ErrorReporterDriverType tester_type;
+  std::vector<int> reporters;
+  std::vector<typename tester_type::report_type> reports;
+
+  tester_type test1(100, 10);
+  test1.m_errorReporter.getReports(reporters, reports);
+  checkReportersAndReportsAgree(reporters, reports);
+
+  tester_type test2(10, 100);
+  test2.m_errorReporter.getReports(reporters, reports);
+  checkReportersAndReportsAgree(reporters, reports);
+
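+  // Also exercise the View-based overload of getReports, then copy the
+  // results into vectors to reuse the same consistency check.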
+  typename Kokkos::View<int*, typename ErrorReporterDriverType::execution_space >::HostMirror view_reporters;
+  typename Kokkos::View<typename tester_type::report_type*, typename ErrorReporterDriverType::execution_space >::HostMirror
+     view_reports;
+  test2.m_errorReporter.getReports(view_reporters, view_reports);
+
+  int num_reports = view_reporters.extent(0);
+  reporters.clear();
+  reports.clear();
+  reporters.reserve(num_reports);
+  reports.reserve(num_reports);
+
+  for (int i = 0; i < num_reports; ++i) {
+    reporters.push_back(view_reporters(i));
+    reports.push_back(view_reports(i));
+  }
+  checkReportersAndReportsAgree(reporters, reports);
+
+}
+
+
+template <typename DeviceType>
+struct ErrorReporterDriver : public ErrorReporterDriverBase<DeviceType>
+{
+  typedef ErrorReporterDriverBase<DeviceType>                             driver_base;
+  typedef typename driver_base::error_reporter_type::execution_space  execution_space;
+
+  ErrorReporterDriver(int reporter_capacity, int test_size)
+    : driver_base(reporter_capacity, test_size)
+  {
+    execute(reporter_capacity, test_size);
+
+    // Test that clear() and resize() work across memory spaces.
+    if (reporter_capacity < test_size) {
+      driver_base::m_errorReporter.clear();
+      driver_base::m_errorReporter.resize(test_size);
+      execute(test_size, test_size);
+    }
+  }
+
+  void execute(int reporter_capacity, int test_size)
+  {
+    Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0,test_size), *this);
+    driver_base::check_expectations(reporter_capacity, test_size);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int work_idx) const
+  {
+    if (driver_base::error_condition(work_idx)) {
+      double val = M_PI * static_cast<double>(work_idx);
+      typename driver_base::report_type report = {work_idx, -2*work_idx, val};
+      driver_base::m_errorReporter.add_report(work_idx, report);
+    }
+  }
+};
+
+#if defined(KOKKOS_CLASS_LAMBDA)
+template <typename DeviceType>
+struct ErrorReporterDriverUseLambda : public ErrorReporterDriverBase<DeviceType>
+{
+
+  typedef ErrorReporterDriverBase<DeviceType>                             driver_base;
+  typedef typename driver_base::error_reporter_type::execution_space  execution_space;
+
+  ErrorReporterDriverUseLambda(int reporter_capacity, int test_size)
+    : driver_base(reporter_capacity, test_size)
+  {
+    Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0,test_size), KOKKOS_CLASS_LAMBDA (const int work_idx) {
+      if (driver_base::error_condition(work_idx)) {
+        double val = M_PI * static_cast<double>(work_idx);
+        typename driver_base::report_type report = {work_idx, -2*work_idx, val};
+        driver_base::m_errorReporter.add_report(work_idx, report);
+      }
+    });
+    driver_base::check_expectations(reporter_capacity, test_size);
+  }
+
+};
+#endif
+
+
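+// Same checks as ErrorReporterDriver, but reports are added from a plain
+// '#pragma omp parallel for' loop instead of Kokkos::parallel_for, exercising
+// reporting from native OpenMP threads.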
+#ifdef KOKKOS_ENABLE_OPENMP
+struct ErrorReporterDriverNativeOpenMP : public ErrorReporterDriverBase<Kokkos::OpenMP>
+{
+  typedef ErrorReporterDriverBase<Kokkos::OpenMP>  driver_base;
+  typedef typename driver_base::error_reporter_type::execution_space  execution_space;
+
+  ErrorReporterDriverNativeOpenMP(int reporter_capacity, int test_size)
+    : driver_base(reporter_capacity, test_size)
+  {
+#pragma omp parallel for
+    for(int work_idx = 0; work_idx < test_size; ++work_idx)
+    {
+      if (driver_base::error_condition(work_idx)) {
+        double val = M_PI * static_cast<double>(work_idx);
+        typename driver_base::report_type report = {work_idx, -2*work_idx, val};
+        driver_base::m_errorReporter.add_report(work_idx, report);
+      }
+    };
+    driver_base::check_expectations(reporter_capacity, test_size);
+  }
+};
+#endif
+
+} // namespace Test
+#endif // #ifndef KOKKOS_TEST_ERROR_REPORTING_HPP
+
diff --git a/packages/kokkos/containers/unit_tests/TestOpenMP.cpp b/packages/kokkos/containers/unit_tests/TestOpenMP.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d25a3da44b7aff34f26082273fa79d899f19b9dd
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestOpenMP.cpp
@@ -0,0 +1,212 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_OPENMP
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_Bitset.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+#include <Kokkos_Vector.hpp>
+
+//----------------------------------------------------------------------------
+#include <TestBitset.hpp>
+#include <TestUnorderedMap.hpp>
+#include <TestStaticCrsGraph.hpp>
+#include <TestVector.hpp>
+#include <TestDualView.hpp>
+#include <TestDynamicView.hpp>
+
+#include <Kokkos_DynRankView.hpp>
+#include <TestDynViewAPI.hpp>
+
+#include <TestScatterView.hpp>
+
+#include <Kokkos_ErrorReporter.hpp>
+#include <TestErrorReporter.hpp>
+
+#include <TestViewCtorPropEmbeddedDim.hpp>
+
+#include <iomanip>
+
+namespace Test {
+
+class openmp : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+  }
+
+  static void TearDownTestCase()
+  {
+  }
+};
+
+TEST_F( openmp, dyn_view_api) {
+  TestDynViewAPI< double , Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, viewctorprop_embedded_dim ) {
+  TestViewCtorProp_EmbeddedDim< Kokkos::OpenMP >::test_vcpt( 2, 3 );
+}
+
+TEST_F( openmp, bitset )
+{
+  test_bitset<Kokkos::OpenMP>();
+}
+
+TEST_F( openmp , staticcrsgraph )
+{
+  TestStaticCrsGraph::run_test_graph< Kokkos::OpenMP >();
+  TestStaticCrsGraph::run_test_graph2< Kokkos::OpenMP >();
+  TestStaticCrsGraph::run_test_graph3< Kokkos::OpenMP >(1, 0);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::OpenMP >(1, 1000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::OpenMP >(1, 10000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::OpenMP >(1, 100000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::OpenMP >(3, 0);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::OpenMP >(3, 1000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::OpenMP >(3, 10000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::OpenMP >(3, 100000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::OpenMP >(75, 0);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::OpenMP >(75, 1000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::OpenMP >(75, 10000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::OpenMP >(75, 100000);
+}
+
+#define OPENMP_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat, near )                                \
+  TEST_F( openmp, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) {   \
+    for (int i=0; i<repeat; ++i)                                                                                \
+      test_insert<Kokkos::OpenMP>(num_nodes,num_inserts,num_duplicates, near);                                   \
+  }
+
+#define OPENMP_FAILED_INSERT_TEST( num_nodes, repeat )                         \
+  TEST_F( openmp, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) {     \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_failed_insert<Kokkos::OpenMP>(num_nodes);                             \
+  }
+
+#define OPENMP_ASSIGNEMENT_TEST( num_nodes, repeat )                             \
+  TEST_F( openmp, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_assignement_operators<Kokkos::OpenMP>(num_nodes);                     \
+  }
+
+#define OPENMP_DEEP_COPY( num_nodes, repeat )                             \
+  TEST_F( openmp, UnorderedMap_deep_copy##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_deep_copy<Kokkos::OpenMP>(num_nodes);                     \
+  }
+
+#define OPENMP_VECTOR_COMBINE_TEST( size )                             \
+  TEST_F( openmp, vector_combination##size##x) {       \
+      test_vector_combinations<int,Kokkos::OpenMP>(size);                     \
+  }
+
+#define OPENMP_DUALVIEW_COMBINE_TEST( size )                             \
+  TEST_F( openmp, dualview_combination##size##x) {       \
+      test_dualview_combinations<int,Kokkos::OpenMP>(size);                     \
+  }
+
+#define OPENMP_SCATTERVIEW_TEST( size )             \
+  TEST_F( openmp, scatterview_##size##x) {                      \
+    test_scatter_view<Kokkos::OpenMP>(size);               \
+  }
+
+OPENMP_INSERT_TEST(close, 100000, 90000, 100, 500, true)
+OPENMP_INSERT_TEST(far, 100000, 90000, 100, 500, false)
+OPENMP_FAILED_INSERT_TEST( 10000, 1000 )
+OPENMP_DEEP_COPY( 10000, 1 )
+
+OPENMP_VECTOR_COMBINE_TEST( 10 )
+OPENMP_VECTOR_COMBINE_TEST( 3057 )
+OPENMP_DUALVIEW_COMBINE_TEST( 10 )
+
+OPENMP_SCATTERVIEW_TEST( 10 )
+
+OPENMP_SCATTERVIEW_TEST( 1000000 )
+
+#undef OPENMP_INSERT_TEST
+#undef OPENMP_FAILED_INSERT_TEST
+#undef OPENMP_ASSIGNEMENT_TEST
+#undef OPENMP_DEEP_COPY
+#undef OPENMP_VECTOR_COMBINE_TEST
+#undef OPENMP_DUALVIEW_COMBINE_TEST
+#undef OPENMP_SCATTERVIEW_TEST
+
+
+TEST_F( openmp , dynamic_view )
+{
+  typedef TestDynamicView< double , Kokkos::OpenMP >
+    TestDynView ;
+
+  for ( int i = 0 ; i < 10 ; ++i ) {
+    TestDynView::run( 100000 + 100 * i );
+  }
+}
+
+#if defined(KOKKOS_CLASS_LAMBDA)
+TEST_F(openmp, ErrorReporterViaLambda)
+{
+  TestErrorReporter<ErrorReporterDriverUseLambda<Kokkos::OpenMP>>();
+}
+#endif
+
+TEST_F(openmp, ErrorReporter)
+{
+  TestErrorReporter<ErrorReporterDriver<Kokkos::OpenMP>>();
+}
+
+TEST_F(openmp, ErrorReporterNativeOpenMP)
+{
+  TestErrorReporter<ErrorReporterDriverNativeOpenMP>();
+}
+
+} // namespace Test
+
+#else
+void KOKKOS_CONTAINERS_UNIT_TESTS_TESTOPENMP_PREVENT_EMPTY_LINK_ERROR() {}
+#endif
+
diff --git a/packages/kokkos/containers/unit_tests/TestROCm.cpp b/packages/kokkos/containers/unit_tests/TestROCm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aaf7a89560d2fa752762d86cf4e9abc267bdfddb
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestROCm.cpp
@@ -0,0 +1,263 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_ROCM
+
+#include <iostream>
+#include <iomanip>
+#include <cstdint>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_Bitset.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+#include <Kokkos_Vector.hpp>
+
+#include <TestBitset.hpp>
+#include <TestUnorderedMap.hpp>
+#include <TestStaticCrsGraph.hpp>
+#include <TestVector.hpp>
+#include <TestDualView.hpp>
+#include <TestDynamicView.hpp>
+
+#include <Kokkos_DynRankView.hpp>
+#include <TestDynViewAPI.hpp>
+
+#include <Kokkos_ErrorReporter.hpp>
+#include <TestErrorReporter.hpp>
+
+#include <TestViewCtorPropEmbeddedDim.hpp>
+
+//----------------------------------------------------------------------------
+
+
+
+namespace Test {
+
+class rocm : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+    Kokkos::HostSpace::execution_space::initialize();
+    Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
+  }
+  static void TearDownTestCase()
+  {
+    Kokkos::Experimental::ROCm::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+};
+
+#if !defined(KOKKOS_ENABLE_ROCM)
+//issue 964
+TEST_F( rocm , dyn_view_api) {
+  TestDynViewAPI< double , Kokkos::Experimental::ROCm >();
+}
+#endif 
+
+TEST_F( rocm, viewctorprop_embedded_dim ) {
+  TestViewCtorProp_EmbeddedDim< Kokkos::Experimental::ROCm >::test_vcpt( 2, 3 );
+}
+
+TEST_F( rocm , staticcrsgraph )
+{
+  TestStaticCrsGraph::run_test_graph< Kokkos::Experimental::ROCm >();
+  TestStaticCrsGraph::run_test_graph2< Kokkos::Experimental::ROCm >();
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 0);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 1000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 10000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 100000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 0);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 1000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 10000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 100000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 0);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 1000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 10000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 100000);
+}
+
+
+#if !defined(KOKKOS_ENABLE_ROCM)
+// issue 1089
+// same as 130203 (MemPool, static member function link issue)
+void rocm_test_insert_close(  uint32_t num_nodes
+                            , uint32_t num_inserts
+                            , uint32_t num_duplicates
+                           )
+{
+  test_insert< Kokkos::Experimental::ROCm >( num_nodes, num_inserts, num_duplicates, true);
+}
+
+// hcc link error: "Referencing function in another module!"
+void rocm_test_insert_far(  uint32_t num_nodes
+                          , uint32_t num_inserts
+                          , uint32_t num_duplicates
+                         )
+{
+  test_insert< Kokkos::Experimental::ROCm >( num_nodes, num_inserts, num_duplicates, false);
+}
+
+void rocm_test_failed_insert(  uint32_t num_nodes )
+{
+  test_failed_insert< Kokkos::Experimental::ROCm >( num_nodes );
+}
+
+void rocm_test_deep_copy(  uint32_t num_nodes )
+{
+  test_deep_copy< Kokkos::Experimental::ROCm >( num_nodes );
+}
+
+void rocm_test_vector_combinations(unsigned int size)
+{
+  test_vector_combinations<int,Kokkos::Experimental::ROCm>(size);
+}
+
+void rocm_test_dualview_combinations(unsigned int size)
+{
+  test_dualview_combinations<int,Kokkos::Experimental::ROCm>(size);
+}
+
+void rocm_test_bitset()
+{
+  test_bitset<Kokkos::Experimental::ROCm>();
+}
+
+
+
+/*TEST_F( rocm, bitset )
+{
+  rocm_test_bitset();
+}*/
+
+#define ROCM_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat )                                \
+  TEST_F( rocm, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) {   \
+    for (int i=0; i<repeat; ++i)                                                                                \
+      rocm_test_insert_##name(num_nodes,num_inserts,num_duplicates);                                            \
+  }
+
+#define ROCM_FAILED_INSERT_TEST( num_nodes, repeat )                           \
+  TEST_F( rocm, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      rocm_test_failed_insert(num_nodes);                                      \
+  }
+
+#define ROCM_ASSIGNEMENT_TEST( num_nodes, repeat )                               \
+  TEST_F( rocm, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) {  \
+    for (int i=0; i<repeat; ++i)                                                 \
+      rocm_test_assignment_operators(num_nodes);                                 \
+  }
+
+#define ROCM_DEEP_COPY( num_nodes, repeat )                             \
+  TEST_F( rocm, UnorderedMap_deep_copy##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      rocm_test_deep_copy(num_nodes);                     \
+  }
+
+#define ROCM_VECTOR_COMBINE_TEST( size )                             \
+  TEST_F( rocm, vector_combination##size##x) {       \
+      rocm_test_vector_combinations(size);                     \
+  }
+
+#define ROCM_DUALVIEW_COMBINE_TEST( size )                             \
+  TEST_F( rocm, dualview_combination##size##x) {       \
+      rocm_test_dualview_combinations(size);                     \
+  }
+
+//ROCM_DUALVIEW_COMBINE_TEST( 10 )
+//ROCM_VECTOR_COMBINE_TEST( 10 )
+//ROCM_VECTOR_COMBINE_TEST( 3057 )
+
+
+//ROCM_INSERT_TEST(close,               100000, 90000, 100, 500)
+//ROCM_INSERT_TEST(far,                 100000, 90000, 100, 500)
+//ROCM_DEEP_COPY( 10000, 1 )
+//ROCM_FAILED_INSERT_TEST( 10000, 1000 )
+
+
+#undef ROCM_INSERT_TEST
+#undef ROCM_FAILED_INSERT_TEST
+#undef ROCM_ASSIGNEMENT_TEST
+#undef ROCM_DEEP_COPY
+#undef ROCM_VECTOR_COMBINE_TEST
+#undef ROCM_DUALVIEW_COMBINE_TEST
+
+
+#endif
+#if !defined(KOKKOS_ENABLE_ROCM)
+// static member function issue
+TEST_F( rocm , dynamic_view )
+{
+//  typedef TestDynamicView< double , Kokkos::ROCmUVMSpace >
+  typedef TestDynamicView< double , Kokkos::Experimental::ROCmSpace >
+    TestDynView ;
+
+  for ( int i = 0 ; i < 10 ; ++i ) {
+    TestDynView::run( 100000 + 100 * i );
+  }
+}
+#endif
+
+
+#if defined(KOKKOS_CLASS_LAMBDA)
+TEST_F(rocm, ErrorReporterViaLambda)
+{
+  TestErrorReporter<ErrorReporterDriverUseLambda<Kokkos::Experimental::ROCm>>();
+}
+#endif
+
+TEST_F(rocm, ErrorReporter)
+{
+  TestErrorReporter<ErrorReporterDriver<Kokkos::Experimental::ROCm>>();
+}
+
+} // namespace Test
+
+#else
+void KOKKOS_CONTAINERS_UNIT_TESTS_TESTROCM_PREVENT_EMPTY_LINK_ERROR() {}
+#endif  /* #ifdef KOKKOS_ENABLE_ROCM */
+
diff --git a/packages/kokkos/containers/unit_tests/TestScatterView.hpp b/packages/kokkos/containers/unit_tests/TestScatterView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ec7e8f4303e66742fba986cccbbd11401d68bc72
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestScatterView.hpp
@@ -0,0 +1,156 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_SCATTER_VIEW_HPP
+#define KOKKOS_TEST_SCATTER_VIEW_HPP
+
+#include <Kokkos_ScatterView.hpp>
+
+#include <cmath>          // std::fabs used in the result checks below
+#include <gtest/gtest.h>  // EXPECT_TRUE
+
+namespace Test {
+
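+// Fills an n x 3 view through a ScatterView in two passes (10 contributions of
+// 4.2, 2.0 and 1.0 per row per pass) and checks the contributed sums of 84, 40
+// and 20 per row.  Also instantiates a persistent ScatterView and contributes
+// into its subview().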
+template <typename ExecSpace, typename Layout, int duplication, int contribution>
+void test_scatter_view_config(int n)
+{
+  Kokkos::View<double *[3], Layout, ExecSpace> original_view("original_view", n);
+  {
+    auto scatter_view = Kokkos::Experimental::create_scatter_view
+      < Kokkos::Experimental::ScatterSum
+      , duplication
+      , contribution
+      > (original_view);
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+    auto policy = Kokkos::RangePolicy<ExecSpace, int>(0, n);
+    auto f = KOKKOS_LAMBDA(int i) {
+      auto scatter_access = scatter_view.access();
+      auto scatter_access_atomic = scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
+      for (int j = 0; j < 10; ++j) {
+        auto k = (i + j) % n;
+        scatter_access(k, 0) += 4.2;
+        scatter_access_atomic(k, 1) += 2.0;
+        scatter_access(k, 2) += 1.0;
+      }
+    };
+    Kokkos::parallel_for(policy, f, "scatter_view_test");
+#endif
+    Kokkos::Experimental::contribute(original_view, scatter_view);
+    scatter_view.reset_except(original_view);
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+    Kokkos::parallel_for(policy, f, "scatter_view_test");
+#endif
+    Kokkos::Experimental::contribute(original_view, scatter_view);
+  }
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+  auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), original_view);
+  for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
+    auto val0 = host_view(i, 0);
+    auto val1 = host_view(i, 1);
+    auto val2 = host_view(i, 2);
+    EXPECT_TRUE(std::fabs((val0 - 84.0) / 84.0) < 1e-15);
+    EXPECT_TRUE(std::fabs((val1 - 40.0) / 40.0) < 1e-15);
+    EXPECT_TRUE(std::fabs((val2 - 20.0) / 20.0) < 1e-15);
+  }
+#endif
+  {
+    Kokkos::Experimental::ScatterView
+      < double*[3]
+      , Layout
+      , ExecSpace
+      , Kokkos::Experimental::ScatterSum
+      , duplication
+      , contribution
+      >
+      persistent_view("persistent", n);
+    auto result_view = persistent_view.subview();
+    contribute(result_view, persistent_view);
+  }
+}
+
+template <typename ExecSpace>
+struct TestDuplicatedScatterView {
+  TestDuplicatedScatterView(int n) {
+    test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
+      Kokkos::Experimental::ScatterDuplicated,
+      Kokkos::Experimental::ScatterNonAtomic>(n);
+    test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
+      Kokkos::Experimental::ScatterDuplicated,
+      Kokkos::Experimental::ScatterAtomic>(n);
+  }
+};
+
+#ifdef KOKKOS_ENABLE_CUDA
+// disable duplicated instantiation with CUDA until
+// UniqueToken can support it
+template <>
+struct TestDuplicatedScatterView<Kokkos::Cuda> {
+  TestDuplicatedScatterView(int) {
+  }
+};
+#endif
+
+template <typename ExecSpace>
+void test_scatter_view(int n)
+{
+  // all of these configurations should compile okay, but only some of them are
+  // correct and/or sensible in terms of memory use
+  Kokkos::Experimental::UniqueToken<ExecSpace> unique_token{ExecSpace()};
+
+  // no atomics or duplication is only sensible if the execution space
+  // is running essentially in serial (doesn't have to be Serial though,
+  // we also test OpenMP with one thread: LAMMPS cares about that)
+  if (unique_token.size() == 1) {
+    test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
+      Kokkos::Experimental::ScatterNonDuplicated,
+      Kokkos::Experimental::ScatterNonAtomic>(n);
+  }
+  test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
+    Kokkos::Experimental::ScatterNonDuplicated,
+    Kokkos::Experimental::ScatterAtomic>(n);
+
+  TestDuplicatedScatterView<ExecSpace> duptest(n);
+}
+
+} // namespace Test
+
+#endif // KOKKOS_TEST_SCATTER_VIEW_HPP
+
+
diff --git a/packages/kokkos/containers/unit_tests/TestSerial.cpp b/packages/kokkos/containers/unit_tests/TestSerial.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8f9b201119cfd8b57ab74d383c5a981d1d873077
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestSerial.cpp
@@ -0,0 +1,203 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_SERIAL
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_Bitset.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+#include <Kokkos_Vector.hpp>
+
+#include <TestBitset.hpp>
+#include <TestUnorderedMap.hpp>
+#include <TestStaticCrsGraph.hpp>
+#include <TestVector.hpp>
+#include <TestDualView.hpp>
+#include <TestDynamicView.hpp>
+#include <TestScatterView.hpp>
+
+#include <iomanip>
+
+#include <Kokkos_DynRankView.hpp>
+#include <TestDynViewAPI.hpp>
+
+#include <Kokkos_ErrorReporter.hpp>
+#include <TestErrorReporter.hpp>
+
+#include <TestViewCtorPropEmbeddedDim.hpp>
+
+namespace Test {
+
+class serial : public ::testing::Test {
+protected:
+  static void SetUpTestCase () {
+    std::cout << std::setprecision(5) << std::scientific;
+  }
+
+  static void TearDownTestCase () {
+  }
+};
+
+TEST_F( serial, dyn_view_api) {
+  TestDynViewAPI< double , Kokkos::Serial >();
+}
+
+TEST_F( serial, viewctorprop_embedded_dim ) {
+  TestViewCtorProp_EmbeddedDim< Kokkos::Serial >::test_vcpt( 2, 3 );
+}
+
+TEST_F( serial , staticcrsgraph )
+{
+  TestStaticCrsGraph::run_test_graph< Kokkos::Serial >();
+  TestStaticCrsGraph::run_test_graph2< Kokkos::Serial >();
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Serial >(1, 0);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Serial >(1, 1000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Serial >(1, 10000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Serial >(1, 100000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Serial >(3, 0);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Serial >(3, 1000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Serial >(3, 10000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Serial >(3, 100000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Serial >(75, 0);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Serial >(75, 1000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Serial >(75, 10000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Serial >(75, 100000);
+}
+
+TEST_F( serial, bitset )
+{
+  test_bitset<Kokkos::Serial> ();
+}
+
+#define SERIAL_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat, near ) \
+  TEST_F( serial, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) { \
+    for (int i=0; i<repeat; ++i)                                        \
+      test_insert<Kokkos::Serial> (num_nodes, num_inserts, num_duplicates, near); \
+  }
+
+#define SERIAL_FAILED_INSERT_TEST( num_nodes, repeat )                  \
+  TEST_F( serial, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) { \
+    for (int i=0; i<repeat; ++i)                                        \
+      test_failed_insert<Kokkos::Serial> (num_nodes);                   \
+  }
+
+#define SERIAL_ASSIGNEMENT_TEST( num_nodes, repeat )                    \
+  TEST_F( serial, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) { \
+    for (int i=0; i<repeat; ++i)                                        \
+      test_assignement_operators<Kokkos::Serial> (num_nodes);           \
+  }
+
+#define SERIAL_DEEP_COPY( num_nodes, repeat )                           \
+  TEST_F( serial, UnorderedMap_deep_copy##num_nodes##_##repeat##x) {    \
+    for (int i=0; i<repeat; ++i)                                        \
+      test_deep_copy<Kokkos::Serial> (num_nodes);                       \
+  }
+
+#define SERIAL_VECTOR_COMBINE_TEST( size )             \
+  TEST_F( serial, vector_combination##size##x) {                        \
+    test_vector_combinations<int,Kokkos::Serial>(size);                 \
+  }
+
+#define SERIAL_DUALVIEW_COMBINE_TEST( size )             \
+  TEST_F( serial, dualview_combination##size##x) {                      \
+    test_dualview_combinations<int,Kokkos::Serial>(size);               \
+  }
+
+#define SERIAL_SCATTERVIEW_TEST( size )             \
+  TEST_F( serial, scatterview_##size##x) {                      \
+    test_scatter_view<Kokkos::Serial>(size);               \
+  }
+
+SERIAL_INSERT_TEST(close, 100000, 90000, 100, 500, true)
+SERIAL_INSERT_TEST(far, 100000, 90000, 100, 500, false)
+SERIAL_FAILED_INSERT_TEST( 10000, 1000 )
+SERIAL_DEEP_COPY( 10000, 1 )
+
+SERIAL_VECTOR_COMBINE_TEST( 10 )
+SERIAL_VECTOR_COMBINE_TEST( 3057 )
+SERIAL_DUALVIEW_COMBINE_TEST( 10 )
+
+SERIAL_SCATTERVIEW_TEST( 10 )
+
+SERIAL_SCATTERVIEW_TEST( 1000000 )
+
+#undef SERIAL_INSERT_TEST
+#undef SERIAL_FAILED_INSERT_TEST
+#undef SERIAL_ASSIGNEMENT_TEST
+#undef SERIAL_DEEP_COPY
+#undef SERIAL_VECTOR_COMBINE_TEST
+#undef SERIAL_DUALVIEW_COMBINE_TEST
+#undef SERIAL_SCATTERVIEW_TEST
+
+TEST_F( serial , dynamic_view )
+{
+  typedef TestDynamicView< double , Kokkos::Serial >
+    TestDynView ;
+
+  for ( int i = 0 ; i < 10 ; ++i ) {
+    TestDynView::run( 100000 + 100 * i );
+  }
+}
+
+#if defined(KOKKOS_CLASS_LAMBDA)
+TEST_F(serial, ErrorReporterViaLambda)
+{
+  TestErrorReporter<ErrorReporterDriverUseLambda<Kokkos::Serial>>();
+}
+#endif
+
+TEST_F(serial, ErrorReporter)
+{
+  TestErrorReporter<ErrorReporterDriver<Kokkos::Serial>>();
+}
+
+
+} // namespace Test
+
+#else
+void KOKKOS_CONTAINERS_UNIT_TESTS_TESTSERIAL_PREVENT_EMPTY_LINK_ERROR() {}
+#endif // KOKKOS_ENABLE_SERIAL
+
diff --git a/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..95a2f3a2572fd6b4cdac5b13eb85389deb987aa3
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
@@ -0,0 +1,197 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include <Kokkos_StaticCrsGraph.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace TestStaticCrsGraph {
+
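+// Builds a 1000-row graph from a std::vector-of-vector adjacency list, creates
+// a host mirror, and checks row_map, entries, and the rowConst() row-view
+// accessors against the input.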
+template< class Space >
+void run_test_graph()
+{
+  typedef Kokkos::StaticCrsGraph< unsigned , Space > dView ;
+  typedef typename dView::HostMirror hView ;
+
+  const unsigned LENGTH = 1000 ;
+  dView dx ;
+  hView hx ;
+
+  std::vector< std::vector< int > > graph( LENGTH );
+
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    graph[i].reserve(8);
+    for ( size_t j = 0 ; j < 8 ; ++j ) {
+      graph[i].push_back( i + j * 3 );
+    }
+  }
+
+  dx = Kokkos::create_staticcrsgraph<dView>( "dx" , graph );
+  hx = Kokkos::create_mirror( dx );
+
+  ASSERT_EQ( hx.row_map.extent(0) - 1 , LENGTH );
+
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    const size_t begin = hx.row_map[i];
+    const size_t n = hx.row_map[i+1] - begin ;
+    ASSERT_EQ( n , graph[i].size() );
+    for ( size_t j = 0 ; j < n ; ++j ) {
+      ASSERT_EQ( (int) hx.entries( j + begin ) , graph[i][j] );
+    }
+  }
+
+  // Test row view access
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    auto rowView = hx.rowConst(i);
+    ASSERT_EQ( rowView.length, graph[i].size() );
+    for ( size_t j = 0 ; j < rowView.length ; ++j ) {
+      ASSERT_EQ( rowView.colidx( j ) , graph[i][j] );
+      ASSERT_EQ( rowView( j )        , graph[i][j] );
+    }
+  }
+}
+
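+// Creates a graph from per-row sizes with unsigned[3] entries, fills the
+// entries on one host mirror, copies host -> device -> second host mirror, and
+// verifies the extents and round-tripped values.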
+template< class Space >
+void run_test_graph2()
+{
+  typedef Kokkos::StaticCrsGraph< unsigned[3] , Space > dView ;
+  typedef typename dView::HostMirror hView ;
+
+  const unsigned LENGTH = 10 ;
+
+  std::vector< size_t > sizes( LENGTH );
+
+  size_t total_length = 0 ;
+
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    total_length += ( sizes[i] = 6 + i % 4 );
+  }
+
+  dView dx = Kokkos::create_staticcrsgraph<dView>( "test" , sizes );
+  hView hx = Kokkos::create_mirror( dx );
+  hView mx = Kokkos::create_mirror( dx );
+
+  ASSERT_EQ( (size_t) dx.row_map.extent(0) , (size_t) LENGTH + 1 );
+  ASSERT_EQ( (size_t) hx.row_map.extent(0) , (size_t) LENGTH + 1 );
+  ASSERT_EQ( (size_t) mx.row_map.extent(0) , (size_t) LENGTH + 1 );
+
+  ASSERT_EQ( (size_t) dx.entries.extent(0) , (size_t) total_length );
+  ASSERT_EQ( (size_t) hx.entries.extent(0) , (size_t) total_length );
+  ASSERT_EQ( (size_t) mx.entries.extent(0) , (size_t) total_length );
+
+  ASSERT_EQ( (size_t) dx.entries.extent(1) , (size_t) 3 );
+  ASSERT_EQ( (size_t) hx.entries.extent(1) , (size_t) 3 );
+  ASSERT_EQ( (size_t) mx.entries.extent(1) , (size_t) 3 );
+
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    const size_t entry_begin = hx.row_map[i];
+    const size_t entry_end   = hx.row_map[i+1];
+    for ( size_t j = entry_begin ; j < entry_end ; ++j ) {
+      hx.entries(j,0) = j + 1 ;
+      hx.entries(j,1) = j + 2 ;
+      hx.entries(j,2) = j + 3 ;
+    }
+  }
+
+  Kokkos::deep_copy( dx.entries , hx.entries );
+  Kokkos::deep_copy( mx.entries , dx.entries );
+
+  ASSERT_EQ( mx.row_map.extent(0) , (size_t) LENGTH + 1 );
+
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    const size_t entry_begin = mx.row_map[i];
+    const size_t entry_end   = mx.row_map[i+1];
+    ASSERT_EQ( ( entry_end - entry_begin ) , sizes[i] );
+    for ( size_t j = entry_begin ; j < entry_end ; ++j ) {
+      ASSERT_EQ( (size_t) mx.entries( j , 0 ) , ( j + 1 ) );
+      ASSERT_EQ( (size_t) mx.entries( j , 1 ) , ( j + 2 ) );
+      ASSERT_EQ( (size_t) mx.entries( j , 2 ) , ( j + 3 ) );
+    }
+  }
+}
+
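+// Builds a 2000-row graph with randomized row sizes (rows 1 and 1998 forced to
+// N), partitions the rows into B blocks, and asserts that no multi-row block
+// carries more than twice the average per-block work.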
+template< class Space >
+void run_test_graph3(size_t B, size_t N)
+{
+  srand(10310);
+
+  typedef Kokkos::StaticCrsGraph< int , Space > dView ;
+  typedef typename dView::HostMirror hView ;
+
+  const unsigned LENGTH = 2000 ;
+
+  std::vector< size_t > sizes( LENGTH );
+
+  size_t total_length = 0 ;
+
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    sizes[i] = rand()%1000;
+  }
+
+  sizes[1] = N;
+  sizes[1998] = N;
+
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    total_length += sizes[i];
+  }
+
+  int C = 0;
+  dView dx = Kokkos::create_staticcrsgraph<dView>( "test" , sizes );
+  dx.create_block_partitioning(B,C);
+  hView hx = Kokkos::create_mirror( dx );
+
+  for( size_t i = 0; i<B; i++) {
+    size_t ne = 0;
+    for(size_t j = hx.row_block_offsets(i); j<hx.row_block_offsets(i+1); j++)
+      ne += hx.row_map(j+1)-hx.row_map(j)+C;
+
+    ASSERT_FALSE( ( ne > 2 * ( ( hx.row_map(hx.numRows()) + C * hx.numRows() ) / B ) ) &&
+                  ( hx.row_block_offsets(i+1) > hx.row_block_offsets(i) + 1 ) );
+  }
+}
+
+} /* namespace TestStaticCrsGraph */
+
diff --git a/packages/kokkos/containers/unit_tests/TestThreads.cpp b/packages/kokkos/containers/unit_tests/TestThreads.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1086e235fb0658487ed42dd7ad4c424b3a92f6cc
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestThreads.cpp
@@ -0,0 +1,199 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_THREADS
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_Bitset.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <Kokkos_Vector.hpp>
+#include <iomanip>
+
+
+//----------------------------------------------------------------------------
+#include <TestBitset.hpp>
+#include <TestUnorderedMap.hpp>
+#include <TestStaticCrsGraph.hpp>
+
+#include <TestVector.hpp>
+#include <TestDualView.hpp>
+#include <TestDynamicView.hpp>
+
+#include <Kokkos_DynRankView.hpp>
+#include <TestDynViewAPI.hpp>
+
+#include <Kokkos_ErrorReporter.hpp>
+#include <TestErrorReporter.hpp>
+
+#include <TestViewCtorPropEmbeddedDim.hpp>
+
+namespace Test {
+
+class threads : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+  }
+
+  static void TearDownTestCase()
+  {
+  }
+};
+
+TEST_F( threads , dyn_view_api) {
+  TestDynViewAPI< double , Kokkos::Threads >();
+}
+
+TEST_F( threads, viewctorprop_embedded_dim ) {
+  TestViewCtorProp_EmbeddedDim< Kokkos::Threads >::test_vcpt( 2, 3 );
+}
+
+TEST_F( threads , staticcrsgraph )
+{
+  TestStaticCrsGraph::run_test_graph< Kokkos::Threads >();
+  TestStaticCrsGraph::run_test_graph2< Kokkos::Threads >();
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Threads >(1, 0);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Threads >(1, 1000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Threads >(1, 10000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Threads >(1, 100000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Threads >(3, 0);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Threads >(3, 1000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Threads >(3, 10000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Threads >(3, 100000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Threads >(75, 0);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Threads >(75, 1000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Threads >(75, 10000);
+  TestStaticCrsGraph::run_test_graph3< Kokkos::Threads >(75, 100000);
+}
+
+/*TEST_F( threads, bitset )
+{
+  test_bitset<Kokkos::Threads>();
+}*/
+
+#define THREADS_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat, near )                                \
+  TEST_F( threads, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) {   \
+    for (int i=0; i<repeat; ++i)                                                                                \
+      test_insert<Kokkos::Threads>(num_nodes,num_inserts,num_duplicates, near);                                   \
+  }
+
+#define THREADS_FAILED_INSERT_TEST( num_nodes, repeat )                            \
+  TEST_F( threads, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_failed_insert<Kokkos::Threads>(num_nodes);                             \
+  }
+
+#define THREADS_ASSIGNEMENT_TEST( num_nodes, repeat )                             \
+  TEST_F( threads, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_assignement_operators<Kokkos::Threads>(num_nodes);                     \
+  }
+
+#define THREADS_DEEP_COPY( num_nodes, repeat )                             \
+  TEST_F( threads, UnorderedMap_deep_copy##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_deep_copy<Kokkos::Threads>(num_nodes);                     \
+  }
+
+#define THREADS_VECTOR_COMBINE_TEST( size )                             \
+  TEST_F( threads, vector_combination##size##x) {       \
+      test_vector_combinations<int,Kokkos::Threads>(size);                     \
+  }
+
+#define THREADS_DUALVIEW_COMBINE_TEST( size )                             \
+  TEST_F( threads, dualview_combination##size##x) {       \
+      test_dualview_combinations<int,Kokkos::Threads>(size);                     \
+  }
+
+THREADS_INSERT_TEST(far, 100000, 90000, 100, 500, false)
+THREADS_FAILED_INSERT_TEST( 10000, 1000 )
+THREADS_DEEP_COPY( 10000, 1 )
+
+THREADS_VECTOR_COMBINE_TEST( 10 )
+THREADS_VECTOR_COMBINE_TEST( 3057 )
+THREADS_DUALVIEW_COMBINE_TEST( 10 )
+
+
+#undef THREADS_INSERT_TEST
+#undef THREADS_FAILED_INSERT_TEST
+#undef THREADS_ASSIGNEMENT_TEST
+#undef THREADS_DEEP_COPY
+#undef THREADS_VECTOR_COMBINE_TEST
+#undef THREADS_DUALVIEW_COMBINE_TEST
+
+
+TEST_F( threads , dynamic_view )
+{
+  typedef TestDynamicView< double , Kokkos::Threads >
+    TestDynView ;
+
+  for ( int i = 0 ; i < 10 ; ++i ) {
+    TestDynView::run( 100000 + 100 * i );
+  }
+}
+
+
+#if defined(KOKKOS_CLASS_LAMBDA)
+TEST_F(threads, ErrorReporterViaLambda)
+{
+  TestErrorReporter<ErrorReporterDriverUseLambda<Kokkos::Threads>>();
+}
+#endif
+
+TEST_F(threads, ErrorReporter)
+{
+  TestErrorReporter<ErrorReporterDriver<Kokkos::Threads>>();
+}
+
+} // namespace Test
+
+#else
+void KOKKOS_CONTAINERS_UNIT_TESTS_TESTTHREADS_PREVENT_EMPTY_LINK_ERROR() {}
+#endif /* #if defined( KOKKOS_ENABLE_THREADS ) */
+
diff --git a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c964bbc1cf666a42104c69660b54ff311259780e
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
@@ -0,0 +1,314 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_UNORDERED_MAP_HPP
+#define KOKKOS_TEST_UNORDERED_MAP_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+
+
+namespace Test {
+
+namespace Impl {
+
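+// Parallel insert functor: inserts 'inserts' values whose keys repeat
+// 'collisions' times each.  testit() keeps rehashing to a larger capacity and
+// retrying until every insert succeeds, unless rehash_on_fail is false.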
+template <typename MapType, bool Near = false>
+struct TestInsert
+{
+  typedef MapType map_type;
+  typedef typename map_type::execution_space execution_space;
+  typedef uint32_t value_type;
+
+  map_type map;
+  uint32_t inserts;
+  uint32_t collisions;
+
+  TestInsert( map_type arg_map, uint32_t arg_inserts, uint32_t arg_collisions)
+    : map(arg_map)
+    , inserts(arg_inserts)
+    , collisions(arg_collisions)
+  {}
+
+  void testit( bool rehash_on_fail = true )
+  {
+    execution_space::fence();
+
+    uint32_t failed_count = 0;
+    do {
+      failed_count = 0;
+      Kokkos::parallel_reduce(inserts, *this, failed_count);
+
+      if (rehash_on_fail && failed_count > 0u) {
+        const uint32_t new_capacity = map.capacity() + ((map.capacity()*3ull)/20u) + failed_count/collisions ;
+        map.rehash( new_capacity );
+      }
+    } while (rehash_on_fail && failed_count > 0u);
+
+    execution_space::fence();
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & failed_count ) const { failed_count = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & failed_count, const volatile value_type & count ) const
+  { failed_count += count; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(uint32_t i, value_type & failed_count) const
+  {
+    const uint32_t key = Near ? i/collisions : i%(inserts/collisions);
+    if (map.insert(key,i).failed()) ++failed_count;
+  }
+
+};
+
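+  // Erase functor: removes m_num_erase keys, either clustered (Near) or
+  // strided, matching the key pattern produced by TestInsert.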
+  template <typename MapType, bool Near>
+  struct TestErase
+  {
+    typedef TestErase<MapType, Near> self_type;
+
+    typedef MapType map_type;
+    typedef typename MapType::execution_space execution_space;
+
+    map_type m_map;
+    uint32_t m_num_erase;
+    uint32_t m_num_duplicates;
+
+    TestErase(map_type map, uint32_t num_erases, uint32_t num_duplicates)
+      : m_map(map)
+      , m_num_erase(num_erases)
+      , m_num_duplicates(num_duplicates)
+    {}
+
+    void testit()
+    {
+      execution_space::fence();
+      Kokkos::parallel_for(m_num_erase, *this);
+      execution_space::fence();
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(typename execution_space::size_type i) const
+    {
+      if (Near) {
+        m_map.erase(i/m_num_duplicates);
+      }
+      else {
+        m_map.erase(i%(m_num_erase/m_num_duplicates));
+      }
+
+    }
+  };
+
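+  // Reduction functor: probes every slot index and counts lookup mismatches,
+  // i.e. keys below m_max_key that are missing or keys at or above it that
+  // are present.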
+  template <typename MapType>
+  struct TestFind
+  {
+    typedef MapType map_type;
+    typedef typename MapType::execution_space::execution_space execution_space;
+    typedef uint32_t value_type;
+
+    map_type m_map;
+    uint32_t m_num_insert;
+    uint32_t m_num_duplicates;
+    uint32_t m_max_key;
+
+    TestFind(map_type map, uint32_t num_inserts, uint32_t num_duplicates)
+      : m_map(map)
+      , m_num_insert(num_inserts)
+      , m_num_duplicates(num_duplicates)
+      , m_max_key( ((num_inserts + num_duplicates) - 1)/num_duplicates )
+    {}
+
+    void testit(value_type &errors)
+    {
+      execution_space::execution_space::fence();
+      Kokkos::parallel_reduce(m_map.capacity(), *this, errors);
+      execution_space::execution_space::fence();
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    static void init( value_type & dst)
+    {
+      dst = 0;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    static void join( volatile value_type & dst, const volatile value_type & src)
+    { dst += src; }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(typename execution_space::size_type i, value_type & errors) const
+    {
+      const bool expect_to_find_i = (i < m_max_key);
+
+      const bool exists = m_map.exists(i);
+
+      if (expect_to_find_i && !exists)  ++errors;
+      if (!expect_to_find_i && exists)  ++errors;
+    }
+  };
+
+} // namespace Impl
+
+
+
+template <typename Device>
+void test_insert( uint32_t num_nodes , uint32_t num_inserts , uint32_t num_duplicates , bool near )
+{
+  typedef Kokkos::UnorderedMap<uint32_t,uint32_t, Device> map_type;
+  typedef Kokkos::UnorderedMap<const uint32_t,const uint32_t, Device> const_map_type;
+
+  const uint32_t expected_inserts = (num_inserts + num_duplicates -1u) / num_duplicates;
+
+  map_type map;
+  map.rehash(num_nodes,false);
+
+  if (near) {
+    Impl::TestInsert<map_type,true> test_insert(map, num_inserts, num_duplicates);
+    test_insert.testit();
+  } else
+  {
+    Impl::TestInsert<map_type,false> test_insert(map, num_inserts, num_duplicates);
+    test_insert.testit();
+  }
+
+  const bool print_list = false;
+  if (print_list) {
+    Kokkos::Impl::UnorderedMapPrint<map_type> f(map);
+    f.apply();
+  }
+
+  const uint32_t map_size = map.size();
+
+  ASSERT_FALSE( map.failed_insert());
+  {
+    EXPECT_EQ(expected_inserts, map_size);
+
+    {
+      uint32_t find_errors = 0;
+      Impl::TestFind<const_map_type> test_find(map, num_inserts, num_duplicates);
+      test_find.testit(find_errors);
+      EXPECT_EQ( 0u, find_errors);
+    }
+
+    map.begin_erase();
+    Impl::TestErase<map_type,false> test_erase(map, num_inserts, num_duplicates);
+    test_erase.testit();
+    map.end_erase();
+    EXPECT_EQ(0u, map.size());
+  }
+}
+
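+// Attempts twice as many inserts as the map's capacity without rehashing on
+// failure, so failed_insert() must end up true.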
+template <typename Device>
+void test_failed_insert( uint32_t num_nodes)
+{
+  typedef Kokkos::UnorderedMap<uint32_t,uint32_t, Device> map_type;
+
+  map_type map(num_nodes);
+  Impl::TestInsert<map_type> test_insert(map, 2u*num_nodes, 1u);
+  test_insert.testit(false /*don't rehash on fail*/);
+  Device::execution_space::fence();
+
+  EXPECT_TRUE( map.failed_insert() );
+}
+
+
+
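+// Fills a device map, deep-copies it to a host mirror and back to a second
+// device map, and checks size, capacity, and lookups at each stage (including
+// through a const-map alias).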
+template <typename Device>
+void test_deep_copy( uint32_t num_nodes )
+{
+  typedef Kokkos::UnorderedMap<uint32_t,uint32_t, Device> map_type;
+  typedef Kokkos::UnorderedMap<const uint32_t, const uint32_t, Device> const_map_type;
+
+  typedef typename map_type::HostMirror host_map_type ;
+  // typedef Kokkos::UnorderedMap<uint32_t, uint32_t, typename Device::host_mirror_execution_space > host_map_type;
+
+  map_type map;
+  map.rehash(num_nodes,false);
+
+  {
+    Impl::TestInsert<map_type> test_insert(map, num_nodes, 1);
+    test_insert.testit();
+    ASSERT_EQ( map.size(), num_nodes);
+    ASSERT_FALSE( map.failed_insert() );
+    {
+      uint32_t find_errors = 0;
+      Impl::TestFind<map_type> test_find(map, num_nodes, 1);
+      test_find.testit(find_errors);
+      EXPECT_EQ( find_errors, 0u);
+    }
+
+  }
+
+  host_map_type hmap;
+  Kokkos::deep_copy(hmap, map);
+
+  ASSERT_EQ( map.size(), hmap.size());
+  ASSERT_EQ( map.capacity(), hmap.capacity());
+  {
+    uint32_t find_errors = 0;
+    Impl::TestFind<host_map_type> test_find(hmap, num_nodes, 1);
+    test_find.testit(find_errors);
+    EXPECT_EQ( find_errors, 0u);
+  }
+
+  map_type mmap;
+  Kokkos::deep_copy(mmap, hmap);
+
+  const_map_type cmap = mmap;
+
+  EXPECT_EQ( cmap.size(), num_nodes);
+
+  {
+    uint32_t find_errors = 0;
+    Impl::TestFind<const_map_type> test_find(cmap, num_nodes, 1);
+    test_find.testit(find_errors);
+    EXPECT_EQ( find_errors, 0u);
+  }
+
+}
+
+} // namespace Test
+
+#endif //KOKKOS_TEST_UNORDERED_MAP_HPP
+
diff --git a/packages/kokkos/containers/unit_tests/TestVector.hpp b/packages/kokkos/containers/unit_tests/TestVector.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ddde3df18a7fa8439a80fe6257b519c54d92d890
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestVector.hpp
@@ -0,0 +1,132 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_VECTOR_HPP
+#define KOKKOS_TEST_VECTOR_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+#include <impl/Kokkos_Timer.hpp>
+
+namespace Test {
+
+namespace Impl {
+
+  template <typename Scalar, class Device>
+  struct test_vector_combinations
+  {
+    typedef test_vector_combinations<Scalar,Device> self_type;
+
+    typedef Scalar scalar_type;
+    typedef Device execution_space;
+
+    Scalar reference;
+    Scalar result;
+
+    template <typename Vector>
+    Scalar run_me(unsigned int n){
+      Vector a(n,1);
+
+
+      a.push_back(2);
+      a.resize(n+4);
+      a[n+1] = 3;
+      a[n+2] = 4;
+      a[n+3] = 5;
+
+
+      Scalar temp1 = a[2];
+      Scalar temp2 = a[n];
+      Scalar temp3 = a[n+1];
+
+      a.assign(n+2,-1);
+
+      a[2] = temp1;
+      a[n] = temp2;
+      a[n+1] = temp3;
+
+      Scalar test1 = 0;
+      for(unsigned int i=0; i<a.size(); i++)
+        test1+=a[i];
+
+      a.assign(n+1,-2);
+      Scalar test2 = 0;
+      for(unsigned int i=0; i<a.size(); i++)
+        test2+=a[i];
+
+      a.reserve(n+10);
+
+      Scalar test3 = 0;
+      for(unsigned int i=0; i<a.size(); i++)
+        test3+=a[i];
+
+
+      return (test1*test2+test3)*test2+test1*test3;
+    }
+
+
+    test_vector_combinations(unsigned int size)
+    {
+      reference = run_me<std::vector<Scalar> >(size);
+      result = run_me<Kokkos::vector<Scalar,Device> >(size);
+    }
+
+   };
+
+} // namespace Impl
+
+
+
+
+template <typename Scalar, typename Device>
+void test_vector_combinations(unsigned int size)
+{
+  Impl::test_vector_combinations<Scalar,Device> test(size);
+  ASSERT_EQ( test.reference, test.result);
+}
+
+
+} // namespace Test
+
+#endif //KOKKOS_TEST_VECTOR_HPP
+
diff --git a/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp b/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9451e3551b85ebe1093cfe3b6c6f9f2f37aec355
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp
@@ -0,0 +1,213 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DynRankView.hpp>
+
+#include <type_traits>
+#include <typeinfo>
+
+namespace Test {
+
+namespace {
+
+template <typename ExecSpace >
+struct TestViewCtorProp_EmbeddedDim {
+
+  using ViewIntType     = typename Kokkos::View< int**, ExecSpace >;
+  using ViewDoubleType     = typename Kokkos::View< double*, ExecSpace >;
+
+  using DynRankViewIntType     = typename Kokkos::DynRankView< int, ExecSpace >;
+  using DynRankViewDoubleType     = typename Kokkos::DynRankView< double, ExecSpace >;
+
+  // Cuda 7.0 has issues with using a lambda in parallel_for to initialize the view - replace with this functor
+  template < class ViewType >
+  struct Functor {
+
+    ViewType v;
+
+    Functor( const ViewType & v_ ) : v(v_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const int i ) const {
+      v(i) = i;
+    }
+
+  };
+
+
+  static void test_vcpt( const int N0, const int N1 )
+  {
+
+    // Create two views to test
+    {
+      using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType ;
+      using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType ;
+
+      VIT vi1("vi1", N0, N1);
+      VDT vd1("vd1", N0);
+
+      // TEST: Test for common type between two views, one with type double, other with type int
+      // Deduce common value_type and construct a view with that type
+      {
+        // Two views
+        auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1);
+        typedef typename decltype( view_alloc_arg )::value_type                    CommonViewValueType;
+        typedef typename Kokkos::View< CommonViewValueType*, ExecSpace >  CVT;
+        typedef typename CVT::HostMirror                                           HostCVT;
+
+        // Construct a View using the common type; in the specialized case, an 'embedded_dim' would be stored by view_alloc_arg
+        CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
+
+        Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), 
+          Functor<CVT>(cv1)
+        );
+
+        HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
+        Kokkos::deep_copy( hcv1, cv1 );
+
+        ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
+      #if 0
+      // debug output
+      for ( int i = 0; i < N0*N1; ++i ) {
+        printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) );
+      }
+
+      printf( " Common value type view: %s \n", typeid( CVT() ).name() );
+      printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() );
+      if ( std::is_same< CommonViewValueType, double >::value == true ) {
+        printf("Proper common value_type\n");
+      }
+      else {
+        printf("WRONG common value_type\n");
+      }
+      // end debug output
+      #endif
+      }
+
+      {
+        // Single view
+        auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1);
+        typedef typename decltype( view_alloc_arg )::value_type                    CommonViewValueType;
+        typedef typename Kokkos::View< CommonViewValueType*, ExecSpace >  CVT;
+        typedef typename CVT::HostMirror                                           HostCVT;
+
+        // Construct a View using the common type; in the specialized case, an 'embedded_dim' would be stored by view_alloc_arg
+        CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
+
+        Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), 
+          Functor<CVT>(cv1)
+        );
+
+        HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
+        Kokkos::deep_copy( hcv1, cv1 );
+
+        ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
+      }
+
+    }
+
+    // Create two dynamic rank views to test
+    {
+      using VIT = typename TestViewCtorProp_EmbeddedDim::DynRankViewIntType ;
+      using VDT = typename TestViewCtorProp_EmbeddedDim::DynRankViewDoubleType ;
+
+      VIT vi1("vi1", N0, N1);
+      VDT vd1("vd1", N0);
+
+      // TEST: Test for common type between two views, one with type double, other with type int
+      // Deduce common value_type and construct a view with that type
+      {
+        // Two views
+        auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1, vd1 );
+        typedef typename decltype( view_alloc_arg )::value_type                    CommonViewValueType;
+        typedef typename Kokkos::View< CommonViewValueType*, ExecSpace >  CVT;
+        typedef typename CVT::HostMirror                                           HostCVT;
+
+        // Construct a View using the common type; in the specialized case, an 'embedded_dim' would be stored by view_alloc_arg
+        CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
+
+
+        Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), 
+          Functor<CVT>(cv1)
+        );
+
+        HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
+        Kokkos::deep_copy( hcv1, cv1 );
+
+        ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
+      }
+
+      {
+        // Single views
+        auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1 );
+        typedef typename decltype( view_alloc_arg )::value_type                    CommonViewValueType;
+        typedef typename Kokkos::View< CommonViewValueType*, ExecSpace >  CVT;
+        typedef typename CVT::HostMirror                                           HostCVT;
+
+        // Construct a View using the common type; in the specialized case, an 'embedded_dim' would be stored by view_alloc_arg
+        CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
+
+        Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), 
+          Functor<CVT>(cv1)
+        );
+
+        HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
+        Kokkos::deep_copy( hcv1, cv1 );
+
+        ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
+      }
+    }
+
+
+  } // end test_vcpt
+
+}; // end struct
+
+} // namespace
+
+} // namespace Test
diff --git a/packages/kokkos/containers/unit_tests/UnitTestMain.cpp b/packages/kokkos/containers/unit_tests/UnitTestMain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..508b43efd809e7041cd773b5cdaa25dc92e71450
--- /dev/null
+++ b/packages/kokkos/containers/unit_tests/UnitTestMain.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+#include <cstdlib>
+#include <Kokkos_Core.hpp>
+
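+// Shared gtest driver: Kokkos is initialized before the test suite runs and
+// finalized only after RUN_ALL_TESTS() returns, so every test has a live
+// execution space available.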
+int main(int argc, char *argv[]) {
+  Kokkos::initialize(argc,argv);
+  ::testing::InitGoogleTest(&argc,argv);
+  int result = RUN_ALL_TESTS();
+  Kokkos::finalize();
+  return result;
+}
+
diff --git a/packages/kokkos/core/CMakeLists.txt b/packages/kokkos/core/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..93db0d2ecf916f6ca1451f0794d31049d5477498
--- /dev/null
+++ b/packages/kokkos/core/CMakeLists.txt
@@ -0,0 +1,13 @@
+
+
+TRIBITS_SUBPACKAGE(Core)
+
+IF(KOKKOS_HAS_TRILINOS)
+  ADD_SUBDIRECTORY(src)
+ENDIF()
+
+TRIBITS_ADD_TEST_DIRECTORIES(unit_test)
+TRIBITS_ADD_TEST_DIRECTORIES(perf_test)
+
+TRIBITS_SUBPACKAGE_POSTPROCESS()
+
diff --git a/packages/kokkos/core/cmake/Dependencies.cmake b/packages/kokkos/core/cmake/Dependencies.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..8d9872725e59655f256a9e62bf3f706a79e80e59
--- /dev/null
+++ b/packages/kokkos/core/cmake/Dependencies.cmake
@@ -0,0 +1,6 @@
+TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREADS DLlib
+  TEST_OPTIONAL_TPLS CUSPARSE
+  )
+
+TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib)
diff --git a/packages/kokkos/core/cmake/KokkosCore_config.h.in b/packages/kokkos/core/cmake/KokkosCore_config.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..599c6b022409f1446cda3b15b985f080171e57b4
--- /dev/null
+++ b/packages/kokkos/core/cmake/KokkosCore_config.h.in
@@ -0,0 +1,104 @@
+/* The trivial 'src/build_common.sh' creates a config
+ * that must stay in sync with this file.
+ */
+#cmakedefine KOKKOS_FOR_SIERRA
+
+#if !defined(KOKKOS_FOR_SIERRA)
+
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Don't include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+
+#cmakedefine KOKKOS_HAVE_CUDA
+#cmakedefine KOKKOS_HAVE_OPENMP
+#cmakedefine KOKKOS_HAVE_PTHREAD
+#cmakedefine KOKKOS_HAVE_QTHREADS
+#cmakedefine KOKKOS_HAVE_SERIAL
+#cmakedefine KOKKOS_HAVE_Winthread
+
+#cmakedefine KOKKOS_HAVE_HWLOC
+#cmakedefine KOKKOS_ENABLE_HBWSPACE
+#cmakedefine KOKKOS_ENABLE_LIBRT
+
+#cmakedefine KOKKOS_HAVE_DEBUG
+#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
+#cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
+#cmakedefine KOKKOS_ENABLE_PROFILING
+#cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
+
+#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
+
+#ifdef KOKKOS_HAVE_CUDA
+
+#cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
+
+// mfh 16 Sep 2014: If passed in on the command line, that overrides
+// any value of KOKKOS_USE_CUDA_UVM here.  Doing this should prevent build
+// warnings like this one:
+//
+// packages/kokkos/core/src/KokkosCore_config.h:13:1: warning: "KOKKOS_USE_CUDA_UVM" redefined
+//
+// At some point, we should edit the test-build scripts in
+// Trilinos/cmake/ctest/drivers/perseus/, and take
+// -DKOKKOS_USE_CUDA_UVM from the command-line arguments there.  I
+// hesitate to do that now, because I'm not sure if all the files are
+// including KokkosCore_config.h (or a header file that includes it) like
+// they should.
+#ifndef KOKKOS_USE_CUDA_UVM
+#cmakedefine KOKKOS_USE_CUDA_UVM
+#endif
+
+#cmakedefine KOKKOS_HAVE_CUDA_RDC
+#ifdef KOKKOS_HAVE_CUDA_RDC
+#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE 1
+#endif
+
+#cmakedefine KOKKOS_HAVE_CUDA_LAMBDA
+#ifdef KOKKOS_HAVE_CUDA_LAMBDA
+#define KOKKOS_CUDA_USE_LAMBDA 1
+#endif
+
+#endif
+
+#cmakedefine KOKKOS_CUDA_CLANG_WORKAROUND
+
+#ifndef __CUDA_ARCH__
+#cmakedefine KOKKOS_ENABLE_ISA_X86_64
+#cmakedefine KOKKOS_ENABLE_ISA_KNC
+#cmakedefine KOKKOS_ENABLE_ISA_POWERPCLE
+#endif
+
+#cmakedefine KOKKOS_ARCH_ARMV80 1
+#cmakedefine KOKKOS_ARCH_ARMV81 1
+#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX 1
+#cmakedefine KOKKOS_ARCH_AVX 1
+#cmakedefine KOKKOS_ARCH_AVX2 1
+#cmakedefine KOKKOS_ARCH_AVX512MIC 1
+#cmakedefine KOKKOS_ARCH_AVX512XEON 1
+#cmakedefine KOKKOS_ARCH_KNC 1
+#cmakedefine KOKKOS_ARCH_POWER8 1
+#cmakedefine KOKKOS_ARCH_POWER9 1
+#cmakedefine KOKKOS_ARCH_KEPLER 1
+#cmakedefine KOKKOS_ARCH_KEPLER30 1
+#cmakedefine KOKKOS_ARCH_KEPLER32 1
+#cmakedefine KOKKOS_ARCH_KEPLER35 1
+#cmakedefine KOKKOS_ARCH_KEPLER37 1
+#cmakedefine KOKKOS_ARCH_MAXWELL 1
+#cmakedefine KOKKOS_ARCH_MAXWELL50 1
+#cmakedefine KOKKOS_ARCH_MAXWELL52 1
+#cmakedefine KOKKOS_ARCH_MAXWELL53 1
+#cmakedefine KOKKOS_ARCH_PASCAL 1
+#cmakedefine KOKKOS_ARCH_PASCAL60 1
+#cmakedefine KOKKOS_ARCH_PASCAL61 1
+
+// TODO: These are currently not used in Kokkos.  Should they be removed?
+#cmakedefine KOKKOS_HAVE_MPI
+#cmakedefine KOKKOS_HAVE_CUSPARSE
+
+// TODO: No longer options in Kokkos.  Need to be removed.
+#cmakedefine KOKKOS_USING_DEPRECATED_VIEW
+#cmakedefine KOKKOS_HAVE_CXX11
+
+#endif // !defined(KOKKOS_FOR_SIERRA)
diff --git a/packages/kokkos/core/perf_test/CMakeLists.txt b/packages/kokkos/core/perf_test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..84c49a7713ba9b163c9b5b67e312367bdcd2b4aa
--- /dev/null
+++ b/packages/kokkos/core/perf_test/CMakeLists.txt
@@ -0,0 +1,44 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
+
+IF(NOT KOKKOS_HAS_TRILINOS)
+  IF(KOKKOS_SEPARATE_LIBS)
+    set(TEST_LINK_TARGETS kokkoscore)
+  ELSE()
+    set(TEST_LINK_TARGETS kokkos)
+  ENDIF()
+ENDIF()
+
+# Warning: PerfTest_CustomReduction.cpp uses
+# ../../algorithms/src/Kokkos_Random.hpp.
+# We allow that header to be included here, but note that
+# in TriBITS the KokkosAlgorithms package can be disabled...
+INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src")
+
+SET(SOURCES
+  PerfTestMain.cpp
+  PerfTestGramSchmidt.cpp
+  PerfTestHexGrad.cpp
+  PerfTest_CustomReduction.cpp
+  )
+
+# Per #374, we always want to build this test, but we only want to run
+# it as a PERFORMANCE test.  That's why we separate building the test
+# from running the test.
+
+TRIBITS_ADD_EXECUTABLE(
+  PerfTestExec
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+  )
+
+TRIBITS_ADD_TEST(
+  PerfTest
+  NAME PerfTestExec
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  CATEGORIES PERFORMANCE
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  )
diff --git a/packages/kokkos/core/perf_test/Makefile b/packages/kokkos/core/perf_test/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..a1ec61ba797a0ef0073cc284944996b68bc8eae0
--- /dev/null
+++ b/packages/kokkos/core/perf_test/Makefile
@@ -0,0 +1,100 @@
+KOKKOS_PATH = ../..
+
+GTEST_PATH = ../../tpls/gtest
+
+vpath %.cpp ${KOKKOS_PATH}/core/perf_test
+
+default: build_all
+	echo "End Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
+  KOKKOS_CUDA_OPTIONS=enable_lambda
+else
+  CXX = g++
+endif
+
+CXXFLAGS = -O3 
+#CXXFLAGS += -DGENERIC_REDUCER
+LINK ?= $(CXX)
+LDFLAGS ?=
+override LDFLAGS += -lpthread
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/perf_test
+
+TEST_TARGETS =
+TARGETS =
+
+#
+
+OBJ_PERF = PerfTestMain.o gtest-all.o
+OBJ_PERF += PerfTestGramSchmidt.o
+OBJ_PERF += PerfTestHexGrad.o
+OBJ_PERF += PerfTest_CustomReduction.o
+OBJ_PERF += PerfTest_ViewCopy.o
+TARGETS += KokkosCore_PerformanceTest
+TEST_TARGETS += test-performance
+
+#
+
+OBJ_ATOMICS = test_atomic.o 
+TARGETS += KokkosCore_PerformanceTest_Atomics
+TEST_TARGETS += test-atomic
+
+#
+
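+# The mempool and taskdag performance tests are not built for the ROCm backend.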
+ifneq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+OBJ_MEMPOOL = test_mempool.o 
+TARGETS += KokkosCore_PerformanceTest_Mempool
+TEST_TARGETS += test-mempool
+
+#
+
+OBJ_TASKDAG = test_taskdag.o 
+TARGETS += KokkosCore_PerformanceTest_TaskDAG
+TEST_TARGETS += test-taskdag
+endif
+
+#
+
+KokkosCore_PerformanceTest: $(OBJ_PERF) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_PERF) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_PerformanceTest
+
+KokkosCore_PerformanceTest_Atomics: $(OBJ_ATOMICS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_ATOMICS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_PerformanceTest_Atomics
+
+KokkosCore_PerformanceTest_Mempool: $(OBJ_MEMPOOL) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_MEMPOOL) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest_Mempool
+
+KokkosCore_PerformanceTest_TaskDAG: $(OBJ_TASKDAG) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_TASKDAG) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest_TaskDAG
+
+test-performance: KokkosCore_PerformanceTest
+	./KokkosCore_PerformanceTest
+
+test-atomic: KokkosCore_PerformanceTest_Atomics
+	./KokkosCore_PerformanceTest_Atomics
+
+test-mempool: KokkosCore_PerformanceTest_Mempool
+	./KokkosCore_PerformanceTest_Mempool
+
+test-taskdag: KokkosCore_PerformanceTest_TaskDAG
+	./KokkosCore_PerformanceTest_TaskDAG
+
+build_all: $(TARGETS)
+
+test: $(TEST_TARGETS)
+
+clean: kokkos-clean
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
+gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
+
diff --git a/packages/kokkos/core/perf_test/PerfTestBlasKernels.hpp b/packages/kokkos/core/perf_test/PerfTestBlasKernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bb2fb5fce5fa339efeea145cffc822fefd1b2055
--- /dev/null
+++ b/packages/kokkos/core/perf_test/PerfTestBlasKernels.hpp
@@ -0,0 +1,309 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BLAS_KERNELS_HPP
+#define KOKKOS_BLAS_KERNELS_HPP
+
+namespace Kokkos {
+
+template< class ConstVectorType ,
+          class Device = typename ConstVectorType::execution_space >
+struct Dot ;
+
+template< class ConstVectorType ,
+          class Device = typename ConstVectorType::execution_space >
+struct DotSingle ;
+
+template< class ConstScalarType ,
+          class VectorType ,
+          class Device = typename VectorType::execution_space >
+struct Scale ;
+
+template< class ConstScalarType ,
+          class ConstVectorType ,
+          class VectorType ,
+          class Device = typename VectorType::execution_space >
+struct AXPBY ;
+
+/** \brief  Y = alpha * X + beta * Y */
+template< class ConstScalarType ,
+          class ConstVectorType ,
+          class      VectorType >
+void axpby( const ConstScalarType & alpha ,
+            const ConstVectorType & X ,
+            const ConstScalarType & beta ,
+            const      VectorType & Y )
+{
+  typedef AXPBY< ConstScalarType , ConstVectorType , VectorType > functor ;
+
+  parallel_for( Y.extent(0) , functor( alpha , X , beta , Y ) );
+}
+
+/** \brief  Y *= alpha */
+template< class ConstScalarType ,
+          class      VectorType >
+void scale( const ConstScalarType & alpha , const VectorType & Y )
+{
+  typedef Scale< ConstScalarType , VectorType > functor ;
+
+  parallel_for( Y.extent(0) , functor( alpha , Y ) );
+}
+
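+/** \brief  finalize( dot( X , Y ) ) */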
+template< class ConstVectorType ,
+          class Finalize >
+void dot( const ConstVectorType & X ,
+          const ConstVectorType & Y ,
+          const Finalize & finalize )
+{
+  typedef Dot< ConstVectorType >  functor ;
+
+  parallel_reduce( X.extent(0) , functor( X , Y ) , finalize );
+}
+
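+/** \brief  finalize( dot( X , X ) ) */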
+template< class ConstVectorType ,
+          class Finalize >
+void dot( const ConstVectorType & X ,
+          const Finalize & finalize )
+{
+  typedef DotSingle< ConstVectorType >  functor ;
+
+  parallel_reduce( X.extent(0) , functor( X ) , finalize );
+}
+
+} /* namespace Kokkos */
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< class Type , class Device >
+struct Dot
+{
+  typedef typename Device::execution_space execution_space ;
+
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
+                            Impl::unsigned_< Type::Rank > >::type ok_rank ;
+
+
+/*  typedef typename
+    Impl::StaticAssertSame< execution_space ,
+                            typename Type::execution_space >::type ok_device ;*/
+
+  typedef double value_type ;
+
+#if 1
+  typename Type::const_type X ;
+  typename Type::const_type Y ;
+#else
+  Type X ;
+  Type Y ;
+#endif
+
+  Dot( const Type & arg_x , const Type & arg_y )
+    : X(arg_x) , Y(arg_y) { }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i , value_type & update ) const
+    { update += X[i] * Y[i]; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & source )
+    { update += source; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+};
+
+template< class Type , class Device >
+struct DotSingle
+{
+  typedef typename Device::execution_space execution_space ;
+
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
+                            Impl::unsigned_< Type::Rank > >::type ok_rank ;
+
+/*  typedef typename
+    Impl::StaticAssertSame< execution_space ,
+                            typename Type::execution_space >::type ok_device ;*/
+
+  typedef double value_type ;
+
+#if 1
+  typename Type::const_type X ;
+#else
+  Type X ;
+#endif
+
+  DotSingle( const Type & arg_x ) : X(arg_x) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i , value_type & update ) const
+    {
+      const typename Type::value_type & x = X[i]; update += x * x ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & source )
+    { update += source; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+};
+
+
+template< class ScalarType , class VectorType , class Device>
+struct Scale
+{
+  typedef typename Device::execution_space execution_space ;
+
+/*  typedef typename
+    Impl::StaticAssertSame< execution_space ,
+                            typename ScalarType::execution_space >::type
+      ok_scalar_device ;
+
+  typedef typename
+    Impl::StaticAssertSame< execution_space ,
+                            typename VectorType::execution_space >::type
+      ok_vector_device ;*/
+
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
+                            Impl::unsigned_< ScalarType::Rank > >::type
+      ok_scalar_rank ;
+
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
+                            Impl::unsigned_< VectorType::Rank > >::type
+      ok_vector_rank ;
+
+#if 1
+  typename ScalarType::const_type alpha ;
+#else
+  ScalarType alpha ;
+#endif
+
+  VectorType Y ;
+
+  Scale( const ScalarType & arg_alpha , const VectorType & arg_Y )
+    : alpha( arg_alpha ), Y( arg_Y ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i ) const
+    {
+      Y[i] *= alpha() ;
+    }
+};
+
+
+template< class ScalarType ,
+          class ConstVectorType ,
+          class VectorType,
+          class Device>
+struct AXPBY
+{
+  typedef typename Device::execution_space execution_space ;
+
+/*  typedef typename
+    Impl::StaticAssertSame< execution_space ,
+                            typename ScalarType::execution_space >::type
+      ok_scalar_device ;
+
+  typedef typename
+    Impl::StaticAssertSame< execution_space ,
+                            typename ConstVectorType::execution_space >::type
+      ok_const_vector_device ;
+
+  typedef typename
+    Impl::StaticAssertSame< execution_space ,
+                            typename VectorType::execution_space >::type
+      ok_vector_device ;*/
+
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
+                            Impl::unsigned_< ScalarType::Rank > >::type
+      ok_scalar_rank ;
+
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
+                            Impl::unsigned_< ConstVectorType::Rank > >::type
+      ok_const_vector_rank ;
+
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
+                            Impl::unsigned_< VectorType::Rank > >::type
+      ok_vector_rank ;
+
+#if 1
+  typename ScalarType::const_type alpha , beta ;
+  typename ConstVectorType::const_type X ;
+#else
+  ScalarType alpha , beta ;
+  ConstVectorType X ;
+#endif
+
+  VectorType Y ;
+
+  AXPBY( const ScalarType      & arg_alpha ,
+         const ConstVectorType & arg_X ,
+         const ScalarType      & arg_beta ,
+         const VectorType      & arg_Y )
+    : alpha( arg_alpha ), beta( arg_beta ), X( arg_X ), Y( arg_Y ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i ) const
+    {
+      Y[i] = alpha() * X[i] + beta() * Y[i] ;
+    }
+};
+
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_BLAS_KERNELS_HPP */
diff --git a/packages/kokkos/core/perf_test/PerfTestDriver.hpp b/packages/kokkos/core/perf_test/PerfTestDriver.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9cd779edf4225ef6402ebb23962a7b2f82d791ef
--- /dev/null
+++ b/packages/kokkos/core/perf_test/PerfTestDriver.hpp
@@ -0,0 +1,402 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <iostream>
+#include <string>
+
+// mfh 06 Jun 2013: This macro doesn't work like one might think it
+// should.  It doesn't take the template parameter DeviceType and
+// print its actual type name; it just literally prints out
+// "DeviceType".  I've worked around this below without using the
+// macro, so I'm commenting out the macro to avoid compiler complaints
+// about an unused macro.
+
+// #define KOKKOS_IMPL_MACRO_TO_STRING( X ) #X
+// #define KOKKOS_MACRO_TO_STRING( X )  KOKKOS_IMPL_MACRO_TO_STRING( X )
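+// (For illustration: stringization happens in the preprocessor, before any
+// template substitution, so a hypothetical
+//   template< class DeviceType > void print_name()
+//     { std::cout << KOKKOS_MACRO_TO_STRING( DeviceType ); }
+// would always print the literal text "DeviceType".)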
+
+//------------------------------------------------------------------------
+
+namespace Test {
+
+enum { NUMBER_OF_TRIALS = 5 };
+
+template< class DeviceType , class LayoutType >
+void run_test_mdrange( int exp_beg , int exp_end, const char deviceTypeName[], int range_offset = 0,  int tile_offset = 0 )
+// exp_beg = 6 => 2^6 = 64 is starting range length
+{
+#define MDRANGE_PERFORMANCE_OUTPUT_VERBOSE 0
+
+  std::string label_mdrange ;
+  label_mdrange.append( "\"MDRange< double , " );
+  label_mdrange.append( deviceTypeName );
+  label_mdrange.append( " >\"" );
+
+  std::string label_range_col2 ;
+  label_range_col2.append( "\"RangeColTwo< double , " );
+  label_range_col2.append( deviceTypeName );
+  label_range_col2.append( " >\"" );
+
+  std::string label_range_col_all ;
+  label_range_col_all.append( "\"RangeColAll< double , " );
+  label_range_col_all.append( deviceTypeName );
+  label_range_col_all.append( " >\"" );
+
+  if ( std::is_same<LayoutType, Kokkos::LayoutRight>::value) {
+    std::cout << "--------------------------------------------------------------\n"
+      << "Performance tests for MDRange Layout Right"
+      << "\n--------------------------------------------------------------" << std::endl;
+  } else {
+    std::cout << "--------------------------------------------------------------\n"
+      << "Performance tests for MDRange Layout Left"
+      << "\n--------------------------------------------------------------" << std::endl;
+  }
+
+
+  for (int i = exp_beg ; i < exp_end ; ++i) {
+    const int range_length = (1<<i) + range_offset;
+
+    std::cout << "\n--------------------------------------------------------------\n"
+      << "--------------------------------------------------------------\n"
+      << "MDRange Test:  range bounds: " << range_length << " , " << range_length << " , " << range_length 
+      << "\n--------------------------------------------------------------\n"
+      << "--------------------------------------------------------------\n";
+//      << std::endl;
+
+    int t0_min = 0, t1_min = 0, t2_min = 0;
+    double seconds_min = 0.0;
+
+    // Test 1: The MDRange in full
+    {
+    int t0 = 1, t1 = 1, t2 = 1;
+    int counter = 1;
+#if !defined(KOKKOS_HAVE_CUDA)
+    int min_bnd = 8;
+    int tfast = range_length;
+#else
+    int min_bnd = 2;
+    int tfast = 32;
+#endif
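+    // Sweep tile sizes in powers of two: t2 takes the current 'tfast' value
+    // (halved each outer pass down to min_bnd), t1 sweeps from min_bnd up to
+    // t2, and t0 stays at min_bnd; each (t0,t1,t2) and its reversed
+    // counterpart is timed, and the fastest configuration is kept.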
+    while ( tfast >= min_bnd ) {
+      int tmid = min_bnd;
+      while ( tmid < tfast ) { 
+        t0 = min_bnd;
+        t1 = tmid;
+        t2 = tfast;
+        int t2_rev = min_bnd;
+        int t1_rev = tmid;
+        int t0_rev = tfast;
+
+#if defined(KOKKOS_HAVE_CUDA)
+        // Note: the product of tile sizes must be < 1024 for Cuda
+        if ( t0*t1*t2 >= 1024 ) {
+          printf("  Exceeded Cuda tile limits; onto next range set\n\n");
+          break;
+        }
+#endif
+
+        // Run 1 with tiles LayoutRight style
+        double seconds_1 = 0;
+        { seconds_1 = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, t0, t1, t2) ; }
+
+#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
+        std::cout << label_mdrange
+          << " , " << t0 << " , " << t1 << " , " << t2
+          << " , " << seconds_1
+          << std::endl ;
+#endif
+
+        if ( counter == 1 ) {
+          seconds_min = seconds_1;
+          t0_min = t0;
+          t1_min = t1;
+          t2_min = t2;
+        } 
+        else {
+          if ( seconds_1 < seconds_min ) 
+          { 
+            seconds_min = seconds_1; 
+            t0_min = t0;
+            t1_min = t1;
+            t2_min = t2;
+          }
+        }
+
+        // Run 2 with tiles LayoutLeft style - reverse order of tile dims
+        double seconds_1rev = 0;
+        { seconds_1rev = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, t0_rev, t1_rev, t2_rev) ; }
+
+#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
+        std::cout << label_mdrange
+          << " , " << t0_rev << " , " << t1_rev << " , " << t2_rev
+          << " , " << seconds_1rev
+          << std::endl ;
+#endif
+
+        if ( seconds_1rev < seconds_min ) 
+        { 
+          seconds_min = seconds_1rev; 
+          t0_min = t0_rev;
+          t1_min = t1_rev;
+          t2_min = t2_rev;
+        }
+
+        ++counter;
+        tmid <<= 1;
+      } //end inner while
+      tfast >>=1;
+    } //end outer while
+
+    std::cout << "\n"
+      << "--------------------------------------------------------------\n"
+      << label_mdrange
+      << "\n Min values "
+      << "\n Range length per dim (3D): " << range_length
+      << "\n TileDims:  " << t0_min << " , " << t1_min << " , " << t2_min
+      << "\n Min time: " << seconds_min
+      << "\n---------------------------------------------------------------"
+      << std::endl ;
+    } //end scope
+
+#if !defined(KOKKOS_HAVE_CUDA)
+  double seconds_min_c = 0.0;
+  int t0c_min = 0, t1c_min = 0, t2c_min = 0;
+  int counter = 1;
+  {
+    int min_bnd = 8;
+    // Test 1_c: MDRange with 0 for the 'inner' tile dim; this case uses the full span in that direction and should behave like Collapse<2>
+    if ( std::is_same<LayoutType, Kokkos::LayoutRight>::value ) {
+      for ( unsigned int T0 = min_bnd; T0 < static_cast<unsigned int>(range_length); T0<<=1 ) {
+        for ( unsigned int T1 = min_bnd; T1 < static_cast<unsigned int>(range_length); T1<<=1 ) {
+          double seconds_c = 0;
+          { seconds_c = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, T0, T1, 0) ; }
+
+#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
+          std::cout << " MDRange LR with '0' tile - collapse-like \n"
+          << label_mdrange
+          << " , " << T0 << " , " << T1 << " , " << range_length
+          << " , " << seconds_c
+          << std::endl ;
+#endif
+
+          t2c_min = range_length;
+          if ( counter == 1 ) {
+            seconds_min_c = seconds_c;
+            t0c_min = T0;
+            t1c_min = T1;
+          } 
+          else {
+            if ( seconds_c < seconds_min_c ) 
+            { 
+              seconds_min_c = seconds_c; 
+              t0c_min = T0;
+              t1c_min = T1;
+            }
+          }
+          ++counter;
+        }
+      }
+    }
+    else {
+      for ( unsigned int T1 = min_bnd; T1 <= static_cast<unsigned int>(range_length); T1<<=1 ) {
+        for ( unsigned int T2 = min_bnd; T2 <= static_cast<unsigned int>(range_length); T2<<=1 ) {
+          double seconds_c = 0;
+          { seconds_c = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, 0, T1, T2) ; }
+
+#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
+          std::cout << " MDRange LL with '0' tile - collapse-like \n"
+          << label_mdrange
+          << " , " << range_length << " , " << T1 << " , " << T2
+          << " , " << seconds_c
+          << std::endl ;
+#endif
+
+
+          t0c_min = range_length;
+          if ( counter == 1 ) {
+            seconds_min_c = seconds_c;
+            t1c_min = T1;
+            t2c_min = T2;
+          } 
+          else {
+            if ( seconds_c < seconds_min_c ) 
+            { 
+              seconds_min_c = seconds_c; 
+              t1c_min = T1;
+              t2c_min = T2;
+            }
+          }
+          ++counter;
+        }
+      }
+    }
+
+    std::cout 
+//      << "--------------------------------------------------------------\n"
+      << label_mdrange
+      << "  Collapse<2> style: "
+      << "\n Min values "
+      << "\n Range length per dim (3D): " << range_length
+      << "\n TileDims:  " << t0c_min << " , " << t1c_min << " , " << t2c_min
+      << "\n Min time: " << seconds_min_c
+      << "\n---------------------------------------------------------------"
+      << std::endl ;
+  } //end scope test 2
+#endif
+
+
+    // Test 2: RangePolicy Collapse2 style
+    double seconds_2 = 0;
+    { seconds_2 = RangePolicyCollapseTwo< DeviceType , double , LayoutType >::test_index_collapse_two(range_length,range_length,range_length) ; }
+    std::cout << label_range_col2
+      << " , " << range_length
+      << " , " << seconds_2
+      << std::endl ;
+
+
+    // Test 3: RangePolicy Collapse all style - not necessary, always slow
+    /*
+    double seconds_3 = 0;
+    { seconds_3 = RangePolicyCollapseAll< DeviceType , double , LayoutType >::test_collapse_all(range_length,range_length,range_length) ; }
+    std::cout << label_range_col_all
+      << " , " << range_length
+      << " , " << seconds_3
+      << "\n---------------------------------------------------------------"
+      << std::endl ;
+    */
+
+    // Compare fastest times... will never be collapse all so ignore it
+    // seconds_min = tiled MDRange
+    // seconds_min_c = collapse<2>-like MDRange (tiledim = span for fast dim) - only for non-Cuda, else tile too long
+    // seconds_2 = collapse<2>-style RangePolicy
+    // seconds_3 = collapse<3>-style RangePolicy
+
+#if !defined(KOKKOS_HAVE_CUDA)
+    if ( seconds_min < seconds_min_c ) {
+      if ( seconds_min < seconds_2 ) {
+        std::cout << "--------------------------------------------------------------\n"
+          << " Fastest run: MDRange tiled\n"
+          << " Time: " << seconds_min
+          << " Difference: " << seconds_2 - seconds_min
+          << " Other times: \n"
+          << "   MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
+          << "   Collapse2 Range Policy: " << seconds_2 << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+      else if ( seconds_min > seconds_2 ) {
+        std::cout << " Fastest run: Collapse2 RangePolicy\n"
+          << " Time: " << seconds_2
+          << " Difference: " << seconds_min - seconds_2
+          << " Other times: \n"
+          << "   MDrange Tiled: " << seconds_min << "\n"
+          << "   MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+    }
+    else if ( seconds_min > seconds_min_c ) {
+      if ( seconds_min_c < seconds_2 ) {
+        std::cout << "--------------------------------------------------------------\n"
+          << " Fastest run: MDRange collapse-like (tiledim = span on fast dim) type\n"
+          << " Time: " << seconds_min_c
+          << " Difference: " << seconds_2 - seconds_min_c
+          << " Other times: \n"
+          << "   MDrange Tiled: " << seconds_min << "\n"
+          << "   Collapse2 Range Policy: " << seconds_2 << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+      else if ( seconds_min_c > seconds_2 ) {
+        std::cout << " Fastest run: Collapse2 RangePolicy\n"
+          << " Time: " << seconds_2
+          << " Difference: " << seconds_min_c - seconds_2
+          << " Other times: \n"
+          << "   MDrange Tiled: " << seconds_min << "\n"
+          << "   MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+    } // end else if
+#else
+      if ( seconds_min < seconds_2 ) {
+        std::cout << "--------------------------------------------------------------\n"
+          << " Fastest run: MDRange tiled\n"
+          << " Time: " << seconds_min
+          << " Difference: " << seconds_2 - seconds_min
+          << " Other times: \n"
+          << "   Collapse2 Range Policy: " << seconds_2 << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+      else if ( seconds_min > seconds_2 ) {
+        std::cout << " Fastest run: Collapse2 RangePolicy\n"
+          << " Time: " << seconds_2
+          << " Difference: " << seconds_min - seconds_2
+          << " Other times: \n"
+          << "   MDrange Tiled: " << seconds_min << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+#endif
+
+  } //end for
+
+#undef MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
+
+}
+
+
+}
+
diff --git a/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b169b02903ed4589bace5f83ee880937bba328b8
--- /dev/null
+++ b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
@@ -0,0 +1,283 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <gtest/gtest.h>
+#include <PerfTest_Category.hpp>
+
+#include <cmath>
+#include <PerfTestBlasKernels.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Test {
+
+// Reduction   : result = dot( Q(:,j) , Q(:,j) );
+// PostProcess : R(j,j) = result ; inv = 1 / result ;
+template< class VectorView , class ValueView  >
+struct InvNorm2 : public Kokkos::DotSingle< VectorView > {
+
+  typedef typename Kokkos::DotSingle< VectorView >::value_type value_type ;
+
+  ValueView  Rjj ;
+  ValueView  inv ;
+
+  InvNorm2( const VectorView & argX ,
+            const ValueView  & argR ,
+            const ValueView  & argInv )
+    : Kokkos::DotSingle< VectorView >( argX )
+    , Rjj( argR )
+    , inv( argInv )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void final( value_type & result ) const
+  {
+    result = std::sqrt( result );
+    Rjj() = result ;
+    inv() = ( 0 < result ) ? 1.0 / result : 0 ;
+  }
+};
+
+template< class VectorView , class ValueView >
+inline
+void invnorm2( const VectorView & x ,
+               const ValueView  & r ,
+               const ValueView  & r_inv )
+{
+  Kokkos::parallel_reduce( x.extent(0) , InvNorm2< VectorView , ValueView >( x , r , r_inv ) );
+}
+
+// PostProcess : tmp = - ( R(j,k) = result );
+template< class VectorView , class ValueView  >
+struct DotM : public Kokkos::Dot< VectorView > {
+
+  typedef typename Kokkos::Dot< VectorView >::value_type value_type ;
+
+  ValueView  Rjk ;
+  ValueView  tmp ;
+
+  DotM( const VectorView & argX ,
+        const VectorView & argY ,
+        const ValueView & argR ,
+        const ValueView & argTmp )
+    : Kokkos::Dot< VectorView >( argX , argY )
+    , Rjk( argR )
+    , tmp( argTmp )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void final( value_type & result ) const
+  {
+     Rjk()  = result ;
+     tmp()  = - result ;
+  }
+};
+
+template< class VectorView , class ValueView >
+inline
+void dot_neg( const VectorView & x ,
+              const VectorView & y ,
+              const ValueView  & r ,
+              const ValueView  & r_neg )
+{
+  Kokkos::parallel_reduce( x.extent(0) , DotM< VectorView , ValueView >( x , y , r , r_neg ) );
+}
+
+
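+// Modified Gram-Schmidt QR factorization: the columns of Q are
+// orthonormalized in place while the coefficients are accumulated in the
+// upper-triangular matrix R, so that the original input satisfies A = Q * R.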
+template< typename Scalar , class DeviceType >
+struct ModifiedGramSchmidt
+{
+  typedef DeviceType  execution_space ;
+  typedef typename execution_space::size_type  size_type ;
+
+  typedef Kokkos::View< Scalar** ,
+                        Kokkos::LayoutLeft ,
+                        execution_space > multivector_type ;
+
+  typedef Kokkos::View< Scalar* ,
+                        Kokkos::LayoutLeft ,
+                        execution_space > vector_type ;
+
+  typedef Kokkos::View< Scalar ,
+                        Kokkos::LayoutLeft ,
+                        execution_space > value_view ;
+
+
+  multivector_type Q ;
+  multivector_type R ;
+
+  static double factorization( const multivector_type Q_ ,
+                               const multivector_type R_ )
+  {
+    const size_type count  = Q_.extent(1);
+    value_view tmp("tmp");
+    value_view one("one");
+
+    Kokkos::deep_copy( one , (Scalar) 1 );
+
+    Kokkos::Timer timer ;
+
+    for ( size_type j = 0 ; j < count ; ++j ) {
+      // Reduction   : tmp = dot( Q(:,j) , Q(:,j) );
+      // PostProcess : tmp = std::sqrt( tmp ); R(j,j) = tmp ; tmp = 1 / tmp ;
+      const vector_type Qj  = Kokkos::subview( Q_ , Kokkos::ALL() , j );
+      const value_view  Rjj = Kokkos::subview( R_ , j , j );
+
+      invnorm2( Qj , Rjj , tmp );
+
+      // Q(:,j) *= ( 1 / R(j,j) ); => Q(:,j) *= tmp ;
+      Kokkos::scale( tmp , Qj );
+
+      for ( size_t k = j + 1 ; k < count ; ++k ) {
+        const vector_type Qk = Kokkos::subview( Q_ , Kokkos::ALL() , k );
+        const value_view  Rjk = Kokkos::subview( R_ , j , k );
+
+        // Reduction   : R(j,k) = dot( Q(:,j) , Q(:,k) );
+        // PostProcess : tmp = - R(j,k);
+        dot_neg( Qj , Qk , Rjk , tmp );
+
+        // Q(:,k) -= R(j,k) * Q(:,j); => Q(:,k) += tmp * Q(:,j)
+        Kokkos::axpby( tmp , Qj , one , Qk );
+      }
+    }
+
+    execution_space::fence();
+
+    return timer.seconds();
+  }
+
+  //--------------------------------------------------------------------------
+
+  static double test( const size_t length ,
+                      const size_t count ,
+                      const size_t iter = 1 )
+  {
+    multivector_type Q_( "Q" , length , count );
+    multivector_type R_( "R" , count , count );
+
+    typename multivector_type::HostMirror A =
+      Kokkos::create_mirror( Q_ );
+
+    // Create and fill A on the host
+
+    for ( size_type j = 0 ; j < count ; ++j ) {
+      for ( size_type i = 0 ; i < length ; ++i ) {
+        A(i,j) = ( i + 1 ) * ( j + 1 );
+      }
+    }
+
+    double dt_min = 0 ;
+
+    for ( size_t i = 0 ; i < iter ; ++i ) {
+
+      Kokkos::deep_copy( Q_ , A );
+
+      // A = Q * R
+
+      const double dt = factorization( Q_ , R_ );
+
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+    }
+
+    return dt_min ;
+  }
+};
+
+template< class DeviceType >
+void run_test_gramschmidt( int exp_beg , int exp_end, int num_trials, const char deviceTypeName[] )
+{
+  std::string label_gramschmidt ;
+  label_gramschmidt.append( "\"GramSchmidt< double , " );
+  label_gramschmidt.append( deviceTypeName );
+  label_gramschmidt.append( " >\"" );
+
+  for (int i = exp_beg ; i < exp_end ; ++i) {
+    double min_seconds = 0.0 ;
+    double max_seconds = 0.0 ;
+    double avg_seconds = 0.0 ;
+
+    const int parallel_work_length = 1<<i;
+
+    for ( int j = 0 ; j < num_trials ; ++j ) {
+      const double seconds = ModifiedGramSchmidt< double , DeviceType >::test(parallel_work_length, 32 ) ;
+
+      if ( 0 == j ) {
+        min_seconds = seconds ;
+        max_seconds = seconds ;
+      }
+      else {
+        if ( seconds < min_seconds ) min_seconds = seconds ;
+        if ( seconds > max_seconds ) max_seconds = seconds ;
+      }
+      avg_seconds += seconds ;
+    }
+    avg_seconds /= num_trials ;
+
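+    // CSV-style output: label , work length , min seconds , min seconds per unit of work
+    // (max_seconds/avg_seconds are accumulated but not printed).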
+    std::cout << label_gramschmidt
+      << " , " << parallel_work_length
+      << " , " << min_seconds
+      << " , " << ( min_seconds / parallel_work_length )
+      << std::endl ;
+  }
+}
+
+TEST_F( default_exec, gramschmidt ) {
+  int exp_beg = 10;
+  int exp_end = 20;
+  int num_trials = 5;
+
+  if(command_line_num_args()>1)
+    exp_beg = atoi(command_line_arg(1));
+  if(command_line_num_args()>2)
+    exp_end = atoi(command_line_arg(2));
+  if(command_line_num_args()>3)
+    num_trials = atoi(command_line_arg(3));
+
+  EXPECT_NO_THROW(run_test_gramschmidt< Kokkos::DefaultExecutionSpace>( exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name()  ));
+}
+
+}
+
diff --git a/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b228dd2e2eb604cd7b23bb51282a248af75406dc
--- /dev/null
+++ b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp
@@ -0,0 +1,325 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <gtest/gtest.h>
+#include <PerfTest_Category.hpp>
+
+namespace Test {
+
+template< class DeviceType ,
+          typename CoordScalarType = double ,
+          typename GradScalarType  = float >
+struct HexGrad
+{
+  typedef DeviceType execution_space ;
+  typedef typename execution_space::size_type  size_type ;
+
+  typedef HexGrad<DeviceType,CoordScalarType,GradScalarType> self_type;
+
+  // 3D array : ( ParallelWork , Space , Node )
+
+  enum { NSpace = 3 , NNode = 8 };
+
+  typedef Kokkos::View< CoordScalarType*[NSpace][NNode] , execution_space >
+    elem_coord_type ;
+
+  typedef Kokkos::View< GradScalarType*[NSpace][NNode] , execution_space >
+    elem_grad_type ;
+
+  elem_coord_type  coords ;
+  elem_grad_type   grad_op ;
+
+  enum { FLOPS  = 318 }; // = 3 * ( 18 + 8 * 11 )
+  enum { READS  = 18 };
+  enum { WRITES = 18 };
+
+  HexGrad( const elem_coord_type  & arg_coords ,
+           const elem_grad_type   & arg_grad_op )
+    : coords( arg_coords )
+    , grad_op( arg_grad_op )
+    {}
+
+  KOKKOS_INLINE_FUNCTION static
+  void grad( const CoordScalarType x[] ,
+             const CoordScalarType z[] ,
+                   GradScalarType grad_y[] )
+  {
+    const GradScalarType R42=(x[3] - x[1]);
+    const GradScalarType R52=(x[4] - x[1]);
+    const GradScalarType R54=(x[4] - x[3]);
+
+    const GradScalarType R63=(x[5] - x[2]);
+    const GradScalarType R83=(x[7] - x[2]);
+    const GradScalarType R86=(x[7] - x[5]);
+
+    const GradScalarType R31=(x[2] - x[0]);
+    const GradScalarType R61=(x[5] - x[0]);
+    const GradScalarType R74=(x[6] - x[3]);
+
+    const GradScalarType R72=(x[6] - x[1]);
+    const GradScalarType R75=(x[6] - x[4]);
+    const GradScalarType R81=(x[7] - x[0]);
+
+    const GradScalarType t1=(R63 + R54);
+    const GradScalarType t2=(R61 + R74);
+    const GradScalarType t3=(R72 + R81);
+
+    const GradScalarType t4 =(R86 + R42);
+    const GradScalarType t5 =(R83 + R52);
+    const GradScalarType t6 =(R75 + R31);
+
+    //  Calculate Y gradient from X and Z data
+
+    grad_y[0] = (z[1] *  t1) - (z[2] * R42) - (z[3] *  t5)  + (z[4] *  t4) + (z[5] * R52) - (z[7] * R54);
+    grad_y[1] = (z[2] *  t2) + (z[3] * R31) - (z[0] *  t1)  - (z[5] *  t6) + (z[6] * R63) - (z[4] * R61);
+    grad_y[2] = (z[3] *  t3) + (z[0] * R42) - (z[1] *  t2)  - (z[6] *  t4) + (z[7] * R74) - (z[5] * R72);
+    grad_y[3] = (z[0] *  t5) - (z[1] * R31) - (z[2] *  t3)  + (z[7] *  t6) + (z[4] * R81) - (z[6] * R83);
+    grad_y[4] = (z[5] *  t3) + (z[6] * R86) - (z[7] *  t2)  - (z[0] *  t4) - (z[3] * R81) + (z[1] * R61);
+    grad_y[5] = (z[6] *  t5) - (z[4] *  t3)  - (z[7] * R75) + (z[1] *  t6) - (z[0] * R52) + (z[2] * R72);
+    grad_y[6] = (z[7] *  t1) - (z[5] *  t5)  - (z[4] * R86) + (z[2] *  t4) - (z[1] * R63) + (z[3] * R83);
+    grad_y[7] = (z[4] *  t2) - (z[6] *  t1)  + (z[5] * R75) - (z[3] *  t6) - (z[2] * R74) + (z[0] * R54);
+  }
+
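+  // Gather the element's eight nodal coordinates and call grad() three times with
+  // cyclically permuted coordinate arrays to fill the x, y, and z gradient components.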
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type ielem ) const
+  {
+    GradScalarType g[NNode] ;
+
+    const CoordScalarType x[NNode] = {
+      coords(ielem,0,0),
+      coords(ielem,0,1),
+      coords(ielem,0,2),
+      coords(ielem,0,3),
+      coords(ielem,0,4),
+      coords(ielem,0,5),
+      coords(ielem,0,6),
+      coords(ielem,0,7)
+    };
+
+    const CoordScalarType y[NNode] = {
+      coords(ielem,1,0),
+      coords(ielem,1,1),
+      coords(ielem,1,2),
+      coords(ielem,1,3),
+      coords(ielem,1,4),
+      coords(ielem,1,5),
+      coords(ielem,1,6),
+      coords(ielem,1,7)
+    };
+
+    const CoordScalarType z[NNode] = {
+      coords(ielem,2,0),
+      coords(ielem,2,1),
+      coords(ielem,2,2),
+      coords(ielem,2,3),
+      coords(ielem,2,4),
+      coords(ielem,2,5),
+      coords(ielem,2,6),
+      coords(ielem,2,7)
+    };
+
+    grad( z , y , g );
+
+    grad_op(ielem,0,0) = g[0];
+    grad_op(ielem,0,1) = g[1];
+    grad_op(ielem,0,2) = g[2];
+    grad_op(ielem,0,3) = g[3];
+    grad_op(ielem,0,4) = g[4];
+    grad_op(ielem,0,5) = g[5];
+    grad_op(ielem,0,6) = g[6];
+    grad_op(ielem,0,7) = g[7];
+
+    grad( x , z , g );
+
+    grad_op(ielem,1,0) = g[0];
+    grad_op(ielem,1,1) = g[1];
+    grad_op(ielem,1,2) = g[2];
+    grad_op(ielem,1,3) = g[3];
+    grad_op(ielem,1,4) = g[4];
+    grad_op(ielem,1,5) = g[5];
+    grad_op(ielem,1,6) = g[6];
+    grad_op(ielem,1,7) = g[7];
+
+    grad( y , x , g );
+
+    grad_op(ielem,2,0) = g[0];
+    grad_op(ielem,2,1) = g[1];
+    grad_op(ielem,2,2) = g[2];
+    grad_op(ielem,2,3) = g[3];
+    grad_op(ielem,2,4) = g[4];
+    grad_op(ielem,2,5) = g[5];
+    grad_op(ielem,2,6) = g[6];
+    grad_op(ielem,2,7) = g[7];
+  }
+
+  //--------------------------------------------------------------------------
+
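+  // Initialize every element's coordinates to the eight corners of the unit cube.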
+  struct Init {
+    typedef typename self_type::execution_space execution_space ;
+
+    elem_coord_type coords ;
+
+    Init( const elem_coord_type & arg_coords )
+      : coords( arg_coords ) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( size_type ielem ) const
+    {
+      coords(ielem,0,0) = 0.;
+      coords(ielem,1,0) = 0.;
+      coords(ielem,2,0) = 0.;
+
+      coords(ielem,0,1) = 1.;
+      coords(ielem,1,1) = 0.;
+      coords(ielem,2,1) = 0.;
+
+      coords(ielem,0,2) = 1.;
+      coords(ielem,1,2) = 1.;
+      coords(ielem,2,2) = 0.;
+
+      coords(ielem,0,3) = 0.;
+      coords(ielem,1,3) = 1.;
+      coords(ielem,2,3) = 0.;
+
+
+      coords(ielem,0,4) = 0.;
+      coords(ielem,1,4) = 0.;
+      coords(ielem,2,4) = 1.;
+
+      coords(ielem,0,5) = 1.;
+      coords(ielem,1,5) = 0.;
+      coords(ielem,2,5) = 1.;
+
+      coords(ielem,0,6) = 1.;
+      coords(ielem,1,6) = 1.;
+      coords(ielem,2,6) = 1.;
+
+      coords(ielem,0,7) = 0.;
+      coords(ielem,1,7) = 1.;
+      coords(ielem,2,7) = 1.;
+    }
+  };
+
+  //--------------------------------------------------------------------------
+
+  static double test( const int count , const int iter = 1 )
+  {
+    elem_coord_type coord( "coord" , count );
+    elem_grad_type  grad ( "grad" , count );
+
+    // Execute the parallel kernels on the arrays:
+
+    double dt_min = 0 ;
+
+    Kokkos::parallel_for( count , Init( coord ) );
+    execution_space::fence();
+
+    for ( int i = 0 ; i < iter ; ++i ) {
+      Kokkos::Timer timer ;
+      Kokkos::parallel_for( count , HexGrad<execution_space>( coord , grad ) );
+      execution_space::fence();
+      const double dt = timer.seconds();
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+    }
+
+    return dt_min ;
+  }
+};
+
+template< class DeviceType >
+void run_test_hexgrad( int exp_beg , int exp_end, int num_trials, const char deviceTypeName[] )
+{
+  std::string label_hexgrad ;
+  label_hexgrad.append( "\"HexGrad< double , " );
+  label_hexgrad.append( deviceTypeName );
+  label_hexgrad.append( " >\"" );
+
+  for (int i = exp_beg ; i < exp_end ; ++i) {
+    double min_seconds = 0.0 ;
+    double max_seconds = 0.0 ;
+    double avg_seconds = 0.0 ;
+
+    const int parallel_work_length = 1<<i;
+
+    for ( int j = 0 ; j < num_trials ; ++j ) {
+      const double seconds = HexGrad< DeviceType >::test(parallel_work_length) ;
+
+      if ( 0 == j ) {
+        min_seconds = seconds ;
+        max_seconds = seconds ;
+      }
+      else {
+        if ( seconds < min_seconds ) min_seconds = seconds ;
+        if ( seconds > max_seconds ) max_seconds = seconds ;
+      }
+      avg_seconds += seconds ;
+    }
+    avg_seconds /= num_trials ;
+
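+    // Same CSV-style output as the Gram-Schmidt test: label , work length , min seconds , min seconds per element.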
+    std::cout << label_hexgrad
+      << " , " << parallel_work_length
+      << " , " << min_seconds
+      << " , " << ( min_seconds / parallel_work_length )
+      << std::endl ;
+  }
+}
+
+TEST_F( default_exec, hexgrad ) {
+  int exp_beg = 10;
+  int exp_end = 20;
+  int num_trials = 5;
+
+  if(command_line_num_args()>1)
+    exp_beg = atoi(command_line_arg(1));
+  if(command_line_num_args()>2)
+    exp_end = atoi(command_line_arg(2));
+  if(command_line_num_args()>3)
+    num_trials = atoi(command_line_arg(3));
+
+  EXPECT_NO_THROW(run_test_hexgrad< Kokkos::DefaultExecutionSpace >( exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name() ));
+}
+
+}
+
diff --git a/packages/kokkos/core/perf_test/PerfTestMDRange.hpp b/packages/kokkos/core/perf_test/PerfTestMDRange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d60539e279f218518f4122377db680581985f358
--- /dev/null
+++ b/packages/kokkos/core/perf_test/PerfTestMDRange.hpp
@@ -0,0 +1,564 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+namespace Test {
+template< class DeviceType 
+        , typename ScalarType = double  
+        , typename TestLayout = Kokkos::LayoutRight  
+        >
+struct MultiDimRangePerf3D
+{
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type  size_type;
+
+  using iterate_type = Kokkos::Experimental::Iterate;
+
+  typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
+  typedef typename view_type::HostMirror host_view_type;
+
+  view_type A;
+  view_type B;
+  const long irange;
+  const long jrange;
+  const long krange;
+
+  MultiDimRangePerf3D(const view_type & A_, const view_type & B_, const long &irange_,  const long &jrange_, const long &krange_)
+  : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_)
+  {}
+
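+  // Stencil update: A(i,j,k) is 0.25 times the sum of B at (i,j,k) and its +1 and +2
+  // neighbors in each coordinate direction.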
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const long i, const long j, const long k) const
+  {
+    A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+                             + B(i,j+2,k) + B(i,j+1,k)
+                             + B(i,j,k+2) + B(i,j,k+1)
+                             + B(i,j,k) );
+  }
+
+
+  struct InitZeroTag {};
+//  struct InitViewTag {};
+
+  struct Init
+  {
+
+    Init(const view_type & input_, const long &irange_,  const long &jrange_, const long &krange_)
+    : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const long i, const long j, const long k) const
+    {
+      input(i,j,k) = 1.0;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const InitZeroTag&, const long i, const long j, const long k) const
+    {
+      input(i,j,k) = 0;
+    }
+
+    view_type input;
+    const long irange;
+    const long jrange;
+    const long krange;
+  };
+
+
+  static double test_multi_index(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const unsigned int Ti = 1, const unsigned int Tj = 1, const unsigned int Tk = 1, const long iter = 1)
+  {
+    // This test drives the stencil with a full 3-D MDRangePolicy (tile sizes Ti, Tj, Tk)
+    view_type Atest("Atest", icount, jcount, kcount);
+    view_type Btest("Btest", icount+2, jcount+2, kcount+2);
+    typedef MultiDimRangePerf3D<execution_space,ScalarType,TestLayout> FunctorType;
+
+    double dt_min = 0;
+
+    // LayoutRight
+    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value ) {
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy_initA({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}}); 
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy_initB({{0,0,0}},{{icount+2,jcount+2,kcount+2}},{{Ti,Tj,Tk}}); 
+
+      typedef typename Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > MDRangeType;
+      using tile_type = typename MDRangeType::tile_type;
+      using point_type = typename MDRangeType::point_type;
+
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}} );
+
+      Kokkos::Experimental::md_parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
+      execution_space::fence();
+      Kokkos::Experimental::md_parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
+      execution_space::fence();
+
+    for (int i = 0; i < iter; ++i)
+    {
+      Kokkos::Timer timer;
+      Kokkos::Experimental::md_parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
+      execution_space::fence();
+      const double dt = timer.seconds();
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+
+      //Correctness check - only the first run
+      if ( 0 == i )
+      {
+        long numErrors = 0;
+        host_view_type Ahost("Ahost", icount, jcount, kcount);
+        Kokkos::deep_copy(Ahost, Atest);
+        host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
+        Kokkos::deep_copy(Bhost, Btest);
+
+        // On KNL, this may vectorize - add print statement to prevent
+        // Also, compare against epsilon, as vectorization can change bitwise answer
+        for ( long l = 0; l < static_cast<long>(icount); ++l ) {
+        for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
+        for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
+          ScalarType check  = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+                                        + Bhost(l,j+2,k) + Bhost(l,j+1,k)
+                                        + Bhost(l,j,k+2) + Bhost(l,j,k+1)
+                                        + Bhost(l,j,k) );
+          if ( Ahost(l,j,k) - check != 0 ) {
+            ++numErrors;
+            std::cout << "  Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
+                      << "  multi Ahost = " << Ahost(l,j,k) << "  expected = " << check  
+                      << "  multi Bhost(ijk) = " << Bhost(l,j,k) 
+                      << "  multi Bhost(l+1jk) = " << Bhost(l+1,j,k) 
+                      << "  multi Bhost(l+2jk) = " << Bhost(l+2,j,k) 
+                      << "  multi Bhost(ij+1k) = " << Bhost(l,j+1,k) 
+                      << "  multi Bhost(ij+2k) = " << Bhost(l,j+2,k) 
+                      << "  multi Bhost(ijk+1) = " << Bhost(l,j,k+1) 
+                      << "  multi Bhost(ijk+2) = " << Bhost(l,j,k+2) 
+                      << std::endl;
+            //exit(-1);
+          }
+        } } }
+        if ( numErrors != 0 ) { std::cout << "LR multi: errors " << numErrors << "  range product " << icount*jcount*kcount << "  LL " << jcount*kcount << "  LR " << icount*jcount << std::endl; }
+        //else { std::cout << " multi: No errors!" <<  std::endl; }
+      }
+    } //end for
+
+    } 
+    // LayoutLeft
+    else {
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3,iterate_type::Left,iterate_type::Left>, execution_space > policy_initA({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}}); 
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3,iterate_type::Left,iterate_type::Left>, execution_space > policy_initB({{0,0,0}},{{icount+2,jcount+2,kcount+2}},{{Ti,Tj,Tk}}); 
+
+      //typedef typename Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > MDRangeType;
+      //using tile_type = typename MDRangeType::tile_type;
+      //using point_type = typename MDRangeType::point_type;
+      //Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}} );
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > policy({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}} ); 
+
+      Kokkos::Experimental::md_parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
+      execution_space::fence();
+      Kokkos::Experimental::md_parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
+      execution_space::fence();
+
+    for (int i = 0; i < iter; ++i)
+    {
+      Kokkos::Timer timer;
+      Kokkos::Experimental::md_parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
+      execution_space::fence();
+      const double dt = timer.seconds();
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+
+      //Correctness check - only the first run
+      if ( 0 == i )
+      {
+        long numErrors = 0;
+        host_view_type Ahost("Ahost", icount, jcount, kcount);
+        Kokkos::deep_copy(Ahost, Atest);
+        host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
+        Kokkos::deep_copy(Bhost, Btest);
+
+        // On KNL, this may vectorize - add print statement to prevent
+        // Also, compare against epsilon, as vectorization can change bitwise answer
+        for ( long l = 0; l < static_cast<long>(icount); ++l ) {
+        for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
+        for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
+          ScalarType check  = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+                                        + Bhost(l,j+2,k) + Bhost(l,j+1,k)
+                                        + Bhost(l,j,k+2) + Bhost(l,j,k+1)
+                                        + Bhost(l,j,k) );
+          if ( Ahost(l,j,k) - check != 0 ) {
+            ++numErrors;
+            std::cout << "  Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
+                      << "  multi Ahost = " << Ahost(l,j,k) << "  expected = " << check  
+                      << "  multi Bhost(ijk) = " << Bhost(l,j,k) 
+                      << "  multi Bhost(l+1jk) = " << Bhost(l+1,j,k) 
+                      << "  multi Bhost(l+2jk) = " << Bhost(l+2,j,k) 
+                      << "  multi Bhost(ij+1k) = " << Bhost(l,j+1,k) 
+                      << "  multi Bhost(ij+2k) = " << Bhost(l,j+2,k) 
+                      << "  multi Bhost(ijk+1) = " << Bhost(l,j,k+1) 
+                      << "  multi Bhost(ijk+2) = " << Bhost(l,j,k+2) 
+                      << std::endl;
+            //exit(-1);
+          }
+        } } }
+        if ( numErrors != 0 ) { std::cout << " LL multi run: errors " << numErrors << "  range product " << icount*jcount*kcount << "  LL " << jcount*kcount << "  LR " << icount*jcount << std::endl; }
+        //else { std::cout << " multi: No errors!" <<  std::endl; }
+
+      }
+    } //end for
+    }
+
+    return dt_min;
+  } 
+
+};
+
+
+template< class DeviceType 
+        , typename ScalarType = double  
+        , typename TestLayout = Kokkos::LayoutRight  
+        >
+struct RangePolicyCollapseTwo
+{
+  // 1-D RangePolicy over a 3-D range that collapses two dims into the flat index (analogous to Rank<2>); the third dim is looped inside the functor
+
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type  size_type;
+  typedef TestLayout layout;
+
+  using iterate_type = Kokkos::Experimental::Iterate;
+
+  typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
+  typedef typename view_type::HostMirror host_view_type;
+
+  view_type A;
+  view_type B;
+  const long irange;
+  const long jrange;
+  const long krange;
+
+  RangePolicyCollapseTwo(view_type & A_, const view_type & B_, const long &irange_,  const long &jrange_, const long &krange_)
+  : A(A_), B(B_) , irange(irange_), jrange(jrange_), krange(krange_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const long r) const
+  {
+    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
+    {
+//id(i,j,k) = k + j*Nk + i*Nk*Nj = k + Nk*(j + i*Nj) = k + Nk*r
+//r = j + i*Nj
+      long i = int(r / jrange); 
+      long j = int( r - i*jrange);
+      for (int k = 0; k < krange; ++k) {
+        A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+                                 + B(i,j+2,k) + B(i,j+1,k)
+                                 + B(i,j,k+2) + B(i,j,k+1)
+                                 + B(i,j,k) );
+      }
+    }
+    else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
+    {
+//id(i,j,k) = i + j*Ni + k*Ni*Nj = i + Ni*(j + k*Nj) = i + Ni*r
+//r = j + k*Nj
+      long k = int(r / jrange); 
+      long j = int( r - k*jrange);
+      for (int i = 0; i < irange; ++i) {
+        A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+                                 + B(i,j+2,k) + B(i,j+1,k)
+                                 + B(i,j,k+2) + B(i,j,k+1)
+                                 + B(i,j,k) );
+      }
+    }
+  }
+
+
+  struct Init
+  {
+    view_type input;
+    const long irange;
+    const long jrange;
+    const long krange;
+
+    Init(const view_type & input_, const long &irange_,  const long &jrange_, const long &krange_)
+    : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const long r) const
+    {
+      if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
+      {
+        long i = int(r / jrange); 
+        long j = int( r - i*jrange);
+        for (int k = 0; k < krange; ++k) {
+          input(i,j,k) = 1;
+        }
+      }
+      else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
+      {
+        long k = int(r / jrange); 
+        long j = int( r - k*jrange);
+        for (int i = 0; i < irange; ++i) {
+          input(i,j,k) = 1;
+        }
+      }
+    }
+  };
+
+
+  static double test_index_collapse_two(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1)
+  {
+    // This test collapses two of the three dims into a 1-D RangePolicy; the remaining dim is looped inside the functor
+    view_type Atest("Atest", icount, jcount, kcount);
+    view_type Btest("Btest", icount+2, jcount+2, kcount+2);
+    typedef RangePolicyCollapseTwo<execution_space,ScalarType,TestLayout> FunctorType;
+
+    long collapse_index_rangeA = 0;
+    long collapse_index_rangeB = 0;
+    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value ) {
+      collapse_index_rangeA = icount*jcount;
+      collapse_index_rangeB = (icount+2)*(jcount+2);
+//      std::cout << "   LayoutRight " << std::endl;
+    } else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value ) {
+      collapse_index_rangeA = kcount*jcount;
+      collapse_index_rangeB = (kcount+2)*(jcount+2);
+//      std::cout << "   LayoutLeft " << std::endl;
+    } else {
+      std::cout << "  LayoutRight or LayoutLeft required - will pass 0 as range instead " << std::endl;
+      exit(-1);
+    }
+
+    Kokkos::RangePolicy<execution_space> policy(0, (collapse_index_rangeA) );
+    Kokkos::RangePolicy<execution_space> policy_initB(0, (collapse_index_rangeB) );
+
+    double dt_min = 0;
+
+    Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
+    execution_space::fence();
+    Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
+    execution_space::fence();
+
+    for (int i = 0; i < iter; ++i)
+    {
+      Kokkos::Timer timer;
+      Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
+      execution_space::fence();
+      const double dt = timer.seconds();
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+
+      //Correctness check - first iteration only
+      if ( 0 == i )
+      {
+        long numErrors = 0;
+        host_view_type Ahost("Ahost", icount, jcount, kcount);
+        Kokkos::deep_copy(Ahost, Atest);
+        host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
+        Kokkos::deep_copy(Bhost, Btest);
+
+        // On KNL, this may vectorize - add print statement to prevent
+        // Also, compare against epsilon, as vectorization can change bitwise answer
+        for ( long l = 0; l < static_cast<long>(icount); ++l ) {
+        for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
+        for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
+          ScalarType check  = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+                                        + Bhost(l,j+2,k) + Bhost(l,j+1,k)
+                                        + Bhost(l,j,k+2) + Bhost(l,j,k+1)
+                                        + Bhost(l,j,k) );
+          if ( Ahost(l,j,k) - check != 0 ) {
+            ++numErrors;
+            std::cout << "  Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
+                      << "  flat Ahost = " << Ahost(l,j,k) << "  expected = " << check  << std::endl;
+            //exit(-1);
+          }
+        } } }
+        if ( numErrors != 0 ) { std::cout << " RP collapse2: errors " << numErrors << "  range product " << icount*jcount*kcount << "  LL " << jcount*kcount << "  LR " << icount*jcount << std::endl; }
+        //else { std::cout << " RP collapse2: Pass! " << std::endl; }
+      }
+    }
+
+    return dt_min;
+  } 
+
+};
+
+
+template< class DeviceType 
+        , typename ScalarType = double  
+        , typename TestLayout = Kokkos::LayoutRight  
+        >
+struct RangePolicyCollapseAll
+{
+  // RangePolicy for 3D range, but will collapse all dims
+
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type  size_type;
+  typedef TestLayout layout;
+
+  typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
+  typedef typename view_type::HostMirror host_view_type;
+
+  view_type A;
+  view_type B;
+  const long irange;
+  const long jrange;
+  const long krange;
+
+  RangePolicyCollapseAll(view_type & A_, const view_type & B_, const long &irange_,  const long &jrange_, const long &krange_)
+  : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const long r) const
+  {
+    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
+    {
+      long i = int(r / (jrange*krange)); 
+      long j = int(( r - i*jrange*krange)/krange);
+      long k = int(r - i*jrange*krange - j*krange);
+        A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+            + B(i,j+2,k) + B(i,j+1,k)
+            + B(i,j,k+2) + B(i,j,k+1)
+            + B(i,j,k) );
+    }
+    else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
+    {
+      long k = int(r / (irange*jrange)); 
+      long j = int(( r - k*irange*jrange)/irange);
+      long i = int(r - k*irange*jrange - j*irange);
+        A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+            + B(i,j+2,k) + B(i,j+1,k)
+            + B(i,j,k+2) + B(i,j,k+1)
+            + B(i,j,k) );
+    }
+  }
+
+
+  struct Init
+  {
+    view_type input;
+    const long irange;
+    const long jrange;
+    const long krange;
+
+    Init(const view_type & input_, const long &irange_,  const long &jrange_, const long &krange_)
+    : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const long r) const
+    {
+      if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
+      {
+        long i = int(r / (jrange*krange)); 
+        long j = int(( r - i*jrange*krange)/krange);
+        long k = int(r - i*jrange*krange - j*krange);
+        input(i,j,k) = 1;
+      }
+      else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
+      {
+        long k = int(r / (irange*jrange));
+        long j = int(( r - k*irange*jrange)/irange);
+        long i = int(r - k*irange*jrange - j*irange);
+        input(i,j,k) = 1;
+      }
+    }
+  };
+
+
+  static double test_collapse_all(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1)
+  {
+    // This test collapses all three dims into a single 1-D RangePolicy index; the functor de-linearizes i, j, k per the layout
+    view_type Atest("Atest", icount, jcount, kcount);
+    view_type Btest("Btest", icount+2, jcount+2, kcount+2);
+    typedef RangePolicyCollapseAll<execution_space,ScalarType,TestLayout> FunctorType;
+
+    const long flat_index_range = icount*jcount*kcount;
+    Kokkos::RangePolicy<execution_space> policy(0, flat_index_range );
+    Kokkos::RangePolicy<execution_space> policy_initB(0, (icount+2)*(jcount+2)*(kcount+2) );
+
+    double dt_min = 0;
+
+    Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
+    execution_space::fence();
+    Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
+    execution_space::fence();
+
+    for (int i = 0; i < iter; ++i)
+    {
+      Kokkos::Timer timer;
+      Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
+      execution_space::fence();
+      const double dt = timer.seconds();
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+
+      //Correctness check - first iteration only
+      if ( 0 == i )
+      {
+        long numErrors = 0;
+        host_view_type Ahost("Ahost", icount, jcount, kcount);
+        Kokkos::deep_copy(Ahost, Atest);
+        host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
+        Kokkos::deep_copy(Bhost, Btest);
+
+        // On KNL, this may vectorize - add print statement to prevent
+        // Also, compare against epsilon, as vectorization can change bitwise answer
+        for ( long l = 0; l < static_cast<long>(icount); ++l ) {
+        for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
+        for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
+          ScalarType check  = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+                                        + Bhost(l,j+2,k) + Bhost(l,j+1,k)
+                                        + Bhost(l,j,k+2) + Bhost(l,j,k+1)
+                                        + Bhost(l,j,k) );
+          if ( Ahost(l,j,k) - check != 0 ) {
+            ++numErrors;
+            std::cout << "  Callapse ALL Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
+                      << "  flat Ahost = " << Ahost(l,j,k) << "  expected = " << check  << std::endl;
+            //exit(-1);
+          }
+        } } }
+        if ( numErrors != 0 ) { std::cout << " RP collapse all: errors " << numErrors << "  range product " << icount*jcount*kcount << "  LL " << jcount*kcount << "  LR " << icount*jcount << std::endl; }
+        //else { std::cout << " RP collapse all: Pass! " << std::endl; }
+      }
+    }
+
+    return dt_min;
+  } 
+
+};
+
+} //end namespace Test
diff --git a/packages/kokkos/core/perf_test/PerfTestMain.cpp b/packages/kokkos/core/perf_test/PerfTestMain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f1f8a50f1b35fd1f3935e4097d679aa7b653a14b
--- /dev/null
+++ b/packages/kokkos/core/perf_test/PerfTestMain.cpp
@@ -0,0 +1,80 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+#include <cstdlib>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
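+// Stash argc/argv from main() in function-local statics so individual perf tests can read
+// optional extra arguments (e.g. problem sizes and trial counts) later.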
+int command_line_num_args(int n = 0) {
+  static int n_args = 0;
+  if(n>0)
+    n_args = n;
+  return n_args;
+}
+
+const char* command_line_arg(int k, char** input_args = NULL) {
+  static char** args;
+  if(input_args != NULL)
+    args = input_args;
+  if(command_line_num_args() > k)
+    return args[k];
+  else
+    return NULL;
+}
+
+}
+
+int main(int argc, char *argv[]) {
+  ::testing::InitGoogleTest(&argc,argv);
+  Kokkos::initialize(argc,argv);
+
+  (void) Test::command_line_num_args(argc);
+  (void) Test::command_line_arg(0,argv);
+
+  int result = RUN_ALL_TESTS();
+
+  Kokkos::finalize();
+  return result;
+}
diff --git a/packages/kokkos/core/perf_test/PerfTest_Category.hpp b/packages/kokkos/core/perf_test/PerfTest_Category.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9126457bb060bacd86f2226c3acfb1f5eb33e586
--- /dev/null
+++ b/packages/kokkos/core/perf_test/PerfTest_Category.hpp
@@ -0,0 +1,68 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_THREADS_HPP
+#define KOKKOS_TEST_THREADS_HPP
+
+#include <gtest/gtest.h>
+
+namespace Test {
+
+extern int command_line_num_args(int n = 0);
+extern const char* command_line_arg(int k, char** input_args = NULL);
+
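+// Shared GoogleTest fixture for the perf tests; the set-up and tear-down hooks are intentionally empty.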
+class default_exec : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {
+  }
+
+  static void TearDownTestCase() {
+  }
+};
+
+} // namespace Test
+
+#define TEST_CATEGORY default_exec
+#define TEST_EXECSPACE Kokkos::DefaultExecutionSpace
+
+#endif
diff --git a/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp b/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..72aea39f516f7722cb39d83e86f19be0a46191e7
--- /dev/null
+++ b/packages/kokkos/core/perf_test/PerfTest_CustomReduction.cpp
@@ -0,0 +1,115 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <gtest/gtest.h>
+#include <PerfTest_Category.hpp>
+#include <Kokkos_Random.hpp>
+
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+namespace Test {
+template<class Scalar>
+void custom_reduction_test(int N, int R, int num_trials) {
+  Kokkos::Random_XorShift64_Pool<> rand_pool(183291);
+  Kokkos::View<Scalar*> a("A",N);
+  Kokkos::fill_random(a,rand_pool,1.0);
+
+  Scalar max;
+
+  // Warm up
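+  // Nested parallel_reduce: TeamPolicy -> TeamThreadRange -> ThreadVectorRange, each level
+  // combined with a Kokkos::Experimental::Max reducer; one index is forced to 11.5 so the
+  // expected maximum is known.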
+  Kokkos::parallel_reduce(Kokkos::TeamPolicy<>(N/1024,32), KOKKOS_LAMBDA( const Kokkos::TeamPolicy<>::member_type& team, Scalar& lmax) {
+    Scalar team_max = Scalar(0);
+    for(int rr = 0; rr<R; rr++) {
+    int i = team.league_rank();
+    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,32), [&] (const int& j, Scalar& thread_max) {
+      Scalar t_max = Scalar(0);
+      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,32), [&] (const int& k, Scalar& max_) {
+        const Scalar val =  a((i*32 + j)*32 + k);
+        if(val>lmax) lmax = val;
+        if((k == 11) && (j==17) && (i==2)) lmax = 11.5;
+      },Kokkos::Experimental::Max<Scalar>(t_max));
+      if(t_max>thread_max) thread_max = t_max;
+    },Kokkos::Experimental::Max<Scalar>(team_max));
+    }
+    if(team_max>lmax) lmax = team_max;
+  },Kokkos::Experimental::Max<Scalar>(max));
+
+  // Timing
+  Kokkos::Timer timer;
+  for(int r = 0; r<num_trials; r++) {
+    Kokkos::parallel_reduce(Kokkos::TeamPolicy<>(N/1024,32), KOKKOS_LAMBDA( const Kokkos::TeamPolicy<>::member_type& team, Scalar& lmax) {
+      Scalar team_max = Scalar(0);
+      for(int rr = 0; rr<R; rr++) {
+      int i = team.league_rank();
+      Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,32), [&] (const int& j, Scalar& thread_max) {
+        Scalar t_max = Scalar(0);
+        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,32), [&] (const int& k, Scalar& max_) {
+          const Scalar val =  a((i*32 + j)*32 + k);
+          if(val>lmax) lmax = val;
+          if((k == 11) && (j==17) && (i==2)) lmax = 11.5;
+        },Kokkos::Experimental::Max<Scalar>(t_max));
+        if(t_max>thread_max) thread_max = t_max;
+      },Kokkos::Experimental::Max<Scalar>(team_max));
+      }
+      if(team_max>lmax) lmax = team_max;
+    },Kokkos::Experimental::Max<Scalar>(max));
+  }
+  double time = timer.seconds();
+  printf("%e %e %e\n",time,1.0*N*R*num_trials*sizeof(Scalar)/time/1024/1024/1024,max);
+}
+
+TEST_F( default_exec, custom_reduction ) {
+  int N = 100000;
+  int R = 1000;
+  int num_trials = 1;
+
+  if(command_line_num_args()>1)
+    N = atoi(command_line_arg(1));
+  if(command_line_num_args()>2)
+    R = atoi(command_line_arg(2));
+  if(command_line_num_args()>3)
+    num_trials = atoi(command_line_arg(3));
+  custom_reduction_test<double>(N,R,num_trials);
+}
+}
+#endif
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewCopy.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewCopy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e7c53b84988db8ac6c3e3b3f5d7133aa7f8d1e3e
--- /dev/null
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewCopy.cpp
@@ -0,0 +1,445 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <gtest/gtest.h>
+#include <cstdio>
+#include <PerfTest_Category.hpp>
+
+namespace Test {
+
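+// Time 'repeat' scalar fills of a view via Kokkos::deep_copy; callers divide by the repeat
+// count to get a per-fill time.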
+template<class ViewType>
+double fill_view (ViewType& a, typename ViewType::const_value_type& val, int repeat){
+  Kokkos::Timer timer;
+  for(int i=0; i<repeat; i++) {
+    Kokkos::deep_copy(a,val);
+  }
+  Kokkos::fence();
+  return timer.seconds();
+}
+
+
+template<class Layout>
+void run_fillview_tests(int N, int R) {
+  const int N1 = N;
+  const int N2 = N*N;
+  const int N3 = N2*N;
+  const int N4 = N2*N2;
+  const int N8 = N4*N4;
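+  // Every rank covers the same total of N^8 elements, just split differently across dimensions.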
+
+  double time1,time2,time3,time4,time5,time6,time7,time8,time_raw = 100000.0;
+  {
+    Kokkos::View<double*,Layout> a("A1",N8);
+    time1 = fill_view(a,1.1,R)/R;
+  }
+  {
+    Kokkos::View<double**,Layout> a("A2",N4,N4);
+    time2 = fill_view(a,1.1,R)/R;
+  }
+  {
+    Kokkos::View<double***,Layout> a("A3",N3,N3,N2);
+    time3 = fill_view(a,1.1,R)/R;
+  }
+  {
+    Kokkos::View<double****,Layout> a("A4",N2,N2,N2,N2);
+    time4 = fill_view(a,1.1,R)/R;
+  }
+  {
+    Kokkos::View<double*****,Layout> a("A5",N2,N2,N1,N1,N2);
+    time5 = fill_view(a,1.1,R)/R;
+  }
+  {
+    Kokkos::View<double******,Layout> a("A6",N2,N1,N1,N1,N1,N2);
+    time6 = fill_view(a,1.1,R)/R;
+  }
+  {
+    Kokkos::View<double*******,Layout> a("A7",N2,N1,N1,N1,N1,N1,N1);
+    time7 = fill_view(a,1.1,R)/R;
+  }
+  {
+    Kokkos::View<double********,Layout> a("A8",N1,N1,N1,N1,N1,N1,N1,N1);
+    time8 = fill_view(a,1.1,R)/R;
+  }
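+  // Raw-pointer baseline: fill the same data through a plain parallel_for; only compiled
+  // when device lambdas are available.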
+  #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
+  {
+    Kokkos::View<double*,Layout> a("A1",N8);
+    double* a_ptr = a.data();
+    Kokkos::Timer timer;
+    for(int r=0;r<R;r++) {
+      Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
+        a_ptr[i] = 1.1;
+      });
+    }
+    time_raw = timer.seconds()/R;
+  }
+  #endif
+  double size = 1.0*N8*8/1024/1024;
+  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n",time_raw,size,size/1024/time_raw);
+  printf("   Rank1: %lf s   %lf MB   %lf GB/s\n",time1,size,size/1024/time1);
+  printf("   Rank2: %lf s   %lf MB   %lf GB/s\n",time2,size,size/1024/time2);
+  printf("   Rank3: %lf s   %lf MB   %lf GB/s\n",time3,size,size/1024/time3);
+  printf("   Rank4: %lf s   %lf MB   %lf GB/s\n",time4,size,size/1024/time4);
+  printf("   Rank5: %lf s   %lf MB   %lf GB/s\n",time5,size,size/1024/time5);
+  printf("   Rank6: %lf s   %lf MB   %lf GB/s\n",time6,size,size/1024/time6);
+  printf("   Rank7: %lf s   %lf MB   %lf GB/s\n",time7,size,size/1024/time7);
+  printf("   Rank8: %lf s   %lf MB   %lf GB/s\n",time8,size,size/1024/time8);
+}
+
+TEST_F( default_exec, ViewFill ) {
+  printf("ViewFill Performance for LayoutLeft:\n");
+  run_fillview_tests<Kokkos::LayoutLeft>(10,1);
+  printf("ViewFill Performance for LayoutRight:\n");
+  run_fillview_tests<Kokkos::LayoutRight>(10,1);
+}
+
+template<class Layout>
+void run_allocateview_tests(int N, int R) {
+  const int N1 = N;
+  const int N2 = N*N;
+  const int N3 = N2*N;
+  const int N4 = N2*N2;
+  const int N8 = N4*N4;
+
+  double time1,time2,time3,time4,time5,time6,time7,time8,time_raw = 100000.0;
+  {
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double*,Layout> a("A1",N8);
+    }
+    time1 = timer.seconds()/R;
+  }
+  {
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double**,Layout> a("A2",N4,N4);
+    }
+    time2 = timer.seconds()/R;
+  }
+  {
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double***,Layout> a("A3",N3,N3,N2);
+    }
+    time3 = timer.seconds()/R;
+  }
+  {
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double****,Layout> a("A4",N2,N2,N2,N2);
+    }
+    time4 = timer.seconds()/R;
+  }
+  {
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double*****,Layout> a("A5",N2,N2,N1,N1,N2);
+    }
+    time5 = timer.seconds()/R;
+  }
+  {
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double******,Layout> a("A6",N2,N1,N1,N1,N1,N2);
+    }
+    time6 = timer.seconds()/R;
+  }
+  {
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double*******,Layout> a("A7",N2,N1,N1,N1,N1,N1,N1);
+    }
+    time7 = timer.seconds()/R;
+  }
+  {
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double********,Layout> a("A8",N1,N1,N1,N1,N1,N1,N1,N1);
+    }
+    time8 = timer.seconds()/R;
+  }
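+  // Raw baseline: kokkos_malloc plus a zero-initializing parallel_for, roughly mirroring
+  // the default initialization a labeled View performs.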
+  #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
+  {
+    Kokkos::Timer timer;
+    for(int r=0;r<R;r++) {
+      double* a_ptr = (double*) Kokkos::kokkos_malloc("A", sizeof(double)*N8);
+      Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
+        a_ptr[i] = 0.0;
+      });
+      Kokkos::kokkos_free(a_ptr);
+    }
+    time_raw = timer.seconds()/R;
+  }
+  #endif
+  double size = 1.0*N8*8/1024/1024;
+  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n",time_raw,size,size/1024/time_raw);
+  printf("   Rank1: %lf s   %lf MB   %lf GB/s\n",time1,size,size/1024/time1);
+  printf("   Rank2: %lf s   %lf MB   %lf GB/s\n",time2,size,size/1024/time2);
+  printf("   Rank3: %lf s   %lf MB   %lf GB/s\n",time3,size,size/1024/time3);
+  printf("   Rank4: %lf s   %lf MB   %lf GB/s\n",time4,size,size/1024/time4);
+  printf("   Rank5: %lf s   %lf MB   %lf GB/s\n",time5,size,size/1024/time5);
+  printf("   Rank6: %lf s   %lf MB   %lf GB/s\n",time6,size,size/1024/time6);
+  printf("   Rank7: %lf s   %lf MB   %lf GB/s\n",time7,size,size/1024/time7);
+  printf("   Rank8: %lf s   %lf MB   %lf GB/s\n",time8,size,size/1024/time8);
+}
+
+TEST_F( default_exec, ViewCreate ) {
+  printf("Create View Performance for LayoutLeft:\n");
+  run_allocateview_tests<Kokkos::LayoutLeft>(10,1);
+  printf("Create View Performance for LayoutRight:\n");
+  run_allocateview_tests<Kokkos::LayoutRight>(10,1);
+}
+
+template<class ViewTypeA, class ViewTypeB>
+double deepcopy_view (ViewTypeA& a, ViewTypeB& b, int repeat){
+  Kokkos::Timer timer;
+  for(int i=0; i<repeat; i++) {
+    Kokkos::deep_copy(a,b);
+  }
+  Kokkos::fence();
+  return timer.seconds();
+}
+
+
+template<class LayoutA, class LayoutB>
+void run_deepcopyview_tests(int N, int R) {
+  const int N1 = N;
+  const int N2 = N*N;
+  const int N3 = N2*N;
+  const int N4 = N2*N2;
+  const int N8 = N4*N4;
+
+  double time1,time2,time3,time4,time5,time6,time7,time8,time_raw = 100000.0;
+  {
+    Kokkos::View<double*,LayoutA> a("A1",N8);
+    Kokkos::View<double*,LayoutB> b("B1",N8);
+    time1 = deepcopy_view(a,b,R)/R;
+  }
+  {
+    Kokkos::View<double**,LayoutA> a("A2",N4,N4);
+    Kokkos::View<double**,LayoutB> b("B2",N4,N4);
+    time2 = deepcopy_view(a,b,R)/R;
+  }
+  {
+    Kokkos::View<double***,LayoutA> a("A3",N3,N3,N2);
+    Kokkos::View<double***,LayoutB> b("B3",N3,N3,N2);
+    time3 = deepcopy_view(a,b,R)/R;
+  }
+  {
+    Kokkos::View<double****,LayoutA> a("A4",N2,N2,N2,N2);
+    Kokkos::View<double****,LayoutB> b("B4",N2,N2,N2,N2);
+    time4 = deepcopy_view(a,b,R)/R;
+  }
+  {
+    Kokkos::View<double*****,LayoutA> a("A5",N2,N2,N1,N1,N2);
+    Kokkos::View<double*****,LayoutB> b("B5",N2,N2,N1,N1,N2);
+    time5 = deepcopy_view(a,b,R)/R;
+  }
+  {
+    Kokkos::View<double******,LayoutA> a("A6",N2,N1,N1,N1,N1,N2);
+    Kokkos::View<double******,LayoutB> b("B6",N2,N1,N1,N1,N1,N2);
+    time6 = deepcopy_view(a,b,R)/R;
+  }
+  {
+    Kokkos::View<double*******,LayoutA> a("A7",N2,N1,N1,N1,N1,N1,N1);
+    Kokkos::View<double*******,LayoutB> b("B7",N2,N1,N1,N1,N1,N1,N1);
+    time7 = deepcopy_view(a,b,R)/R;
+  }
+  {
+    Kokkos::View<double********,LayoutA> a("A8",N1,N1,N1,N1,N1,N1,N1,N1);
+    Kokkos::View<double********,LayoutB> b("B8",N1,N1,N1,N1,N1,N1,N1,N1);
+    time8 = deepcopy_view(a,b,R)/R;
+  }
+  #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
+  {
+    Kokkos::View<double*,LayoutA> a("A1",N8);
+    Kokkos::View<double*,LayoutB> b("B1",N8);
+    double* const a_ptr = a.data();
+    const double* const b_ptr = b.data();
+    Kokkos::Timer timer;
+    for(int r=0;r<R;r++) {
+      Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
+        a_ptr[i] = b_ptr[i];
+      });
+    }
+    time_raw = timer.seconds()/R;
+  }
+  #endif
+  double size = 1.0*N8*8/1024/1024;
+  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n",time_raw,size,2.0*size/1024/time_raw);
+  printf("   Rank1: %lf s   %lf MB   %lf GB/s\n",time1,size,2.0*size/1024/time1);
+  printf("   Rank2: %lf s   %lf MB   %lf GB/s\n",time2,size,2.0*size/1024/time2);
+  printf("   Rank3: %lf s   %lf MB   %lf GB/s\n",time3,size,2.0*size/1024/time3);
+  printf("   Rank4: %lf s   %lf MB   %lf GB/s\n",time4,size,2.0*size/1024/time4);
+  printf("   Rank5: %lf s   %lf MB   %lf GB/s\n",time5,size,2.0*size/1024/time5);
+  printf("   Rank6: %lf s   %lf MB   %lf GB/s\n",time6,size,2.0*size/1024/time6);
+  printf("   Rank7: %lf s   %lf MB   %lf GB/s\n",time7,size,2.0*size/1024/time7);
+  printf("   Rank8: %lf s   %lf MB   %lf GB/s\n",time8,size,2.0*size/1024/time8);
+}
+
+TEST_F( default_exec, ViewDeepCopy ) {
+  printf("DeepCopy Performance for LayoutLeft to LayoutLeft:\n");
+  run_deepcopyview_tests<Kokkos::LayoutLeft,Kokkos::LayoutLeft>(10,1);
+  printf("DeepCopy Performance for LayoutRight to LayoutRight:\n");
+  run_deepcopyview_tests<Kokkos::LayoutRight,Kokkos::LayoutRight>(10,1);
+  printf("DeepCopy Performance for LayoutLeft to LayoutRight:\n");
+  run_deepcopyview_tests<Kokkos::LayoutLeft,Kokkos::LayoutRight>(10,1);
+  printf("DeepCopy Performance for LayoutRight to LayoutLeft:\n");
+  run_deepcopyview_tests<Kokkos::LayoutRight,Kokkos::LayoutLeft>(10,1);
+}
+
+template<class Layout>
+void run_resizeview_tests(int N, int R) {
+  const int N1 = N;
+  const int N2 = N*N;
+  const int N3 = N2*N;
+  const int N4 = N2*N2;
+  const int N8 = N4*N4;
+
+  double time1, time2, time3, time4, time5, time6, time7, time8;
+  double time_raw = 100000.0;  // sentinel kept if the raw-pointer variant below is compiled out
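+  // Each timed iteration copies the View handle (a_ shares a's allocation) and resizes the
+  // copy, growing the leading extent by ~10%; Kokkos::resize allocates fresh storage and
+  // copies the old contents, so the source View a is left unchanged for the next iteration.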
+  {
+    Kokkos::View<double*,Layout> a("A1",N8);
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double*,Layout> a_(a);
+      Kokkos::resize(a_,int(N8*1.1));
+    }
+    time1 = timer.seconds()/R;
+  }
+  {
+    Kokkos::View<double**,Layout> a("A2",N4,N4);
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double**,Layout> a_(a);
+      Kokkos::resize(a_,int(N4*1.1),N4);
+    }
+    time2 = timer.seconds()/R;
+  }
+  {
+    Kokkos::View<double***,Layout> a("A3",N3,N3,N2);
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double***,Layout> a_(a);
+      Kokkos::resize(a_,int(N3*1.1),N3,N2);
+    }
+    time3 = timer.seconds()/R;
+  }
+  {
+    Kokkos::View<double****,Layout> a("A4",N2,N2,N2,N2);
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double****,Layout> a_(a);
+      Kokkos::resize(a_,int(N2*1.1),N2,N2,N2);
+    }
+    time4 = timer.seconds()/R;
+  }
+  {
+    Kokkos::View<double*****,Layout> a("A5",N2,N2,N1,N1,N2);
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double*****,Layout> a_(a);
+      Kokkos::resize(a_,int(N2*1.1),N2,N1,N1,N2);
+    }
+    time5 = timer.seconds()/R;
+  }
+  {
+    Kokkos::View<double******,Layout> a("A6",N2,N1,N1,N1,N1,N2);
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double******,Layout> a_(a);
+      Kokkos::resize(a_,int(N2*1.1),N1,N1,N1,N1,N2);
+    }
+    time6 = timer.seconds()/R;
+  }
+  {
+    Kokkos::View<double*******,Layout> a("A7",N2,N1,N1,N1,N1,N1,N1);
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double*******,Layout> a_(a);
+      Kokkos::resize(a_,int(N2*1.1),N1,N1,N1,N1,N1,N1);
+    }
+    time7 = timer.seconds()/R;
+  }
+  {
+    Kokkos::View<double********,Layout> a("A8",N1,N1,N1,N1,N1,N1,N1,N1);
+    Kokkos::Timer timer;
+    for(int r=0; r<R; r++) {
+      Kokkos::View<double********,Layout> a_(a);
+      Kokkos::resize(a_,int(N1*1.1),N1,N1,N1,N1,N1,N1,N1);
+    }
+    time8 = timer.seconds()/R;
+  }
+  #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
+  {
+    Kokkos::View<double*,Layout> a("A1",N8);
+    double* a_ptr = a.data();
+    Kokkos::Timer timer;
+    for(int r=0;r<R;r++) {
+      Kokkos::View<double*,Layout> a1(Kokkos::ViewAllocateWithoutInitializing("A1"),int(N8*1.1));
+      double* a1_ptr = a1.data();
+      Kokkos::parallel_for(N8, KOKKOS_LAMBDA (const int& i) {
+        a1_ptr[i] = a_ptr[i];
+      });
+    }
+    time_raw = timer.seconds()/R;
+  }
+  #endif
+  double size = 1.0*N8*8/1024/1024;
+  printf("   Raw:   %lf s   %lf MB   %lf GB/s\n",time_raw,size,2.0*size/1024/time_raw);
+  printf("   Rank1: %lf s   %lf MB   %lf GB/s\n",time1,size,2.0*size/1024/time1);
+  printf("   Rank2: %lf s   %lf MB   %lf GB/s\n",time2,size,2.0*size/1024/time2);
+  printf("   Rank3: %lf s   %lf MB   %lf GB/s\n",time3,size,2.0*size/1024/time3);
+  printf("   Rank4: %lf s   %lf MB   %lf GB/s\n",time4,size,2.0*size/1024/time4);
+  printf("   Rank5: %lf s   %lf MB   %lf GB/s\n",time5,size,2.0*size/1024/time5);
+  printf("   Rank6: %lf s   %lf MB   %lf GB/s\n",time6,size,2.0*size/1024/time6);
+  printf("   Rank7: %lf s   %lf MB   %lf GB/s\n",time7,size,2.0*size/1024/time7);
+  printf("   Rank8: %lf s   %lf MB   %lf GB/s\n",time8,size,2.0*size/1024/time8);
+}
+
+TEST_F( default_exec, ViewResize ) {
+  printf("Resize View Performance for LayoutLeft:\n");
+  run_resizeview_tests<Kokkos::LayoutLeft>(10,1);
+  printf("Resize View Performance for LayoutRight:\n");
+  run_resizeview_tests<Kokkos::LayoutRight>(10,1);
+}
+
+}
diff --git a/packages/kokkos/core/perf_test/run_mempool.sh b/packages/kokkos/core/perf_test/run_mempool.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e9b42c5a53fad43e2fc486fbf500660d1e932c97
--- /dev/null
+++ b/packages/kokkos/core/perf_test/run_mempool.sh
@@ -0,0 +1,25 @@
+#!/bin/bash -e
+NT=$1
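+# Usage (assumed): ./run_mempool.sh <num_threads>
+# $1 is forwarded to --kokkos-threads; after each run, postproc records the last word of
+# the log's first and last lines into xvals and yvals respectively.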
+PROG="./KokkosCore_PerformanceTest_Mempool"
+COMMON_ARGS="--kokkos-threads=$NT --fill_stride=1 --fill_level=70 --chunk_span=5 --repeat_inner=100"
+
+postproc() {
+cat log | head -n 1 | rev | cut -d ' ' -f 1 | rev >> xvals
+cat log | tail -n 1 | rev | cut -d ' ' -f 1 | rev >> yvals
+}
+
+for yset in 1 2 3
+do
+  rm -f xvals yvals
+  for x in 1 2 4 8 16 32
+  do
+    echo "yset $yset x factor $x"
+    $PROG $COMMON_ARGS --alloc_size=`expr $x \* 1000000` --super_size=`expr $x \* 100000` > log
+    postproc
+  done
+  rm -f yvals$yset
+  mv yvals yvals$yset
+done
+
+rm -f datapoints
+paste -d',' xvals yvals1 yvals2 yvals3 > datapoints
diff --git a/packages/kokkos/core/perf_test/run_mempool_fill.sh b/packages/kokkos/core/perf_test/run_mempool_fill.sh
new file mode 100755
index 0000000000000000000000000000000000000000..cdd756b4873915a99d4531e260704640e7749fee
--- /dev/null
+++ b/packages/kokkos/core/perf_test/run_mempool_fill.sh
@@ -0,0 +1,21 @@
+#!/bin/bash -e
+NT=$1
+PROG="./KokkosCore_PerformanceTest_Mempool"
+COMMON_ARGS="--kokkos-threads=$NT --fill_stride=1 --alloc_size=10027008 --super_size=65536 --repeat_inner=100 --chunk_span=4 --repeat_outer=10"
+
+postproc() {
+cat log | grep "fill ops per second" | rev | cut -d ' ' -f 2 | rev >> yvals_fill
+cat log | grep "cycle ops per second" | rev | cut -d ' ' -f 2 | rev >> yvals_cycle
+}
+
+rm -f xvals yvals_fill yvals_cycle
+for x in 75 95
+do
+  echo "test fill level $x"
+  echo $x >> xvals
+  $PROG $COMMON_ARGS --fill_level=$x 2>&1 | tee log
+  postproc
+done
+
+rm -f datapoints.txt
+paste xvals yvals_fill yvals_cycle > datapoints.txt
diff --git a/packages/kokkos/core/perf_test/run_taskdag.sh b/packages/kokkos/core/perf_test/run_taskdag.sh
new file mode 100755
index 0000000000000000000000000000000000000000..dcb016c9d54cc5a8111f07b47c6d769098681253
--- /dev/null
+++ b/packages/kokkos/core/perf_test/run_taskdag.sh
@@ -0,0 +1,21 @@
+#!/bin/bash -e
+NT=$1
+PROG="./KokkosCore_PerformanceTest_TaskDAG"
+COMMON_ARGS="--kokkos-threads=$NT --alloc_size=10027008 --super_size=65536 --repeat_outer=10"
+
+postproc() {
+cat log | grep "tasks per second" | rev | cut -d ' ' -f 2 | rev >> yvals
+}
+
+rm -f xvals yvals
+for x in 21 23
+do
+  echo "test input $x"
+  echo $x >> xvals
+  $PROG $COMMON_ARGS --input=$x 2>&1 | tee log
+  postproc
+done
+
+rm -f datapoints.txt
+paste xvals yvals > datapoints.txt
+
diff --git a/packages/kokkos/core/perf_test/test_atomic.cpp b/packages/kokkos/core/perf_test/test_atomic.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6bb22e4e301aa3ef190e17fb9f292f50bd8c3d69
--- /dev/null
+++ b/packages/kokkos/core/perf_test/test_atomic.cpp
@@ -0,0 +1,507 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Timer.hpp>
+
+typedef Kokkos::DefaultExecutionSpace exec_space;
+
+#define RESET		0
+#define BRIGHT 		1
+#define DIM		2
+#define UNDERLINE 	3
+#define BLINK		4
+#define REVERSE		7
+#define HIDDEN		8
+
+#define BLACK 		0
+#define RED		1
+#define GREEN		2
+#define YELLOW		3
+#define BLUE		4
+#define MAGENTA		5
+#define CYAN		6
+#define GREY		7
+#define	WHITE		8
+
+void textcolor(int attr, int fg, int bg)
+{
+  char command[13];
+
+  /* command holds the ANSI escape sequence sent to the terminal */
+  sprintf(command, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40);
+  printf("%s", command);
+}
+void textcolor_standard() {textcolor(RESET, BLACK, WHITE);}
+
+
+template<class T,class DEVICE_TYPE>
+struct ZeroFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef typename Kokkos::View<T,execution_space> type;
+  typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
+  type data;
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+    data() = 0;
+  }
+};
+
+//---------------------------------------------------
+//--------------atomic_fetch_add---------------------
+//---------------------------------------------------
+
+template<class T,class DEVICE_TYPE>
+struct AddFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+    Kokkos::atomic_fetch_add(&data(),(T)1);
+  }
+};
+
+template<class T>
+T AddLoop(int loop) {
+  struct ZeroFunctor<T,exec_space> f_zero;
+  typename ZeroFunctor<T,exec_space>::type data("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  struct AddFunctor<T,exec_space> f_add;
+  f_add.data = data;
+  Kokkos::parallel_for(loop,f_add);
+  exec_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+  return val;
+}
+
+template<class T,class DEVICE_TYPE>
+struct AddNonAtomicFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+    data()+=(T)1;
+  }
+};
+
+template<class T>
+T AddLoopNonAtomic(int loop) {
+  struct ZeroFunctor<T,exec_space> f_zero;
+  typename ZeroFunctor<T,exec_space>::type data("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
+
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  struct AddNonAtomicFunctor<T,exec_space> f_add;
+  f_add.data = data;
+  Kokkos::parallel_for(loop,f_add);
+  exec_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+
+  return val;
+}
+
+template<class T>
+T AddLoopSerial(int loop) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  for(int i=0;i<loop;i++)
+  *data+=(T)1;
+
+  T val = *data;
+  delete [] data;
+  return val;
+}
+
+template<class T,class DEVICE_TYPE>
+struct CASFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+	  T old = data();
+	  T newval, assumed;
+	  do {
+	    assumed = old;
+	    newval = assumed + (T)1;
+	    old = Kokkos::atomic_compare_exchange(&data(), assumed, newval);
+	  }
+	  while( old != assumed );
+  }
+};
+
+template<class T>
+T CASLoop(int loop) {
+  struct ZeroFunctor<T,exec_space> f_zero;
+  typename ZeroFunctor<T,exec_space>::type data("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  struct CASFunctor<T,exec_space> f_cas;
+  f_cas.data = data;
+  Kokkos::parallel_for(loop,f_cas);
+  exec_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+
+  return val;
+}
+
+template<class T,class DEVICE_TYPE>
+struct CASNonAtomicFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+	  volatile T assumed;
+	  volatile T newval;
+	  bool fail=1;
+	  do {
+	    assumed = data();
+	    newval = assumed + (T)1;
+	    if(data()==assumed) {
+	    	data() = newval;
+	    	fail = 0;
+	    }
+	  }
+	  while(fail);
+  }
+};
+
+template<class T>
+T CASLoopNonAtomic(int loop) {
+  struct ZeroFunctor<T,exec_space> f_zero;
+  typename ZeroFunctor<T,exec_space>::type data("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  struct CASNonAtomicFunctor<T,exec_space> f_cas;
+  f_cas.data = data;
+  Kokkos::parallel_for(loop,f_cas);
+  exec_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+
+  return val;
+}
+
+template<class T>
+T CASLoopSerial(int loop) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  for(int i=0;i<loop;i++) {
+	  T assumed;
+	  T newval;
+	  T old;
+	  do {
+	    assumed = *data;
+	    newval = assumed + (T)1;
+	    old = *data;
+	    *data = newval;
+	  }
+	  while(!(assumed==old));
+  }
+
+  T val = *data;
+  delete [] data;
+  return val;
+}
+
+template<class T,class DEVICE_TYPE>
+struct ExchFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data, data2;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+	T old = Kokkos::atomic_exchange(&data(),(T)i);
+    Kokkos::atomic_fetch_add(&data2(),old);
+  }
+};
+
+template<class T>
+T ExchLoop(int loop) {
+  struct ZeroFunctor<T,exec_space> f_zero;
+  typename ZeroFunctor<T,exec_space>::type data("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  typename ZeroFunctor<T,exec_space>::type data2("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data2("HData");
+  f_zero.data = data2;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  struct ExchFunctor<T,exec_space> f_exch;
+  f_exch.data = data;
+  f_exch.data2 = data2;
+  Kokkos::parallel_for(loop,f_exch);
+  exec_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy(h_data2,data2);
+  T val = h_data() + h_data2();
+
+  return val;
+}
+
+template<class T,class DEVICE_TYPE>
+struct ExchNonAtomicFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data, data2;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+		T old = data();
+		data()=(T) i;
+		data2()+=old;
+  }
+};
+
+
+template<class T>
+T ExchLoopNonAtomic(int loop) {
+  struct ZeroFunctor<T,exec_space> f_zero;
+  typename ZeroFunctor<T,exec_space>::type data("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  typename ZeroFunctor<T,exec_space>::type data2("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data2("HData");
+  f_zero.data = data2;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  struct ExchNonAtomicFunctor<T,exec_space> f_exch;
+  f_exch.data = data;
+  f_exch.data2 = data2;
+  Kokkos::parallel_for(loop,f_exch);
+  exec_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy(h_data2,data2);
+  T val = h_data() + h_data2();
+
+  return val;
+}
+
+template<class T>
+T ExchLoopSerial(int loop) {
+  T* data = new T[1];
+  T* data2 = new T[1];
+  data[0] = 0;
+  data2[0] = 0;
+  for(int i=0;i<loop;i++) {
+	T old = *data;
+	*data=(T) i;
+	*data2+=old;
+  }
+
+  T val = *data2 + *data;
+  delete [] data;
+  delete [] data2;
+  return val;
+}
+
+template<class T>
+T LoopVariant(int loop, int test) {
+  switch (test) {
+    case 1: return AddLoop<T>(loop);
+    case 2: return CASLoop<T>(loop);
+    case 3: return ExchLoop<T>(loop);
+  }
+  return 0;
+}
+
+template<class T>
+T LoopVariantSerial(int loop, int test) {
+  switch (test) {
+    case 1: return AddLoopSerial<T>(loop);
+    case 2: return CASLoopSerial<T>(loop);
+    case 3: return ExchLoopSerial<T>(loop);
+  }
+  return 0;
+}
+
+template<class T>
+T LoopVariantNonAtomic(int loop, int test) {
+  switch (test) {
+    case 1: return AddLoopNonAtomic<T>(loop);
+    case 2: return CASLoopNonAtomic<T>(loop);
+    case 3: return ExchLoopNonAtomic<T>(loop);
+  }
+  return 0;
+}
+
+template<class T>
+void Loop(int loop, int test, const char* type_name) {
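+  // Untimed warm-up call (presumably to keep one-time setup costs, e.g. the first kernel
+  // launch and allocation, out of the timed runs below).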
+  LoopVariant<T>(loop,test);
+
+  Kokkos::Impl::Timer timer;
+  T res = LoopVariant<T>(loop,test);
+  double time = timer.seconds();
+
+  timer.reset();
+  T resNonAtomic = LoopVariantNonAtomic<T>(loop,test);
+  double timeNonAtomic = timer.seconds();
+
+  timer.reset();
+  T resSerial = LoopVariantSerial<T>(loop,test);
+  double timeSerial = timer.seconds();
+
+  time         *=1e6/loop;
+  timeNonAtomic*=1e6/loop;
+  timeSerial   *=1e6/loop;
+  //textcolor_standard();
+  bool passed = true;
+  if(resSerial!=res) passed = false;
+  //if(!passed) textcolor(RESET,BLACK,YELLOW);
+  printf("%s Test %i %s  --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",
+         type_name,test,passed?"PASSED":"FAILED",loop,
+         1.0*resSerial,1.0*res,1.0*resNonAtomic,
+         timeSerial,time,timeNonAtomic,(int)sizeof(T));
+  //if(!passed) textcolor_standard();
+  printf("\n");
+}
+
+
+template<class T>
+void Test(int loop, int test, const char* type_name) {
+  if(test==-1) {
+    Loop<T>(loop,1,type_name);
+    Loop<T>(loop,2,type_name);
+    Loop<T>(loop,3,type_name);
+
+  }
+  else
+    Loop<T>(loop,test,type_name);
+}
+
+int main(int argc, char* argv[])
+{
+  int type = -1;
+  int loop = 100000;
+  int test = -1;
+
+  for(int i=0;i<argc;i++)
+  {
+     if((strcmp(argv[i],"--test")==0)) {test=atoi(argv[++i]); continue;}
+     if((strcmp(argv[i],"--type")==0)) {type=atoi(argv[++i]); continue;}
+     if((strcmp(argv[i],"-l")==0)||(strcmp(argv[i],"--loop")==0)) {loop=atoi(argv[++i]); continue;}
+  }
+
+
+  Kokkos::initialize(argc,argv);
+
+
+  printf("Using %s\n",Kokkos::atomic_query_version());
+  bool all_tests = false;
+  if(type==-1) all_tests = true;
+  while(type<100) {
+    if(type==1) {
+     Test<int>(loop,test,"int                    ");
+    }
+    if(type==2) {
+     Test<long int>(loop,test,"long int               ");
+    }
+    if(type==3) {
+     Test<long long int>(loop,test,"long long int          ");
+    }
+    if(type==4) {
+     Test<unsigned int>(loop,test,"unsigned int           ");
+    }
+    if(type==5) {
+     Test<unsigned long int>(loop,test,"unsigned long int      ");
+    }
+    if(type==6) {
+     Test<unsigned long long int>(loop,test,"unsigned long long int ");
+    }
+    if(type==10) {
+     //Test<float>(loop,test,"float                  ");
+    }
+    if(type==11) {
+     Test<double>(loop,test,"double                 ");
+    }
+    if(!all_tests) type=100;
+    else type++;
+  }
+
+  Kokkos::finalize();
+
+}
+
diff --git a/packages/kokkos/core/perf_test/test_mempool.cpp b/packages/kokkos/core/perf_test/test_mempool.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9fd58eda9145fd327b4fbf21e951b9bc9a9ad868
--- /dev/null
+++ b/packages/kokkos/core/perf_test/test_mempool.cpp
@@ -0,0 +1,357 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <limits>
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Timer.hpp>
+
+using ExecSpace   = Kokkos::DefaultExecutionSpace ;
+using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space ;
+
+using MemoryPool = Kokkos::MemoryPool< ExecSpace > ;
+
+struct TestFunctor {
+
+  typedef Kokkos::View< uintptr_t * , ExecSpace >  ptrs_type ;
+
+  enum : unsigned { chunk = 32 };
+
+  MemoryPool  pool ;
+  ptrs_type   ptrs ;
+  unsigned    chunk_span ;
+  unsigned    fill_stride ;
+  unsigned    range_iter ;
+  unsigned    repeat_inner ;
+
+  TestFunctor( size_t    total_alloc_size
+             , unsigned  min_superblock_size
+             , unsigned  number_alloc
+             , unsigned  arg_stride_alloc
+             , unsigned  arg_chunk_span
+             , unsigned  arg_repeat )
+    : pool()
+    , ptrs()
+    , chunk_span(0)
+    , fill_stride(0)
+    , repeat_inner(0)
+    {
+      MemorySpace m ;
+
+      const unsigned min_block_size = chunk ;
+      const unsigned max_block_size = chunk * arg_chunk_span ;
+      pool = MemoryPool( m , total_alloc_size
+                           , min_block_size
+                           , max_block_size
+                           , min_superblock_size );
+
+      ptrs = ptrs_type( Kokkos::view_alloc( m , "ptrs") , number_alloc );
+      fill_stride = arg_stride_alloc ;
+      chunk_span = arg_chunk_span ;
+      range_iter   = fill_stride * number_alloc ;
+      repeat_inner       = arg_repeat ;
+    }
+
+  //----------------------------------------
+
+  typedef long value_type ;
+
+  //----------------------------------------
+
+  struct TagFill {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( TagFill , int i , value_type & update ) const noexcept
+    {
+      if ( 0 == i % fill_stride ) {
+
+        const int j = i / fill_stride ;
+
+        const unsigned size_alloc = chunk * ( 1 + ( j % chunk_span ) );
+
+        ptrs(j) = (uintptr_t) pool.allocate(size_alloc);
+
+        if ( ptrs(j) ) ++update ;
+      }
+    }
+
+  bool test_fill()
+    {
+      typedef Kokkos::RangePolicy< ExecSpace , TagFill > policy ;
+
+      long result = 0 ;
+
+      Kokkos::parallel_reduce( policy(0,range_iter), *this , result );
+
+      if ( result == long(ptrs.extent(0)) ) return true;
+      pool.print_state( std::cerr );
+      return false;
+    }
+
+  //----------------------------------------
+
+  struct TagDel {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( TagDel , int i ) const noexcept
+    {
+      if ( 0 == i % fill_stride ) {
+
+        const int j = i / fill_stride ;
+
+        const unsigned size_alloc = chunk * ( 1 + ( j % chunk_span ) );
+
+        pool.deallocate( (void*) ptrs(j) , size_alloc );
+      }
+    }
+
+  void test_del()
+    {
+      typedef Kokkos::RangePolicy< ExecSpace , TagDel > policy ;
+
+      Kokkos::parallel_for( policy(0,range_iter), *this );
+    }
+
+  //----------------------------------------
+
+  struct TagAllocDealloc {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( TagAllocDealloc , int i , long & update ) const noexcept
+    {
+      if ( 0 == i % fill_stride ) {
+
+        const int j = i / fill_stride ;
+
+        if ( 0 == j % 3 ) {
+
+          for ( unsigned k = 0 ; k < repeat_inner ; ++k ) {
+
+            const unsigned size_alloc = chunk * ( 1 + ( j % chunk_span ) );
+
+            pool.deallocate( (void*) ptrs(j) , size_alloc );
+
+            ptrs(j) = (uintptr_t) pool.allocate(size_alloc);
+
+            if ( 0 == ptrs(j) ) update++ ;
+          }
+        }
+      }
+    }
+
+  bool test_alloc_dealloc()
+    {
+      typedef Kokkos::RangePolicy< ExecSpace , TagAllocDealloc > policy ;
+
+      long error_count = 0 ;
+
+      Kokkos::parallel_reduce( policy(0,range_iter), *this , error_count );
+
+      return 0 == error_count ;
+    }
+};
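+// Note: TagFill, TagDel, and TagAllocDealloc are Kokkos work tags.  Passing one of them as
+// the second template argument of RangePolicy (as test_fill/test_del/test_alloc_dealloc do
+// above) dispatches to the matching tagged operator(), so one functor covers all three phases.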
+
+
+
+int main( int argc , char* argv[] )
+{
+  static const char help_flag[] = "--help" ;
+  static const char alloc_size_flag[]   = "--alloc_size=" ;
+  static const char super_size_flag[]   = "--super_size=" ;
+  static const char chunk_span_flag[]   = "--chunk_span=" ;
+  static const char fill_stride_flag[]  = "--fill_stride=" ;
+  static const char fill_level_flag[]   = "--fill_level=" ;
+  static const char repeat_outer_flag[] = "--repeat_outer=" ;
+  static const char repeat_inner_flag[] = "--repeat_inner=" ;
+
+  long total_alloc_size    = 1000000 ;
+  int  min_superblock_size =   10000 ;
+  int  chunk_span          =       5 ;
+  int  fill_stride         =       1 ;
+  int  fill_level          =      70 ;
+  int  repeat_outer        =       1 ;
+  int  repeat_inner        =       1 ;
+
+  int  ask_help = 0 ;
+
+  for(int i=1;i<argc;i++)
+  {
+     const char * const a = argv[i];
+
+     if ( ! strncmp(a,help_flag,strlen(help_flag) ) ) ask_help = 1 ;
+
+     if ( ! strncmp(a,alloc_size_flag,strlen(alloc_size_flag) ) )
+       total_alloc_size = atol( a + strlen(alloc_size_flag) );
+
+     if ( ! strncmp(a,super_size_flag,strlen(super_size_flag) ) )
+       min_superblock_size = atoi( a + strlen(super_size_flag) );
+
+     if ( ! strncmp(a,fill_stride_flag,strlen(fill_stride_flag) ) )
+       fill_stride = atoi( a + strlen(fill_stride_flag) );
+
+     if ( ! strncmp(a,fill_level_flag,strlen(fill_level_flag) ) )
+       fill_level = atoi( a + strlen(fill_level_flag) );
+
+     if ( ! strncmp(a,chunk_span_flag,strlen(chunk_span_flag) ) )
+       chunk_span = atoi( a + strlen(chunk_span_flag) );
+
+     if ( ! strncmp(a,repeat_outer_flag,strlen(repeat_outer_flag) ) )
+       repeat_outer = atoi( a + strlen(repeat_outer_flag) );
+
+     if ( ! strncmp(a,repeat_inner_flag,strlen(repeat_inner_flag) ) )
+       repeat_inner = atoi( a + strlen(repeat_inner_flag) );
+  }
+
+  int chunk_span_bytes = 0;
+  for (int i = 0; i < chunk_span; ++i) {
+    auto chunk_bytes = TestFunctor::chunk * ( 1 + i );
+    if (chunk_bytes < 64) chunk_bytes = 64;
+    auto block_bytes_lg2 = Kokkos::Impl::integral_power_of_two_that_contains( chunk_bytes );
+    auto block_bytes = (1 << block_bytes_lg2);
+    chunk_span_bytes += block_bytes;
+  }
+  auto actual_superblock_bytes_lg2 = Kokkos::Impl::integral_power_of_two_that_contains( min_superblock_size );
+  auto actual_superblock_bytes = (1 << actual_superblock_bytes_lg2);
+  auto superblock_mask = actual_superblock_bytes - 1;
+  auto nsuperblocks = (total_alloc_size + superblock_mask) >> actual_superblock_bytes_lg2;
+  auto actual_total_bytes = nsuperblocks * actual_superblock_bytes;
+  auto bytes_wanted = (actual_total_bytes * fill_level) / 100;
+  auto chunk_spans = bytes_wanted / chunk_span_bytes;
+  auto number_alloc = int( chunk_spans * chunk_span );
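+  // In effect: chunk_span_bytes is the footprint of one full pass through the chunk size
+  // classes (each request rounded up to a power-of-two block, 64-byte minimum), and
+  // number_alloc is roughly how many allocations fit in fill_level percent of the
+  // superblock-rounded pool.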
+
+  if ( ask_help ) {
+    std::cout << "command line options:"
+              << " " << help_flag
+              << " " << alloc_size_flag << "##"
+              << " " << super_size_flag << "##"
+              << " " << fill_stride_flag << "##"
+              << " " << fill_level_flag << "##"
+              << " " << chunk_span_flag << "##"
+              << " " << repeat_outer_flag << "##"
+              << " " << repeat_inner_flag << "##"
+              << std::endl ;
+    return 0;
+  }
+
+  Kokkos::initialize(argc,argv);
+
+  double sum_fill_time = 0;
+  double sum_cycle_time = 0;
+  double sum_both_time = 0;
+  double min_fill_time = std::numeric_limits<double>::max();
+  double min_cycle_time = std::numeric_limits<double>::max();
+  double min_both_time = std::numeric_limits<double>::max();
+  // One allocation per pointer in the fill phase; one allocate/deallocate pair per
+  // repeat_inner iteration in the cycle phase.
+  for ( int i = 0 ; i < repeat_outer ; ++i ) {
+
+    TestFunctor functor( total_alloc_size
+                       , min_superblock_size
+                       , number_alloc
+                       , fill_stride
+                       , chunk_span
+                       , repeat_inner );
+
+    Kokkos::Impl::Timer timer ;
+
+    if ( ! functor.test_fill() ) {
+      Kokkos::abort("fill ");
+    }
+
+    auto t0 = timer.seconds();
+
+    if ( ! functor.test_alloc_dealloc() ) {
+      Kokkos::abort("alloc/dealloc ");
+    }
+
+    auto t1 = timer.seconds();
+    auto this_fill_time = t0;
+    auto this_cycle_time = t1 - t0;
+    auto this_both_time = t1;
+    sum_fill_time += this_fill_time;
+    sum_cycle_time += this_cycle_time;
+    sum_both_time += this_both_time;
+    min_fill_time = std::min(min_fill_time, this_fill_time);
+    min_cycle_time = std::min(min_cycle_time, this_cycle_time);
+    min_both_time = std::min(min_both_time, this_both_time);
+  }
+
+  Kokkos::finalize();
+
+  printf( "\"mempool: alloc super stride level span inner outer number\" %ld %d %d %d %d %d %d %d\n"
+        , total_alloc_size
+        , min_superblock_size
+        , fill_stride
+        , fill_level
+        , chunk_span
+        , repeat_inner
+        , repeat_outer
+        , number_alloc );
+
+  auto avg_fill_time = sum_fill_time / repeat_outer;
+  auto avg_cycle_time = sum_cycle_time / repeat_outer;
+  auto avg_both_time = sum_both_time / repeat_outer;
+
+  printf( "\"mempool: fill time (min, avg)\" %.8f %.8f\n"
+        , min_fill_time
+        , avg_fill_time );
+
+  printf( "\"mempool: cycle time (min, avg)\" %.8f %.8f\n"
+        , min_cycle_time
+        , avg_cycle_time );
+
+  printf( "\"mempool: test time (min, avg)\" %.8f %.8f\n"
+        , min_both_time
+        , avg_both_time );
+
+  printf( "\"mempool: fill ops per second (max, avg)\" %g %g\n"
+        , number_alloc / min_fill_time
+        , number_alloc / avg_fill_time );
+
+  printf( "\"mempool: cycle ops per second (max, avg)\" %g %g\n"
+        , (2 * number_alloc * repeat_inner) / min_cycle_time
+        , (2 * number_alloc * repeat_inner) / avg_cycle_time );
+}
+
diff --git a/packages/kokkos/core/perf_test/test_taskdag.cpp b/packages/kokkos/core/perf_test/test_taskdag.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..00615cd3abe7fade5b43762aad94afd7fd1f07ad
--- /dev/null
+++ b/packages/kokkos/core/perf_test/test_taskdag.cpp
@@ -0,0 +1,284 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if ! defined( KOKKOS_ENABLE_TASKDAG ) || \
+    defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
+
+int main()
+{
+  return 0 ;
+}
+
+#else
+
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <limits>
+
+#include <impl/Kokkos_Timer.hpp>
+
+using ExecSpace = Kokkos::DefaultExecutionSpace ;
+
+inline
+long eval_fib( long n )
+{
+  constexpr long mask = 0x03;
+
+  long fib[4] = { 0, 1, 0, 0 };
+
+  for ( long i = 2; i <= n; ++i ) {
+    fib[ i & mask ] = fib[ ( i - 1 ) & mask ] + fib[ ( i - 2 ) & mask ];
+  }
+
+  return fib[ n & mask ];
+}
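+// eval_fib keeps only a four-entry circular buffer (indexed via i & mask) rather than the
+// whole sequence, so the serial reference value is computed in O(n) time and O(1) space.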
+
+inline
+long fib_alloc_count( long n )
+{
+  constexpr long mask = 0x03;
+
+  long count[4] = { 1, 1, 0, 0 };
+
+  for ( long i = 2; i <= n; ++i ) {
+    count[ i & mask ] = 2 // this task plus the 'when_all' task
+                      + count[ ( i - 1 ) & mask ]
+                      + count[ ( i - 2 ) & mask ];
+  }
+
+  return count[ n & mask ];
+}
+
+template< class Space >
+struct TestFib {
+
+  using Scheduler   = Kokkos::TaskScheduler< Space > ;
+  using MemorySpace = typename Scheduler::memory_space ;
+  using MemberType  = typename Scheduler::member_type ;
+  using FutureType  = Kokkos::Future< long , Space > ;
+
+  typedef long value_type ;
+
+  Scheduler  sched ;
+  FutureType dep[2] ;
+  const value_type n ;
+
+  KOKKOS_INLINE_FUNCTION
+  TestFib( const Scheduler & arg_sched , const value_type arg_n )
+    : sched( arg_sched ), dep{} , n( arg_n ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const MemberType & , value_type & result ) noexcept
+    {
+      if ( n < 2 ) {
+        result = n ;
+      }
+      else if ( ! dep[0].is_null() && ! dep[1].is_null() ) {
+        result = dep[0].get() + dep[1].get();
+      }
+      else {
+        // Spawn new children and respawn myself to sum their results.
+        // Spawn lower value at higher priority as it has a shorter
+        // path to completion.
+
+        dep[1] = Kokkos::task_spawn
+          ( Kokkos::TaskSingle( sched, Kokkos::TaskPriority::High )
+          , TestFib( sched, n - 2 ) );
+
+        dep[0] = Kokkos::task_spawn
+          ( Kokkos::TaskSingle( sched )
+          , TestFib( sched, n - 1 ) );
+
+        Kokkos::Future< ExecSpace > fib_all = Kokkos::when_all( dep, 2 );
+
+        if ( ! dep[0].is_null() && ! dep[1].is_null() && ! fib_all.is_null() ) {
+          // High priority to retire this branch.
+          Kokkos::respawn( this, fib_all, Kokkos::TaskPriority::High );
+        }
+        else {
+          Kokkos::abort("Failed nested task spawn (allocation)");
+        }
+      }
+    }
+};
+
+
+
+int main( int argc , char* argv[] )
+{
+  static const char help[] = "--help" ;
+  static const char alloc_size[]   = "--alloc_size=" ;
+  static const char super_size[]   = "--super_size=" ;
+  static const char repeat_outer[] = "--repeat_outer=" ;
+  static const char input_value[]  = "--input=" ;
+
+  long total_alloc_size    = 1000000 ;
+  int  min_superblock_size =   10000 ;
+  int  test_repeat_outer   =       1 ;
+  int  fib_input           =       4 ;
+
+  int  ask_help = 0 ;
+
+  for(int i=1;i<argc;i++)
+  {
+     const char * const a = argv[i];
+
+     if ( ! strncmp(a,help,strlen(help) ) ) ask_help = 1 ;
+
+     if ( ! strncmp(a,alloc_size,strlen(alloc_size) ) )
+       total_alloc_size = atol( a + strlen(alloc_size) );
+
+     if ( ! strncmp(a,super_size,strlen(super_size) ) )
+       min_superblock_size = atoi( a + strlen(super_size) );
+
+     if ( ! strncmp(a,repeat_outer,strlen(repeat_outer) ) )
+       test_repeat_outer = atoi( a + strlen(repeat_outer) );
+
+     if ( ! strncmp(a,input_value,strlen(input_value) ) )
+       fib_input = atoi( a + strlen(input_value) );
+  }
+
+  const long fib_output   = eval_fib( fib_input );
+  const long number_alloc = fib_alloc_count( fib_input );
+
+  const unsigned min_block_size =  32 ;
+  const unsigned max_block_size = 128 ;
+
+  long task_count_max = 0 ;
+  long task_count_accum = 0 ;
+  long test_result = 0 ;
+
+  if ( ask_help ) {
+    std::cout << "command line options:"
+              << " " << help
+              << " " << alloc_size << "##"
+              << " " << super_size << "##"
+              << " " << input_value << "##"
+              << " " << repeat_outer << "##"
+              << std::endl ;
+    return -1;
+  }
+
+  typedef TestFib< ExecSpace >  Functor ;
+
+  Kokkos::initialize(argc,argv);
+
+  Functor::Scheduler sched( Functor::MemorySpace()
+                          , total_alloc_size
+                          , min_block_size
+                          , max_block_size
+                          , min_superblock_size
+                          );
+
+  Functor::FutureType f =
+    Kokkos::host_spawn( Kokkos::TaskSingle( sched )
+                      , Functor( sched , fib_input )
+                      );
+
+  Kokkos::wait( sched );
+
+  test_result = f.get();
+
+  task_count_max   = sched.allocated_task_count_max();
+  task_count_accum = sched.allocated_task_count_accum();
+
+  if ( number_alloc != task_count_accum ) {
+    std::cout << " number_alloc( " << number_alloc << " )"
+              << " != task_count_accum( " << task_count_accum << " )"
+              << std::endl ;
+  }
+
+  if ( fib_output != test_result ) {
+    std::cout << " answer( " << fib_output << " )"
+              << " != result( " << test_result << " )"
+              << std::endl ;
+  }
+
+  if ( fib_output != test_result || number_alloc != task_count_accum ) {
+    printf("  TEST FAILED\n");
+    return -1;
+  }
+
+  double min_time = std::numeric_limits<double>::max();
+  double time_sum = 0;
+
+  for ( int i = 0 ; i < test_repeat_outer ; ++i ) {
+    Kokkos::Impl::Timer timer ;
+
+    Functor::FutureType ftmp =
+      Kokkos::host_spawn( Kokkos::TaskSingle( sched )
+                        , Functor( sched , fib_input )
+                        );
+
+    Kokkos::wait( sched );
+    auto this_time = timer.seconds();
+    min_time = std::min(min_time, this_time);
+    time_sum += this_time;
+  }
+
+  auto avg_time = time_sum / test_repeat_outer;
+
+  Kokkos::finalize();
+
+  printf( "\"taskdag: alloc super repeat input output task-accum task-max\" %ld %d %d %d %ld %ld %ld\n"
+        , total_alloc_size
+        , min_superblock_size
+        , test_repeat_outer
+        , fib_input
+        , fib_output
+        , task_count_accum
+        , task_count_max );
+
+  printf( "\"taskdag: time (min, avg)\" %g %g\n", min_time, avg_time);
+  printf( "\"taskdag: tasks per second (max, avg)\" %g %g\n"
+        , number_alloc / min_time
+        , number_alloc / avg_time );
+
+  return 0 ;
+}
+
+#endif
+
diff --git a/packages/kokkos/core/src/CMakeLists.txt b/packages/kokkos/core/src/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1914b6ba96810773deca1acdaacd18fd15ccad1a
--- /dev/null
+++ b/packages/kokkos/core/src/CMakeLists.txt
@@ -0,0 +1,98 @@
+
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+#-----------------------------------------------------------------------------
+
+SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
+
+#-----------------------------------------------------------------------------
+
+IF(KOKKOS_LEGACY_TRIBITS)
+
+  ASSERT_DEFINED(${PROJECT_NAME}_ENABLE_CXX11)
+  ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUDA)
+
+  SET(HEADERS_PUBLIC "")
+  SET(HEADERS_PRIVATE "")
+  SET(SOURCES "")
+
+  FILE(GLOB HEADERS_PUBLIC Kokkos*.hpp)
+  LIST( APPEND HEADERS_PUBLIC ${CMAKE_BINARY_DIR}/${PACKAGE_NAME}_config.h )
+
+  #-----------------------------------------------------------------------------
+
+  FILE(GLOB HEADERS_IMPL impl/*.hpp)
+  FILE(GLOB SOURCES_IMPL impl/*.cpp)
+
+  LIST(APPEND HEADERS_PRIVATE ${HEADERS_IMPL} )
+  LIST(APPEND SOURCES         ${SOURCES_IMPL} )
+
+  INSTALL(FILES ${HEADERS_IMPL} DESTINATION ${TRILINOS_INCDIR}/impl/)
+
+  #-----------------------------------------------------------------------------
+
+  FILE(GLOB HEADERS_THREADS Threads/*.hpp)
+  FILE(GLOB SOURCES_THREADS Threads/*.cpp)
+
+  LIST(APPEND HEADERS_PRIVATE ${HEADERS_THREADS} )
+  LIST(APPEND SOURCES         ${SOURCES_THREADS} )
+
+  INSTALL(FILES ${HEADERS_THREADS} DESTINATION ${TRILINOS_INCDIR}/Threads/)
+
+  #-----------------------------------------------------------------------------
+
+  FILE(GLOB HEADERS_OPENMP OpenMP/*.hpp)
+  FILE(GLOB SOURCES_OPENMP OpenMP/*.cpp)
+
+  LIST(APPEND HEADERS_PRIVATE ${HEADERS_OPENMP} )
+  LIST(APPEND SOURCES         ${SOURCES_OPENMP} )
+
+  INSTALL(FILES ${HEADERS_OPENMP} DESTINATION ${TRILINOS_INCDIR}/OpenMP/)
+
+  #-----------------------------------------------------------------------------
+
+  FILE(GLOB HEADERS_CUDA Cuda/*.hpp)
+  FILE(GLOB SOURCES_CUDA Cuda/*.cpp)
+
+  LIST(APPEND HEADERS_PRIVATE ${HEADERS_CUDA} )
+  LIST(APPEND SOURCES         ${SOURCES_CUDA} )
+
+  INSTALL(FILES ${HEADERS_CUDA} DESTINATION ${TRILINOS_INCDIR}/Cuda/)
+
+  #-----------------------------------------------------------------------------
+  FILE(GLOB HEADERS_QTHREADS Qthreads/*.hpp)
+  FILE(GLOB SOURCES_QTHREADS Qthreads/*.cpp)
+
+  LIST(APPEND HEADERS_PRIVATE ${HEADERS_QTHREADS} )
+  LIST(APPEND SOURCES         ${SOURCES_QTHREADS} )
+
+  INSTALL(FILES ${HEADERS_QTHREADS} DESTINATION ${TRILINOS_INCDIR}/Qthreads/)
+
+  TRIBITS_ADD_LIBRARY(
+      kokkoscore
+      HEADERS ${HEADERS_PUBLIC}
+      NOINSTALLHEADERS ${HEADERS_PRIVATE}
+      SOURCES ${SOURCES}
+      DEPLIBS
+      )
+
+#-----------------------------------------------------------------------------
+#  In the new build system, the source list is determined by Makefile.kokkos
+ELSE()
+
+  INSTALL (DIRECTORY
+           "${CMAKE_CURRENT_SOURCE_DIR}/"
+           DESTINATION ${TRILINOS_INCDIR}
+           FILES_MATCHING PATTERN "*.hpp"
+  )
+
+  TRIBITS_ADD_LIBRARY(
+      kokkoscore
+      SOURCES ${KOKKOS_CORE_SRCS}
+      DEPLIBS
+      )
+
+ENDIF()
+#-----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp b/packages/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..04c5cdccbfcd2d8faee9181f9cd6d8dd0be88702
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp
@@ -0,0 +1,1263 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_HPP
+#define KOKKOS_CUDA_EXP_ITERATE_TILE_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
+
+#include <iostream>
+#include <algorithm>
+#include <cstdio>
+
+#include <utility>
+
+//#include<Cuda/Kokkos_CudaExec.hpp>
+// Including the file above leads to errors of the following type:
+// /home/ndellin/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp(84): error: incomplete type is not allowed
+// As a result, cuda_parallel_launch and the associated code are recreated here.
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <typeinfo>
+#endif
+
+namespace Kokkos { namespace Impl {
+
+// ------------------------------------------------------------------ //
+
+template< class DriverType >
+__global__
+static void cuda_parallel_launch( const DriverType driver )
+{
+  driver();
+}
+
+template< class DriverType >
+struct CudaLaunch
+{
+  inline
+  CudaLaunch( const DriverType & driver
+                    , const dim3       & grid
+                    , const dim3       & block
+            )
+  {
+    cuda_parallel_launch< DriverType ><<< grid , block >>>(driver);
+  }
+
+};
+
+// ------------------------------------------------------------------ //
+template< int N , typename RP , typename Functor , typename Tag >
+struct apply_impl;
+
+//Rank 2
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct apply_impl<2,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+// LL: inner_direction == Left, so index 0 is the innermost (fastest-varying) loop
+  if (RP::inner_direction == RP::Left) {
+    for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+      const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+      if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+
+        for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+          const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+          if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+            m_func(offset_0 , offset_1);
+          }
+        }
+      }
+    }
+  }
+// LR: inner_direction == Right, so index 1 is the innermost (fastest-varying) loop
+  else {
+    for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+      const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+      if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+
+        for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+          const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+          if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+            m_func(offset_0 , offset_1);
+          }
+        }
+      }
+    }
+  }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct apply_impl<2,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+  if (RP::inner_direction == RP::Left) {
+    // Loop over size maxnumblocks until full range covered
+    for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+      const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+      if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+
+        for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+          const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+          if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+            m_func(Tag(), offset_0 , offset_1);
+          }
+        }
+      }
+    }
+  }
+  else {
+    for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+      const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+      if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+
+        for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+          const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+          if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+            m_func(Tag(), offset_0 , offset_1);
+          }
+        }
+      }
+    }
+  }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 3
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct apply_impl<3,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+// LL: inner_direction == Left, so index 0 is the innermost (fastest-varying) loop
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
+        if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
+
+          for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+                if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+                  m_func(offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+// LR: inner_direction == Right, so index 2 is the innermost (fastest-varying) loop
+    else {
+      for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
+                  m_func(offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct apply_impl<3,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
+        if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
+
+          for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+                if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+                  m_func(Tag(), offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
+                  m_func(Tag(), offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 4
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct apply_impl<4,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
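+  // 65535 is the hardware limit on gridDim.y and gridDim.z; it is also used here as a
+  // conservative cap on the flattened tile counts packed into each grid dimension.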
+  static constexpr index_type max_blocks = 65535;
+
+  inline __device__
+  void exec_range() const
+  {
+// LL
+    if (RP::inner_direction == RP::Left) {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
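+      // Tiles in dims 0 and 1 are flattened into blockIdx.x (dim 0 fastest for the Left order),
+      // and threadIdx.x is split the same way into per-tile thread ids; numbl0*numbl1 stays
+      // within max_blocks so the flattened tile index fits in one grid dimension.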
+
+      for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
+        if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
+
+          for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
+            if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
+
+              for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                  for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                    if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                      m_func(offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+// LR
+    else {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
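+      // Same flattening for the Right order: dim 1 is the fastest-varying component of
+      // blockIdx.x and threadIdx.x.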
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
+
+                  for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
+                      m_func(offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct apply_impl<4,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+
+  inline __device__
+  void exec_range() const
+  {
+    if (RP::inner_direction == RP::Left) {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
+
+      for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
+        if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
+
+          for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
+            if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
+
+              for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                  for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                    if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                      m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
+
+                  for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
+                      m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 5
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct apply_impl<5,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+
+  inline __device__
+  void exec_range() const
+  {
+// LL
+    if (RP::inner_direction == RP::Left) {
+
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y % numbl2;
+      const index_type tile_id3 = blockIdx.y / numbl2;
+      const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
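+      // blockIdx.y / threadIdx.y are split across dims 2 and 3 the same way; dim 4 keeps
+      // the plain blockIdx.z / threadIdx.z mapping.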
+
+      for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
+        if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
+
+          for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+            if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                    if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                      for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                          m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+// LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y / numbl3;
+      const index_type tile_id3 = blockIdx.y % numbl3;
+      const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
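+      // For the Right order, dims 2 and 3 share blockIdx.y / threadIdx.y with dim 3 fastest;
+      // dim 4 keeps the plain blockIdx.z / threadIdx.z mapping.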
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
+                        if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
+                          m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct apply_impl<5,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+
+  inline __device__
+  void exec_range() const
+  {
+// LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y % numbl2;
+      const index_type tile_id3 = blockIdx.y / numbl2;
+      const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
+
+      for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
+        if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
+
+          for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+            if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                    if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                      for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+// LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y / numbl3;
+      const index_type tile_id3 = blockIdx.y % numbl3;
+      const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
+                        if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
+                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 6
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct apply_impl<6,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+
+  inline __device__
+  void exec_range() const
+  {
+// LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y % numbl2;
+      const index_type tile_id3 = blockIdx.y / numbl2;
+      const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id4 = blockIdx.z % numbl4;
+      const index_type tile_id5 = blockIdx.z / numbl4;
+      const index_type thr_id4 = threadIdx.z % m_rp.m_tile[4];
+      const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];
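+      // dims 4 and 5 share blockIdx.z / threadIdx.z, so all six dimensions map onto the
+      // three CUDA grid/block dimensions.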
+
+      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+
+          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                              m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+// LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y / numbl3;
+      const index_type tile_id3 = blockIdx.y % numbl3;
+      const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id4 = blockIdx.z / numbl5;
+      const index_type tile_id5 = blockIdx.z % numbl5;
+      const index_type thr_id4 = threadIdx.z / m_rp.m_tile[5];
+      const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+                              m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct apply_impl<6,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+
+  inline __device__
+  void exec_range() const
+  {
+// LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y % numbl2;
+      const index_type tile_id3 = blockIdx.y / numbl2;
+      const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id4 = blockIdx.z % numbl4;
+      const index_type tile_id5 = blockIdx.z / numbl4;
+      const index_type thr_id4 = threadIdx.z % m_rp.m_tile[4];
+      const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];
+
+      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+
+          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+// LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y / numbl3;
+      const index_type tile_id3 = blockIdx.y % numbl3;
+      const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id4 = blockIdx.z / numbl5;
+      const index_type tile_id5 = blockIdx.z % numbl5;
+      const index_type thr_id4 = threadIdx.z / m_rp.m_tile[5];
+      const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// ----------------------------------------------------------------------------------
+
+template < typename RP
+         , typename Functor
+         , typename Tag
+         >
+struct DeviceIterateTile
+{
+  using index_type = typename RP::index_type;
+  using array_index_type = typename RP::array_index_type;
+  using point_type = typename RP::point_type;
+
+  struct VoidDummy {};
+  typedef typename std::conditional< std::is_same<Tag, void>::value, VoidDummy, Tag>::type usable_tag;
+
+  DeviceIterateTile( const RP & rp, const Functor & func )
+    : m_rp{rp}
+    , m_func{func}
+  {}
+
+private:
+  inline __device__
+  void apply() const
+  {
+    apply_impl<RP::rank,RP,Functor,Tag>(m_rp,m_func).exec_range();
+  } //end apply
+
+public:
+
+  inline
+  __device__
+  void operator()(void) const
+  {
+    this->apply();
+  }
+
+  inline
+  void execute() const
+  {
+    const array_index_type maxblocks = 65535; // limit for gridDim.y/z; gridDim.x allows far larger grids on newer architectures
+    if ( RP::rank == 2 )
+    {
+      const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , 1);
+      const dim3 grid(
+            std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
+          , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
+          , 1
+          );
+      CudaLaunch< DeviceIterateTile >( *this , grid , block );
+    }
+    else if ( RP::rank == 3 )
+    {
+      const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , m_rp.m_tile[2] );
+      const dim3 grid(
+          std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
+        , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
+        , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaLaunch< DeviceIterateTile >( *this , grid , block );
+    }
+    else if ( RP::rank == 4 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2] , m_rp.m_tile[3] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1 ) / block.y , maxblocks )
+        , std::min( ( m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaLaunch< DeviceIterateTile >( *this , grid , block );
+    }
+    else if ( RP::rank == 5 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( ( m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaLaunch< DeviceIterateTile >( *this , grid , block );
+    }
+    else if ( RP::rank == 6 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4]*m_rp.m_tile[5] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( static_cast<index_type>( m_rp.m_tile_end[4] * m_rp.m_tile_end[5] )
+                  , static_cast<index_type>(maxblocks) )
+        );
+      CudaLaunch< DeviceIterateTile >( *this , grid , block );
+    }
+    else
+    {
+      printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
+      Kokkos::abort("Aborting");
+    }
+
+  } //end execute
+
+protected:
+  const RP         m_rp;
+  const Functor    m_func;
+};
+
+} } //end namespace Kokkos::Impl
+
+#endif
+#endif
+
diff --git a/packages/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp b/packages/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..636e05c8acf29ed5cc23e52c2217a0573c610d26
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
@@ -0,0 +1,2715 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP
+#define KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
+
+#include <iostream>
+#include <algorithm>
+#include <cstdio>
+
+#include <utility>
+
+// #include<Cuda/Kokkos_CudaExec.hpp>
+// Including the file above leads to errors of the following type:
+// /home/ndellin/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp(84): error: incomplete type is not allowed
+// Once resolved, use existing Kokkos functionality (e.g. the maximum block count) instead of hard-coded limits
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <typeinfo>
+#endif
+
+namespace Kokkos { namespace Impl {
+
+namespace Refactor {
+
+// ------------------------------------------------------------------ //
+// ParallelFor iteration pattern
+template< int N , typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile;
+
+//Rank 2
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<2,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+        if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+          for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+            if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+              m_func(offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+              m_func(offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<2,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if (RP::inner_direction == RP::Left) {
+      // Stride by the grid size so the full tile range is covered even if the grid was capped at the maximum block count
+      for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+        if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+          for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+            if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+              m_func(Tag(), offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+              m_func(Tag(), offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 3
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<3,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
+        if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+                if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+                  m_func(offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
+                  m_func(offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<3,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
+        if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+                if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+                  m_func(Tag(), offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
+                  m_func(Tag(), offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 4
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<4,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
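+    // Mapping: blockIdx.x enumerates a numbl0 x numbl1 grid of tiles covering dims 0 and 1,
+    // while threadIdx.x enumerates the tile[0] x tile[1] local indices within one such tile.
+    // Dims 2 and 3 map directly to blockIdx.y/threadIdx.y and blockIdx.z/threadIdx.z.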
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
+        if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
+
+          for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
+            if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
+
+              for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                  for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                    if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                      m_func(offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
+
+                  for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
+                      m_func(offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<4,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if (RP::inner_direction == RP::Left) {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
+        if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
+
+          for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
+            if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
+
+              for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                  for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                    if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                      m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
+
+                  for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
+                      m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 5
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<5,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
+
+      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
+        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
+
+          for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+            if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                    if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                      for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                          m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
+                        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
+                          m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<5,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
+
+      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
+        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
+
+          for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+            if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                    if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                      for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
+                        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
+                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 6
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<6,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
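+    // Mapping: each block dimension is split across two range dimensions. blockIdx.x covers
+    // dims 0-1, blockIdx.y covers dims 2-3, and blockIdx.z covers dims 4-5; threadIdx.x/y/z
+    // are split the same way to provide the tile-local indices.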
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx.z % numbl4;
+      const index_type tile_id5 = (index_type)blockIdx.z / numbl4;
+      const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4];
+      const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
+
+      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+
+          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                              m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx.z / numbl5;
+      const index_type tile_id5 = (index_type)blockIdx.z % numbl5;
+      const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5];
+      const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+                              m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<6,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx.z % numbl4;
+      const index_type tile_id5 = (index_type)blockIdx.z / numbl4;
+      const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4];
+      const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
+
+      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+
+          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx.z / numbl5;
+      const index_type tile_id5 = (index_type)blockIdx.z % numbl5;
+      const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5];
+      const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+} // Refactor
+
+// ----------------------------------------------------------------------------------
+
+namespace Reduce {
+
+template < typename T >
+using is_void = std::is_same< T, void >;
+
+template < typename T >
+struct is_array_type : std::false_type
+{
+  using value_type = T;
+};
+
+template < typename T >
+struct is_array_type< T* > : std::true_type
+{
+  using value_type = T;
+};
+
+template < typename T >
+struct is_array_type< T[] > : std::true_type
+{
+  using value_type = T;
+};
+
+// ------------------------------------------------------------------ //
+template< int N , typename RP , typename Functor , typename Tag , typename ValueType , typename Enable = void >
+struct DeviceIterateTile;
+
+// ParallelReduce iteration pattern
+// Scalar reductions
+
+// num_blocks = min( num_tiles, max_num_blocks ); //i.e. determined by number of tiles and reduction algorithm constraints
+// extract n-dim tile offsets (i.e. tile's global starting multi-index) from the tileid = blockid using tile dimensions
+// local indices within a tile extracted from threadIdx.y using tile dims, constrained by blocksize
+// combine tile and local id info for multi-dim global ids
+
+// Pattern:
+// Each block+thread pair is responsible for one tile+local_id combination (plus additional tiles when striding by the number of blocks)
+// 1. create offset arrays
+// 2. loop over the number of tiles, striding by gridDim (equal to the number of tiles, or the max number of blocks)
+// 3. set temporaries for tile_idx and thrd_idx, which will be modified
+// 4. depending on LL vs LR:
+//      determine tile starting point offsets (multidim)
+//      determine local index offsets (multidim)
+//      concatenate tile offset + local offset for the global multi-dim index
+//    if the offset is within the range bounds AND the local offset is within the tile bounds, call the functor
+// (a small worked example of this decomposition follows below)
+
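+// Worked example (purely illustrative, with hypothetical values): for a rank-2 range with
+// lower = {0,0}, upper = {10,10} and tile = {4,4}, we get m_tile_end = {3,3}, m_num_tiles = 9
+// and m_prod_tile_dims = 16. With the Left (LL) iteration order, blockIdx.x = 5 and
+// threadIdx.y = 5 decompose as follows:
+//   dim 0: tile start = (5 % 3) * 4 = 8,  local = 5 % 4 = 1  ->  offset_0 = 9  (tile_idx -> 1, thrd_idx -> 1)
+//   dim 1: tile start = (1 % 3) * 4 = 4,  local = 1 % 4 = 1  ->  offset_1 = 5  (tile_idx -> 0, thrd_idx -> 0)
+// Both offsets are inside the bounds, so the functor is invoked at (9, 5).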
+// ValueType = T
+//Rank 2
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< !is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // local id offset within the tile
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            // Deduce this block's tile_id
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          {
+            m_func( m_offset[0], m_offset[1], m_v );
+          }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_v ); }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // local id offset within the tile
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 3
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // local id offset within the tile
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // local id offset within the tile
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 4
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // local id offset within the tile
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 5
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 6
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// ValueType = T[], T*
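+// The specializations below handle array-valued reductions: the reduction
+// result is a value_type* (pointer to the first element of the array) rather
+// than a reference to a single scalar, and it is passed to the functor as-is.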
+//Rank 2
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          {
+            m_func( m_offset[0], m_offset[1], m_v );
+          }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // note: could be computed first and added to m_offset immediately
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+      } //end for loop over num_tiles - product of tiles in each direction
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 3
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // note: could be computed first and added to m_offset immediately
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // note: could be computed first and added to m_offset immediately
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 4
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 5
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 6
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+} // Reduce
+
+// ----------------------------------------------------------------------------------
+
+} } //end namespace Kokkos::Impl
+
+#endif
+#endif
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ada3f64fe129703326d2aa00d56badd14d44406c
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
@@ -0,0 +1,419 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDAEXEC_HPP
+#define KOKKOS_CUDAEXEC_HPP
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <string>
+#include <cstdint>
+#include <Kokkos_Parallel.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <Cuda/Kokkos_Cuda_abort.hpp>
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+struct CudaTraits {
+  enum { WarpSize       = 32      /* 0x0020 */ };
+  enum { WarpIndexMask  = 0x001f  /* Mask for warp index */ };
+  enum { WarpIndexShift = 5       /* WarpSize == 1 << WarpIndexShift */ };
+
+  enum { SharedMemoryBanks    = 32      /* Compute device 2.0 */ };
+  enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
+  enum { SharedMemoryUsage    = 0x04000 /* 16k shared / 48k L1 Cache */ };
+
+  enum { UpperBoundGridCount    = 65535 /* Hard upper bound */ };
+  enum { ConstantMemoryCapacity = 0x010000 /* 64k bytes */ };
+  enum { ConstantMemoryUsage    = 0x008000 /* 32k bytes */ };
+  enum { ConstantMemoryCache    = 0x002000 /*  8k bytes */ };
+
+  typedef unsigned long
+    ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ];
+
+  enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
+
+  KOKKOS_INLINE_FUNCTION static
+  CudaSpace::size_type warp_count( CudaSpace::size_type i )
+    { return ( i + WarpIndexMask ) >> WarpIndexShift ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  CudaSpace::size_type warp_align( CudaSpace::size_type i )
+    {
+      enum { Mask = ~CudaSpace::size_type( WarpIndexMask ) };
+      return ( i + WarpIndexMask ) & Mask ;
+    }
+};
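+// Illustrative use of the helpers above (WarpSize == 32):
+//   warp_count(33) == 2    // 33 threads occupy two warps
+//   warp_align(33) == 64   // 33 rounded up to the next multiple of WarpSize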
+
+//----------------------------------------------------------------------------
+
+CudaSpace::size_type cuda_internal_multiprocessor_count();
+CudaSpace::size_type cuda_internal_maximum_warp_count();
+CudaSpace::size_type cuda_internal_maximum_grid_count();
+CudaSpace::size_type cuda_internal_maximum_shared_words();
+
+CudaSpace::size_type cuda_internal_maximum_concurrent_block_count();
+
+CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size );
+CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size );
+CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size );
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( __CUDACC__ )
+
+/** \brief  Access to constant memory on the device */
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+
+__device__ __constant__
+extern unsigned long kokkos_impl_cuda_constant_memory_buffer[] ;
+
+#else
+
+__device__ __constant__
+unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
+
+#endif
+
+namespace Kokkos {
+namespace Impl {
+  void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
+}
+}
+
+template< typename T >
+inline
+__device__
+T * kokkos_impl_cuda_shared_memory()
+{ extern __shared__ Kokkos::CudaSpace::size_type sh[]; return (T*) sh ; }
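+// Typical use inside a kernel (illustrative): retrieve the dynamic shared
+// memory allocated at launch as a typed pointer, e.g.
+//   double * scratch = kokkos_impl_cuda_shared_memory<double>();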
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+// See section B.17 of Cuda C Programming Guide Version 3.2
+// for discussion of
+//   __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
+// function qualifier which could be used to improve performance.
+//----------------------------------------------------------------------------
+// Maximize L1 cache and minimize shared memory:
+//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
+// For 2.0 capability: 48 KB L1 and 16 KB shared
+//----------------------------------------------------------------------------
+
+template< class DriverType>
+__global__
+static void cuda_parallel_launch_constant_memory()
+{
+  const DriverType & driver =
+    *((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
+
+  driver();
+}
+
+template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
+__global__
+__launch_bounds__(maxTperB, minBperSM)
+static void cuda_parallel_launch_constant_memory()
+{
+  const DriverType & driver =
+    *((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
+
+  driver();
+}
+
+template< class DriverType>
+__global__
+static void cuda_parallel_launch_local_memory( const DriverType driver )
+{
+  driver();
+}
+
+template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
+__global__
+__launch_bounds__(maxTperB, minBperSM)
+static void cuda_parallel_launch_local_memory( const DriverType driver )
+{
+  driver();
+}
+
+template < class DriverType
+         , class LaunchBounds = Kokkos::LaunchBounds<>
+         , bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
+struct CudaParallelLaunch ;
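+// Dispatch on functor size: when sizeof(DriverType) exceeds
+// ConstantMemoryUseThreshold (Large == true) the closure is copied into the
+// device constant-memory buffer before launch; otherwise (Large == false) it
+// is passed by value as a kernel argument (the "local memory" launch path).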
+
+template < class DriverType
+         , unsigned int MaxThreadsPerBlock
+         , unsigned int MinBlocksPerSM >
+struct CudaParallelLaunch< DriverType
+                         , Kokkos::LaunchBounds< MaxThreadsPerBlock 
+                                               , MinBlocksPerSM >
+                         , true >
+{
+  inline
+  CudaParallelLaunch( const DriverType & driver
+                    , const dim3       & grid
+                    , const dim3       & block
+                    , const int          shmem
+                    , const cudaStream_t stream = 0 )
+  {
+    if ( grid.x && ( block.x * block.y * block.z ) ) {
+
+      if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
+           sizeof( DriverType ) ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
+      }
+
+      // Fence before changing settings and copying closure
+      Kokkos::Cuda::fence();
+
+      if ( CudaTraits::SharedMemoryCapacity < shmem ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
+      }
+      #ifndef KOKKOS_ARCH_KEPLER
+      // On Kepler the L1 has no benefit since it doesn't cache reads
+      else {
+        CUDA_SAFE_CALL(
+          cudaFuncSetCacheConfig
+            ( cuda_parallel_launch_constant_memory
+                < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
+            , ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
+            ) );
+      }
+      #endif
+
+      // Copy functor to constant memory on the device
+      cudaMemcpyToSymbol(
+        kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );
+
+      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
+
+      // Invoke the driver function on the device
+      cuda_parallel_launch_constant_memory
+        < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
+          <<< grid , block , shmem , stream >>>();
+
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+      CUDA_SAFE_CALL( cudaGetLastError() );
+      Kokkos::Cuda::fence();
+#endif
+    }
+  }
+};
+
+template < class DriverType >
+struct CudaParallelLaunch< DriverType
+                         , Kokkos::LaunchBounds<>
+                         , true >
+{
+  inline
+  CudaParallelLaunch( const DriverType & driver
+                    , const dim3       & grid
+                    , const dim3       & block
+                    , const int          shmem
+                    , const cudaStream_t stream = 0 )
+  {
+    if ( grid.x && ( block.x * block.y * block.z ) ) {
+
+      if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
+           sizeof( DriverType ) ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
+      }
+
+      // Fence before changing settings and copying closure
+      Kokkos::Cuda::fence();
+
+      if ( CudaTraits::SharedMemoryCapacity < shmem ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
+      }
+      #ifndef KOKKOS_ARCH_KEPLER
+      // On Kepler the L1 has no benefit since it doesn't cache reads
+      else {
+        CUDA_SAFE_CALL(
+          cudaFuncSetCacheConfig
+            ( cuda_parallel_launch_constant_memory< DriverType >
+            , ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
+            ) );
+      }
+      #endif
+
+      // Copy functor to constant memory on the device
+      CUDA_SAFE_CALL( cudaMemcpyToSymbol(
+        kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) ) );
+
+      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
+
+      // Invoke the driver function on the device
+      cuda_parallel_launch_constant_memory< DriverType >
+          <<< grid , block , shmem , stream >>>();
+
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+      CUDA_SAFE_CALL( cudaGetLastError() );
+      Kokkos::Cuda::fence();
+#endif
+    }
+  }
+};
+
+template < class DriverType
+         , unsigned int MaxThreadsPerBlock
+         , unsigned int MinBlocksPerSM >
+struct CudaParallelLaunch< DriverType
+                         , Kokkos::LaunchBounds< MaxThreadsPerBlock 
+                                               , MinBlocksPerSM >
+                         , false >
+{
+  inline
+  CudaParallelLaunch( const DriverType & driver
+                    , const dim3       & grid
+                    , const dim3       & block
+                    , const int          shmem
+                    , const cudaStream_t stream = 0 )
+  {
+    if ( grid.x && ( block.x * block.y * block.z ) ) {
+
+      if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
+           sizeof( DriverType ) ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
+      }
+
+      if ( CudaTraits::SharedMemoryCapacity < shmem ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
+      }
+      #ifndef KOKKOS_ARCH_KEPLER
+      // On Kepler the L1 has no benefit since it doesn't cache reads
+      else {
+        CUDA_SAFE_CALL(
+          cudaFuncSetCacheConfig
+            ( cuda_parallel_launch_local_memory
+                < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
+            , ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
+            ) );
+      }
+      #endif
+
+      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
+
+      // Invoke the driver function on the device
+      cuda_parallel_launch_local_memory
+        < DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
+          <<< grid , block , shmem , stream >>>( driver );
+
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+      CUDA_SAFE_CALL( cudaGetLastError() );
+      Kokkos::Cuda::fence();
+#endif
+    }
+  }
+};
+
+template < class DriverType >
+struct CudaParallelLaunch< DriverType
+                         , Kokkos::LaunchBounds<>
+                         , false >
+{
+  inline
+  CudaParallelLaunch( const DriverType & driver
+                    , const dim3       & grid
+                    , const dim3       & block
+                    , const int          shmem
+                    , const cudaStream_t stream = 0 )
+  {
+    if ( grid.x && ( block.x * block.y * block.z ) ) {
+
+      if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
+           sizeof( DriverType ) ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
+      }
+
+      if ( CudaTraits::SharedMemoryCapacity < shmem ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
+      }
+      #ifndef KOKKOS_ARCH_KEPLER
+      // On Kepler the L1 has no benefit since it doesn't cache reads
+      else {
+        CUDA_SAFE_CALL(
+          cudaFuncSetCacheConfig
+            ( cuda_parallel_launch_local_memory< DriverType >
+            , ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
+            ) );
+      }
+      #endif
+
+      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
+
+      // Invoke the driver function on the device
+      cuda_parallel_launch_local_memory< DriverType >
+          <<< grid , block , shmem , stream >>>( driver );
+
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+      CUDA_SAFE_CALL( cudaGetLastError() );
+      Kokkos::Cuda::fence();
+#endif
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* defined( __CUDACC__ ) */
+#endif /* defined( KOKKOS_ENABLE_CUDA ) */
+#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
+
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bf1033c0914b2aaf3b8548882ca354c6a30edd8b
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -0,0 +1,832 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <algorithm>
+#include <atomic>
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Cuda.hpp>
+#include <Kokkos_CudaSpace.hpp>
+
+#include <Cuda/Kokkos_Cuda_Internal.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#endif
+
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+namespace {
+
+  static std::atomic<int> num_uvm_allocations(0) ;
+
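+  // Stream dedicated to DeepCopyAsyncCuda, created lazily on first use.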
+  cudaStream_t get_deep_copy_stream() {
+    static cudaStream_t s = 0;
+    if ( s == 0 ) {
+      cudaStreamCreate( &s );
+    }
+    return s;
+  }
+}
+
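+// Synchronous copies use cudaMemcpy; the execution-space-instance overloads
+// enqueue cudaMemcpyAsync on that instance's stream.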
+DeepCopy<CudaSpace,CudaSpace,Cuda>::DeepCopy( void * dst , const void * src , size_t n )
+{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); }
+
+DeepCopy<HostSpace,CudaSpace,Cuda>::DeepCopy( void * dst , const void * src , size_t n )
+{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); }
+
+DeepCopy<CudaSpace,HostSpace,Cuda>::DeepCopy( void * dst , const void * src , size_t n )
+{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); }
+
+DeepCopy<CudaSpace,CudaSpace,Cuda>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
+{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) ); }
+
+DeepCopy<HostSpace,CudaSpace,Cuda>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
+{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) ); }
+
+DeepCopy<CudaSpace,HostSpace,Cuda>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
+{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) ); }
+
+void DeepCopyAsyncCuda( void * dst , const void * src , size_t n) {
+  cudaStream_t s = get_deep_copy_stream();
+  CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , s ) );
+  cudaStreamSynchronize(s);
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+
+namespace Kokkos {
+
+void CudaSpace::access_error()
+{
+  const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
+  Kokkos::Impl::throw_runtime_exception( msg );
+}
+
+void CudaSpace::access_error( const void * const )
+{
+  const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
+  Kokkos::Impl::throw_runtime_exception( msg );
+}
+
+
+/*--------------------------------------------------------------------------*/
+
+bool CudaUVMSpace::available()
+{
+#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && !defined(__APPLE__)
+  enum { UVM_available = true };
+#else
+  enum { UVM_available = false };
+#endif
+  return UVM_available;
+}
+
+/*--------------------------------------------------------------------------*/
+
+int CudaUVMSpace::number_of_allocations()
+{
+  return Kokkos::Impl::num_uvm_allocations.load();
+}
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+CudaSpace::CudaSpace()
+  : m_device( Kokkos::Cuda().cuda_device() )
+{
+}
+
+CudaUVMSpace::CudaUVMSpace()
+  : m_device( Kokkos::Cuda().cuda_device() )
+{
+}
+
+CudaHostPinnedSpace::CudaHostPinnedSpace()
+{
+}
+
+void * CudaSpace::allocate( const size_t arg_alloc_size ) const
+{
+  void * ptr = NULL;
+
+  CUDA_SAFE_CALL( cudaMalloc( &ptr, arg_alloc_size ) );
+
+  return ptr ;
+}
+
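+// UVM allocations are fenced on both sides and counted against a fixed cap.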
+void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
+{
+  void * ptr = NULL;
+
+  enum { max_uvm_allocations = 65536 };
+
+  Cuda::fence();
+  if ( arg_alloc_size > 0 )
+  {
+    Kokkos::Impl::num_uvm_allocations++;
+
+    if ( Kokkos::Impl::num_uvm_allocations.load() > max_uvm_allocations ) {
+      Kokkos::Impl::throw_runtime_exception( "CudaUVM error: The maximum limit of UVM allocations exceeded (currently 65536)." ) ;
+    }
+
+    CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) );
+  }
+  Cuda::fence();
+
+  return ptr ;
+}
+
+void * CudaHostPinnedSpace::allocate( const size_t arg_alloc_size ) const
+{
+  void * ptr = NULL;
+
+  CUDA_SAFE_CALL( cudaHostAlloc( &ptr, arg_alloc_size , cudaHostAllocDefault ) );
+
+  return ptr ;
+}
+
+void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
+{
+  try {
+    CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
+  } catch(...) {}
+}
+
+void CudaUVMSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
+{
+  Cuda::fence();
+  try {
+    if ( arg_alloc_ptr != nullptr ) {
+      Kokkos::Impl::num_uvm_allocations--;
+      CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
+    }
+  } catch(...) {}
+  Cuda::fence();
+}
+
+void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
+{
+  try {
+    CUDA_SAFE_CALL( cudaFreeHost( arg_alloc_ptr ) );
+  } catch(...) {}
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::CudaSpace , void >::s_root_record ;
+
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::s_root_record ;
+
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::s_root_record ;
+
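+// Create a 1D linear texture object aliasing a device allocation; the
+// allocation must span fewer than TEXTURE_BOUND_1D (2^27) elements of the
+// aliased type.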
+::cudaTextureObject_t
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+attach_texture_object( const unsigned sizeof_alias
+                     , void *   const alloc_ptr
+                     , size_t   const alloc_size )
+{
+  enum { TEXTURE_BOUND_1D = 1u << 27 };
+
+  if ( ( alloc_ptr == 0 ) || ( sizeof_alias * TEXTURE_BOUND_1D <= alloc_size ) ) {
+    std::ostringstream msg ;
+    msg << "Kokkos::CudaSpace ERROR: Cannot attach texture object to"
+        << " alloc_ptr(" << alloc_ptr << ")"
+        << " alloc_size(" << alloc_size << ")"
+        << " max_size(" << ( sizeof_alias * TEXTURE_BOUND_1D ) << ")" ;
+    std::cerr << msg.str() << std::endl ;
+    std::cerr.flush();
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  ::cudaTextureObject_t tex_obj ;
+
+  struct cudaResourceDesc resDesc ;
+  struct cudaTextureDesc  texDesc ;
+
+  memset( & resDesc , 0 , sizeof(resDesc) );
+  memset( & texDesc , 0 , sizeof(texDesc) );
+
+  resDesc.resType                = cudaResourceTypeLinear ;
+  resDesc.res.linear.desc        = ( sizeof_alias ==  4 ?  cudaCreateChannelDesc< int >() :
+                                   ( sizeof_alias ==  8 ?  cudaCreateChannelDesc< ::int2 >() :
+                                  /* sizeof_alias == 16 */ cudaCreateChannelDesc< ::int4 >() ) );
+  resDesc.res.linear.sizeInBytes = alloc_size ;
+  resDesc.res.linear.devPtr      = alloc_ptr ;
+
+  CUDA_SAFE_CALL( cudaCreateTextureObject( & tex_obj , & resDesc, & texDesc, NULL ) );
+
+  return tex_obj ;
+}
+
+std::string
+SharedAllocationRecord< Kokkos::CudaSpace , void >::get_label() const
+{
+  SharedAllocationHeader header ;
+
+  Kokkos::Impl::DeepCopy< Kokkos::HostSpace , Kokkos::CudaSpace >( & header , RecordBase::head() , sizeof(SharedAllocationHeader) );
+
+  return std::string( header.m_label );
+}
+
+std::string
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_label() const
+{
+  return std::string( RecordBase::head()->m_label );
+}
+
+std::string
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_label() const
+{
+  return std::string( RecordBase::head()->m_label );
+}
+
+SharedAllocationRecord< Kokkos::CudaSpace , void > *
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+allocate( const Kokkos::CudaSpace &  arg_space
+        , const std::string       &  arg_label
+        , const size_t               arg_alloc_size
+        )
+{
+  return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+}
+
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void > *
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+allocate( const Kokkos::CudaUVMSpace &  arg_space
+        , const std::string          &  arg_label
+        , const size_t                  arg_alloc_size
+        )
+{
+  return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+}
+
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > *
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+allocate( const Kokkos::CudaHostPinnedSpace &  arg_space
+        , const std::string                 &  arg_label
+        , const size_t                         arg_alloc_size
+        )
+{
+  return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+}
+
+void
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  delete static_cast<SharedAllocationRecord*>(arg_rec);
+}
+
+void
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  delete static_cast<SharedAllocationRecord*>(arg_rec);
+}
+
+void
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  delete static_cast<SharedAllocationRecord*>(arg_rec);
+}
+
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+~SharedAllocationRecord()
+{
+  #if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+
+    SharedAllocationHeader header ;
+    Kokkos::Impl::DeepCopy<CudaSpace,HostSpace>( & header , RecordBase::m_alloc_ptr , sizeof(SharedAllocationHeader) );
+
+    Kokkos::Profiling::deallocateData(
+      Kokkos::Profiling::SpaceHandle(Kokkos::CudaSpace::name()),header.m_label,
+      data(),size());
+  }
+  #endif
+
+  m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
+                    , SharedAllocationRecord< void , void >::m_alloc_size
+                    );
+}
+
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+~SharedAllocationRecord()
+{
+  #if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Cuda::fence(); // Ensure the UVM-resident header label is accessible from the host
+    Kokkos::Profiling::deallocateData(
+      Kokkos::Profiling::SpaceHandle(Kokkos::CudaUVMSpace::name()),RecordBase::m_alloc_ptr->m_label,
+      data(),size());
+  }
+  #endif
+
+  m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
+                    , SharedAllocationRecord< void , void >::m_alloc_size
+                    );
+}
+
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+~SharedAllocationRecord()
+{
+  #if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::deallocateData(
+      Kokkos::Profiling::SpaceHandle(Kokkos::CudaHostPinnedSpace::name()),RecordBase::m_alloc_ptr->m_label,
+      data(),size());
+  }
+  #endif
+
+  m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
+                    , SharedAllocationRecord< void , void >::m_alloc_size
+                    );
+}
+
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
+                      , const std::string       & arg_label
+                      , const size_t              arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::CudaSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_tex_obj( 0 )
+  , m_space( arg_space )
+{
+  #if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
+  }
+  #endif
+
+  SharedAllocationHeader header ;
+
+  // Fill in the Header information
+  header.m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
+
+  strncpy( header.m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+
+  // Copy to device memory
+  Kokkos::Impl::DeepCopy<CudaSpace,HostSpace>( RecordBase::m_alloc_ptr , & header , sizeof(SharedAllocationHeader) );
+}
+
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
+                      , const std::string          & arg_label
+                      , const size_t                 arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_tex_obj( 0 )
+  , m_space( arg_space )
+{
+  #if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
+  }
+  #endif
+  // Fill in the Header information, directly accessible via UVM
+
+  RecordBase::m_alloc_ptr->m_record = this ;
+
+  strncpy( RecordBase::m_alloc_ptr->m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+}
+
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
+                      , const std::string                 & arg_label
+                      , const size_t                        arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_space( arg_space )
+{
+  #if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
+  }
+  #endif
+  // Fill in the Header information, directly accessible via UVM
+
+  RecordBase::m_alloc_ptr->m_record = this ;
+
+  strncpy( RecordBase::m_alloc_ptr->m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+}
+
+//----------------------------------------------------------------------------
+
+void * SharedAllocationRecord< Kokkos::CudaSpace , void >::
+allocate_tracked( const Kokkos::CudaSpace & arg_space
+                , const std::string & arg_alloc_label
+                , const size_t arg_alloc_size )
+{
+  if ( ! arg_alloc_size ) return (void *) 0 ;
+
+  SharedAllocationRecord * const r =
+    allocate( arg_space , arg_alloc_label , arg_alloc_size );
+
+  RecordBase::increment( r );
+
+  return r->data();
+}
+
+void SharedAllocationRecord< Kokkos::CudaSpace , void >::
+deallocate_tracked( void * const arg_alloc_ptr )
+{
+  if ( arg_alloc_ptr != 0 ) {
+    SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
+
+    RecordBase::decrement( r );
+  }
+}
+
+void * SharedAllocationRecord< Kokkos::CudaSpace , void >::
+reallocate_tracked( void * const arg_alloc_ptr
+                  , const size_t arg_alloc_size )
+{
+  SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
+  SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
+
+  Kokkos::Impl::DeepCopy<CudaSpace,CudaSpace>( r_new->data() , r_old->data()
+                                             , std::min( r_old->size() , r_new->size() ) );
+
+  RecordBase::increment( r_new );
+  RecordBase::decrement( r_old );
+
+  return r_new->data();
+}
+
+void * SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+allocate_tracked( const Kokkos::CudaUVMSpace & arg_space
+                , const std::string & arg_alloc_label
+                , const size_t arg_alloc_size )
+{
+  if ( ! arg_alloc_size ) return (void *) 0 ;
+
+  SharedAllocationRecord * const r =
+    allocate( arg_space , arg_alloc_label , arg_alloc_size );
+
+  RecordBase::increment( r );
+
+  return r->data();
+}
+
+void SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+deallocate_tracked( void * const arg_alloc_ptr )
+{
+  if ( arg_alloc_ptr != 0 ) {
+
+    SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
+
+    RecordBase::decrement( r );
+  }
+}
+
+void * SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+reallocate_tracked( void * const arg_alloc_ptr
+                  , const size_t arg_alloc_size )
+{
+  SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
+  SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
+
+  Kokkos::Impl::DeepCopy<CudaUVMSpace,CudaUVMSpace>( r_new->data() , r_old->data()
+                                             , std::min( r_old->size() , r_new->size() ) );
+
+  RecordBase::increment( r_new );
+  RecordBase::decrement( r_old );
+
+  return r_new->data();
+}
+
+void * SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+allocate_tracked( const Kokkos::CudaHostPinnedSpace & arg_space
+                , const std::string & arg_alloc_label
+                , const size_t arg_alloc_size )
+{
+  if ( ! arg_alloc_size ) return (void *) 0 ;
+
+  SharedAllocationRecord * const r =
+    allocate( arg_space , arg_alloc_label , arg_alloc_size );
+
+  RecordBase::increment( r );
+
+  return r->data();
+}
+
+void SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+deallocate_tracked( void * const arg_alloc_ptr )
+{
+  if ( arg_alloc_ptr != 0 ) {
+    SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
+
+    RecordBase::decrement( r );
+  }
+}
+
+void * SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+reallocate_tracked( void * const arg_alloc_ptr
+                  , const size_t arg_alloc_size )
+{
+  SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
+  SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
+
+  Kokkos::Impl::DeepCopy<CudaHostPinnedSpace,CudaHostPinnedSpace>( r_new->data() , r_old->data()
+                                             , std::min( r_old->size() , r_new->size() ) );
+
+  RecordBase::increment( r_new );
+  RecordBase::decrement( r_old );
+
+  return r_new->data();
+}
+
+//----------------------------------------------------------------------------
+
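+// For CudaSpace the allocation header resides in device memory, so the record
+// is located by searching the host-side root record list instead of
+// dereferencing the header in place.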
+SharedAllocationRecord< Kokkos::CudaSpace , void > *
+SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr )
+{
+  using RecordBase = SharedAllocationRecord< void , void > ;
+  using RecordCuda = SharedAllocationRecord< Kokkos::CudaSpace , void > ;
+
+#if 0
+  using Header     = SharedAllocationHeader ;
+
+  // Copy the header from the allocation
+  Header head ;
+
+  Header const * const head_cuda = alloc_ptr ? Header::get_header( alloc_ptr ) : (Header*) 0 ;
+
+  if ( alloc_ptr ) {
+    Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>( & head , head_cuda , sizeof(SharedAllocationHeader) );
+  }
+
+  RecordCuda * const record = alloc_ptr ? static_cast< RecordCuda * >( head.m_record ) : (RecordCuda *) 0 ;
+
+  if ( ! alloc_ptr || record->m_alloc_ptr != head_cuda ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
+  }
+
+#else
+
+  // Iterate the list to search for the record among all allocations
+  // requires obtaining the root of the list and then locking the list.
+
+  RecordCuda * const record = static_cast< RecordCuda * >( RecordBase::find( & s_root_record , alloc_ptr ) );
+
+  if ( record == 0 ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
+  }
+
+#endif
+
+  return record ;
+}
+
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void > *
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record( void * alloc_ptr )
+{
+  using Header     = SharedAllocationHeader ;
+  using RecordCuda = SharedAllocationRecord< Kokkos::CudaUVMSpace , void > ;
+
+  Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ;
+
+  if ( ! alloc_ptr || h->m_record->m_alloc_ptr != h ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record ERROR" ) );
+  }
+
+  return static_cast< RecordCuda * >( h->m_record );
+}
+
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > *
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void * alloc_ptr )
+{
+  using Header     = SharedAllocationHeader ;
+  using RecordCuda = SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > ;
+
+  Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ;
+
+  if ( ! alloc_ptr || h->m_record->m_alloc_ptr != h ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record ERROR" ) );
+  }
+
+  return static_cast< RecordCuda * >( h->m_record );
+}
+
+// Iterate records to print orphaned memory ...
+void
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+print_records( std::ostream & s , const Kokkos::CudaSpace & , bool detail )
+{
+  SharedAllocationRecord< void , void > * r = & s_root_record ;
+
+  char buffer[256] ;
+
+  SharedAllocationHeader head ;
+
+  if ( detail ) {
+    do {
+      if ( r->m_alloc_ptr ) {
+        Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
+      }
+      else {
+        head.m_label[0] = 0 ;
+      }
+
+      //Formatting dependent on sizeof(uintptr_t)
+      const char * format_string;
+
+      if (sizeof(uintptr_t) == sizeof(unsigned long)) {
+        format_string = "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
+      }
+      else { // assume sizeof(uintptr_t) == sizeof(unsigned long long)
+        format_string = "Cuda addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ 0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
+      }
+
+      snprintf( buffer , 256
+              , format_string
+              , reinterpret_cast<uintptr_t>( r )
+              , reinterpret_cast<uintptr_t>( r->m_prev )
+              , reinterpret_cast<uintptr_t>( r->m_next )
+              , reinterpret_cast<uintptr_t>( r->m_alloc_ptr )
+              , r->m_alloc_size
+              , r->m_count
+              , reinterpret_cast<uintptr_t>( r->m_dealloc )
+              , head.m_label
+              );
+      s << buffer ;
+      r = r->m_next ;
+    } while ( r != & s_root_record );
+  }
+  else {
+    do {
+      if ( r->m_alloc_ptr ) {
+
+        Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
+
+        //Formatting dependent on sizeof(uintptr_t)
+        const char * format_string;
+
+        if (sizeof(uintptr_t) == sizeof(unsigned long)) {
+          format_string = "Cuda [ 0x%.12lx + %ld ] %s\n";
+        }
+        else { // assume sizeof(uintptr_t) == sizeof(unsigned long long)
+          format_string = "Cuda [ 0x%.12llx + %ld ] %s\n";
+        }
+
+        snprintf( buffer , 256
+                , format_string
+                , reinterpret_cast< uintptr_t >( r->data() )
+                , r->size()
+                , head.m_label
+                );
+      }
+      else {
+        snprintf( buffer , 256 , "Cuda [ 0 + 0 ]\n" );
+      }
+      s << buffer ;
+      r = r->m_next ;
+    } while ( r != & s_root_record );
+  }
+}
+
+void
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+print_records( std::ostream & s , const Kokkos::CudaUVMSpace & , bool detail )
+{
+  SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaUVM" , & s_root_record , detail );
+}
+
+void
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & , bool detail )
+{
+  SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
+}
+
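+// Persistent device scratch buffer: grows on demand and shrinks only when
+// force_shrink is set.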
+void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
+  static void* ptr = NULL;
+  static std::int64_t current_size = 0;
+  if(current_size == 0) {
+    current_size = bytes;
+    ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
+  }
+  if(bytes > current_size) {
+    current_size = bytes;
+    ptr = Kokkos::kokkos_realloc<Kokkos::CudaSpace>(ptr,current_size);
+  }
+  if((bytes < current_size) && (force_shrink)) {
+    current_size = bytes;
+    Kokkos::kokkos_free<Kokkos::CudaSpace>(ptr);
+    ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
+  }
+  return ptr;
+}
+
+} // namespace Impl
+} // namespace Kokkos
+#else
+void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {}
+#endif // KOKKOS_ENABLE_CUDA
+
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5726e02180d084fa91914ae736b9d0f9a6b44d2d
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp
@@ -0,0 +1,180 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
+#define KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <impl/Kokkos_Traits.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
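+// Construct the caller's DestructFunctor in the storage that immediately
+// follows the SharedAllocationRecord and register the type-erased destroy
+// callback on the record.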
+template< class DestructFunctor >
+SharedAllocationRecord *
+shared_allocation_record( Kokkos::CudaSpace const & arg_space
+                        , void *            const   arg_alloc_ptr
+                        , DestructFunctor   const & arg_destruct )
+{
+  SharedAllocationRecord * const record = SharedAllocationRecord::get_record( arg_alloc_ptr );
+
+  // assert: record != 0
+
+  // assert: sizeof(DestructFunctor) <= record->m_destruct_size
+
+  // assert: record->m_destruct_function == 0
+
+  DestructFunctor * const functor =
+    reinterpret_cast< DestructFunctor * >(
+    reinterpret_cast< uintptr_t >( record ) + sizeof(SharedAllocationRecord) );
+
+  new( functor ) DestructFunctor( arg_destruct );
+
+  record->m_destruct_functor = & shared_allocation_destroy< DestructFunctor > ;
+
+  return record ;
+}
+
+
+/// class CudaUnmanagedAllocator
+/// does nothing when deallocate(ptr,size) is called
+struct CudaUnmanagedAllocator
+{
+  static const char * name()
+  {
+    return "Cuda Unmanaged Allocator";
+  }
+
+  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
+
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaUnmanagedAllocator
+/// does nothing when deallocate(ptr,size) is called
+struct CudaUnmanagedUVMAllocator
+{
+  static const char * name()
+  {
+    return "Cuda Unmanaged UVM Allocator";
+  }
+
+  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
+
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaUnmanagedHostAllocator
+/// does nothing when deallocate(ptr,size) is called
+class CudaUnmanagedHostAllocator
+{
+public:
+  static const char * name()
+  {
+    return "Cuda Unmanaged Host Allocator";
+  }
+  // Unmanaged deallocate does nothing
+  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
+};
+
+/// class CudaMallocAllocator
+class CudaMallocAllocator
+{
+public:
+  static const char * name()
+  {
+    return "Cuda Malloc Allocator";
+  }
+
+  static void* allocate(size_t size);
+
+  static void deallocate(void * ptr, size_t);
+
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaUVMAllocator
+class CudaUVMAllocator
+{
+public:
+  static const char * name()
+  {
+    return "Cuda UVM Allocator";
+  }
+
+  static void* allocate(size_t size);
+
+  static void deallocate(void * ptr, size_t);
+
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaHostAllocator
+class CudaHostAllocator
+{
+public:
+  static const char * name()
+  {
+    return "Cuda Host Allocator";
+  }
+
+  static void* allocate(size_t size);
+
+  static void deallocate(void * ptr, size_t);
+
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+};
+
+
+}} // namespace Kokkos::Impl
+
+#endif //KOKKOS_ENABLE_CUDA
+
+#endif // #ifndef KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
+
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..55c7c782fe3b66f94d446e2bed6c10881f1cf5b5
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
@@ -0,0 +1,68 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_ERROR_HPP
+#define KOKKOS_CUDA_ERROR_HPP
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+
+namespace Kokkos { namespace Impl {
+
+void cuda_device_synchronize();
+
+void cuda_internal_error_throw( cudaError e , const char * name, const char * file = NULL, const int line = 0 );
+
+inline void cuda_internal_safe_call( cudaError e , const char * name, const char * file = NULL, const int line = 0)
+{
+  if ( cudaSuccess != e ) { cuda_internal_error_throw( e , name, file, line ); }
+}
+
+#define CUDA_SAFE_CALL( call )  \
+	Kokkos::Impl::cuda_internal_safe_call( call , #call, __FILE__, __LINE__ )
+
+}} // namespace Kokkos::Impl
+
+#endif //KOKKOS_ENABLE_CUDA
+#endif //KOKKOS_CUDA_ERROR_HPP
+
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..052c1d2482e04447562681381b1b20f994ee524e
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
@@ -0,0 +1,826 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/*--------------------------------------------------------------------------*/
+/* Kokkos interfaces */
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <Kokkos_Core.hpp>
+
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+#include <Cuda/Kokkos_Cuda_Internal.hpp>
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+/*--------------------------------------------------------------------------*/
+/* Standard 'C' libraries */
+#include <cstdlib>
+
+/* Standard 'C++' libraries */
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+
+__device__ __constant__
+unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+namespace {
+
+__global__
+void query_cuda_kernel_arch( int * d_arch )
+{
+#if defined( __CUDA_ARCH__ )
+  *d_arch = __CUDA_ARCH__ ;
+#else
+  *d_arch = 0 ;
+#endif
+}
+
+/** Query what compute capability is actually launched to the device: */
+int cuda_kernel_arch()
+{
+  int * d_arch = 0 ;
+  cudaMalloc( (void **) & d_arch , sizeof(int) );
+  query_cuda_kernel_arch<<<1,1>>>( d_arch );
+  int arch = 0 ;
+  cudaMemcpy( & arch , d_arch , sizeof(int) , cudaMemcpyDefault );
+  cudaFree( d_arch );
+  return arch ;
+}
+
+#ifdef KOKKOS_ENABLE_CUDA_UVM
+bool cuda_launch_blocking()
+{
+  const char * env = getenv("CUDA_LAUNCH_BLOCKING");
+
+  if (env == 0) return false;
+
+  return atoi(env);
+}
+#endif
+
+}
+
+void cuda_device_synchronize()
+{
+  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+}
+
+void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line )
+{
+  std::ostringstream out ;
+  out << name << " error( " << cudaGetErrorName(e) << "): " << cudaGetErrorString(e);
+  if (file) {
+    out << " " << file << ":" << line;
+  }
+  throw_runtime_exception( out.str() );
+}
+
+//----------------------------------------------------------------------------
+// Some significant cuda device properties:
+//
+// cudaDeviceProp::name                : Text label for device
+// cudaDeviceProp::major               : Device major number
+// cudaDeviceProp::minor               : Device minor number
+// cudaDeviceProp::warpSize            : number of threads per warp
+// cudaDeviceProp::multiProcessorCount : number of multiprocessors
+// cudaDeviceProp::sharedMemPerBlock   : capacity of shared memory per block
+// cudaDeviceProp::totalConstMem       : capacity of constant memory
+// cudaDeviceProp::totalGlobalMem      : capacity of global memory
+// cudaDeviceProp::maxGridSize[3]      : maximum grid size
+
+//
+//  Section 4.4.2.4 of the CUDA Toolkit Reference Manual
+//
+// struct cudaDeviceProp {
+//   char name[256];
+//   size_t totalGlobalMem;
+//   size_t sharedMemPerBlock;
+//   int regsPerBlock;
+//   int warpSize;
+//   size_t memPitch;
+//   int maxThreadsPerBlock;
+//   int maxThreadsDim[3];
+//   int maxGridSize[3];
+//   size_t totalConstMem;
+//   int major;
+//   int minor;
+//   int clockRate;
+//   size_t textureAlignment;
+//   int deviceOverlap;
+//   int multiProcessorCount;
+//   int kernelExecTimeoutEnabled;
+//   int integrated;
+//   int canMapHostMemory;
+//   int computeMode;
+//   int concurrentKernels;
+//   int ECCEnabled;
+//   int pciBusID;
+//   int pciDeviceID;
+//   int tccDriver;
+//   int asyncEngineCount;
+//   int unifiedAddressing;
+//   int memoryClockRate;
+//   int memoryBusWidth;
+//   int l2CacheSize;
+//   int maxThreadsPerMultiProcessor;
+// };
+
+
+namespace {
+
+
+
+class CudaInternalDevices {
+public:
+  enum { MAXIMUM_DEVICE_COUNT = 64 };
+  struct cudaDeviceProp  m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ;
+  int                    m_cudaDevCount ;
+
+  CudaInternalDevices();
+
+  static const CudaInternalDevices & singleton();
+};
+
+CudaInternalDevices::CudaInternalDevices()
+{
+  // See 'cudaSetDeviceFlags' for host-device thread interaction
+  // Section 4.4.2.6 of the CUDA Toolkit Reference Manual
+
+  CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) );
+
+  if(m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
+    Kokkos::abort("Sorry, you have more GPUs per node than we thought anybody would ever have. Please report this to github.com/kokkos/kokkos.");
+  }
+  for ( int i = 0 ; i < m_cudaDevCount ; ++i ) {
+    CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) );
+  }
+}
+
+const CudaInternalDevices & CudaInternalDevices::singleton()
+{
+  static CudaInternalDevices self ; return self ;
+}
+
+}
+
+//----------------------------------------------------------------------------
+
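+// Per-process singleton holding the selected device, its queried limits, and
+// the scratch space / flags / unified buffers used internally by Cuda kernels.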
+class CudaInternal {
+private:
+
+  CudaInternal( const CudaInternal & );
+  CudaInternal & operator = ( const CudaInternal & );
+
+
+public:
+
+  typedef Cuda::size_type size_type ;
+
+  int         m_cudaDev ;
+  int         m_cudaArch ;
+  unsigned    m_multiProcCount ;
+  unsigned    m_maxWarpCount ;
+  unsigned    m_maxBlock ;
+  unsigned    m_maxSharedWords ;
+  uint32_t    m_maxConcurrency ;
+  size_type   m_scratchSpaceCount ;
+  size_type   m_scratchFlagsCount ;
+  size_type   m_scratchUnifiedCount ;
+  size_type   m_scratchUnifiedSupported ;
+  size_type   m_streamCount ;
+  size_type * m_scratchSpace ;
+  size_type * m_scratchFlags ;
+  size_type * m_scratchUnified ;
+  uint32_t  * m_scratchConcurrentBitset ;
+  cudaStream_t * m_stream ;
+
+  static int was_initialized;
+  static int was_finalized;
+
+  static CudaInternal & singleton();
+
+  int verify_is_initialized( const char * const label ) const ;
+
+  int is_initialized() const
+    { return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
+
+  void initialize( int cuda_device_id , int stream_count );
+  void finalize();
+
+  void print_configuration( std::ostream & ) const ;
+
+  ~CudaInternal();
+
+  CudaInternal()
+    : m_cudaDev( -1 )
+    , m_cudaArch( -1 )
+    , m_multiProcCount( 0 )
+    , m_maxWarpCount( 0 )
+    , m_maxBlock( 0 )
+    , m_maxSharedWords( 0 )
+    , m_maxConcurrency( 0 )
+    , m_scratchSpaceCount( 0 )
+    , m_scratchFlagsCount( 0 )
+    , m_scratchUnifiedCount( 0 )
+    , m_scratchUnifiedSupported( 0 )
+    , m_streamCount( 0 )
+    , m_scratchSpace( 0 )
+    , m_scratchFlags( 0 )
+    , m_scratchUnified( 0 )
+    , m_scratchConcurrentBitset( 0 )
+    , m_stream( 0 )
+    {}
+
+  size_type * scratch_space( const size_type size );
+  size_type * scratch_flags( const size_type size );
+  size_type * scratch_unified( const size_type size );
+};
+
+int CudaInternal::was_initialized = 0;
+int CudaInternal::was_finalized = 0;
+//----------------------------------------------------------------------------
+
+
+void CudaInternal::print_configuration( std::ostream & s ) const
+{
+  const CudaInternalDevices & dev_info = CudaInternalDevices::singleton();
+
+#if defined( KOKKOS_ENABLE_CUDA )
+    s << "macro  KOKKOS_ENABLE_CUDA      : defined" << std::endl ;
+#endif
+#if defined( CUDA_VERSION )
+    s << "macro  CUDA_VERSION          = " << CUDA_VERSION
+      << " = version " << CUDA_VERSION / 1000
+      << "." << ( CUDA_VERSION % 1000 ) / 10
+      << std::endl ;
+#endif
+
+  for ( int i = 0 ; i < dev_info.m_cudaDevCount ; ++i ) {
+    s << "Kokkos::Cuda[ " << i << " ] "
+      << dev_info.m_cudaProp[i].name
+      << " capability " << dev_info.m_cudaProp[i].major << "." << dev_info.m_cudaProp[i].minor
+      << ", Total Global Memory: " << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem)
+      << ", Shared Memory per Block: " << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock);
+    if ( m_cudaDev == i ) s << " : Selected" ;
+    s << std::endl ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+CudaInternal::~CudaInternal()
+{
+  if ( m_stream ||
+       m_scratchSpace ||
+       m_scratchFlags ||
+       m_scratchUnified ||
+       m_scratchConcurrentBitset ) {
+    std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
+              << std::endl ;
+    std::cerr.flush();
+  }
+
+  m_cudaDev                 = -1 ;
+  m_cudaArch                = -1 ;
+  m_multiProcCount          = 0 ;
+  m_maxWarpCount            = 0 ;
+  m_maxBlock                = 0 ;
+  m_maxSharedWords          = 0 ;
+  m_maxConcurrency          = 0 ;
+  m_scratchSpaceCount       = 0 ;
+  m_scratchFlagsCount       = 0 ;
+  m_scratchUnifiedCount     = 0 ;
+  m_scratchUnifiedSupported = 0 ;
+  m_streamCount             = 0 ;
+  m_scratchSpace            = 0 ;
+  m_scratchFlags            = 0 ;
+  m_scratchUnified          = 0 ;
+  m_scratchConcurrentBitset = 0 ;
+  m_stream                  = 0 ;
+}
+
+int CudaInternal::verify_is_initialized( const char * const label ) const
+{
+  if ( m_cudaDev < 0 ) {
+    std::cerr << "Kokkos::Cuda::" << label << " : ERROR device not initialized" << std::endl ;
+  }
+  return 0 <= m_cudaDev ;
+}
+
+CudaInternal & CudaInternal::singleton()
+{
+  static CudaInternal self ;
+  return self ;
+}
+
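+// One-time device setup: select the device, check the compiled compute
+// capability against the hardware, allocate initial scratch space, and build
+// the concurrent-token bitset used by kernels to acquire unique tokens.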
+void CudaInternal::initialize( int cuda_device_id , int stream_count )
+{
+  if ( was_finalized ) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
+  was_initialized = 1;
+  if ( is_initialized() ) return;
+
+  enum { WordSize = sizeof(size_type) };
+
+  if ( ! HostSpace::execution_space::is_initialized() ) {
+    const std::string msg("Cuda::initialize ERROR : HostSpace::execution_space is not initialized");
+    throw_runtime_exception( msg );
+  }
+
+  const CudaInternalDevices & dev_info = CudaInternalDevices::singleton();
+
+  const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ;
+
+  const bool ok_id   = 0 <= cuda_device_id &&
+                            cuda_device_id < dev_info.m_cudaDevCount ;
+
+  // Need device capability 3.0 or better
+
+  const bool ok_dev = ok_id &&
+    ( 3 <= dev_info.m_cudaProp[ cuda_device_id ].major &&
+      0 <= dev_info.m_cudaProp[ cuda_device_id ].minor );
+
+  if ( ok_init && ok_dev ) {
+
+    const struct cudaDeviceProp & cudaProp =
+      dev_info.m_cudaProp[ cuda_device_id ];
+
+    m_cudaDev = cuda_device_id ;
+
+    CUDA_SAFE_CALL( cudaSetDevice( m_cudaDev ) );
+    Kokkos::Impl::cuda_device_synchronize();
+
+    // Query what compute capability architecture a kernel executes:
+    m_cudaArch = cuda_kernel_arch();
+
+    int compiled_major = m_cudaArch / 100;
+    int compiled_minor = ( m_cudaArch % 100 ) / 10;
+
+    if ( compiled_major < 5 && cudaProp.major >= 5 ) {
+      std::stringstream ss;
+      ss << "Kokkos::Cuda::initialize ERROR: running kernels compiled for compute capability "
+         << compiled_major << "." << compiled_minor
+         << " (< 5.0) on device with compute capability "
+         << cudaProp.major << "." << cudaProp.minor
+         << " (>=5.0), this would give incorrect results!"
+         << std::endl ;
+      std::string msg = ss.str();
+      Kokkos::abort( msg.c_str() );
+    }
+    if ( Kokkos::show_warnings() && (compiled_major != cudaProp.major || compiled_minor != cudaProp.minor) ) {
+      std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
+                << compiled_major << "." << compiled_minor
+                << " on device with compute capability "
+                << cudaProp.major << "." << cudaProp.minor
+                << " , this will likely reduce potential performance."
+                << std::endl ;
+    }
+
+    // number of multiprocessors
+
+    m_multiProcCount = cudaProp.multiProcessorCount ;
+
+    //----------------------------------
+    // Maximum number of warps per block,
+    // capped at WarpSize so that a single warp (one thread per warp)
+    // can perform the final intra-block reduction.
+
+    m_maxWarpCount = cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize ;
+
+    if ( Impl::CudaTraits::WarpSize < m_maxWarpCount ) {
+      m_maxWarpCount = Impl::CudaTraits::WarpSize ;
+    }
+
+    m_maxSharedWords = cudaProp.sharedMemPerBlock / WordSize ;
+
+    //----------------------------------
+    // Maximum number of blocks:
+
+    m_maxBlock = cudaProp.maxGridSize[0] ;
+
+    //----------------------------------
+
+    m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;
+
+    if ( Kokkos::show_warnings() && ! m_scratchUnifiedSupported ) {
+      std::cout << "Kokkos::Cuda device "
+                << cudaProp.name << " capability "
+                << cudaProp.major << "." << cudaProp.minor
+                << " does not support unified virtual address space"
+                << std::endl ;
+    }
+
+    //----------------------------------
+    // Multiblock reduction uses scratch flags for counters
+    // and scratch space for partial reduction values.
+    // Allocate some initial space.  This will grow as needed.
+
+    {
+      const unsigned reduce_block_count = m_maxWarpCount * Impl::CudaTraits::WarpSize ;
+
+      (void) scratch_unified( 16 * sizeof(size_type) );
+      (void) scratch_flags( reduce_block_count * 2  * sizeof(size_type) );
+      (void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
+    }
+    //----------------------------------
+    // Concurrent bitset for obtaining unique tokens from within
+    // an executing kernel.
+    {
+      const unsigned max_threads_per_sm = 2048 ; // up to capability 7.0
+
+      m_maxConcurrency =
+        max_threads_per_sm * cudaProp.multiProcessorCount ;
+
+      const int32_t buffer_bound =
+         Kokkos::Impl::concurrent_bitset::buffer_bound( m_maxConcurrency );
+
+      // Allocate and initialize uint32_t[ buffer_bound ]
+
+      typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
+
+      Record * const r = Record::allocate( Kokkos::CudaSpace()
+                                         , "InternalScratchBitset"
+                                         , sizeof(uint32_t) * buffer_bound );
+
+      Record::increment( r );
+
+      m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>( r->data() );
+
+      CUDA_SAFE_CALL( cudaMemset( m_scratchConcurrentBitset , 0 , sizeof(uint32_t) * buffer_bound ) );
+
+    }
+    //----------------------------------
+
+    if ( stream_count ) {
+      m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) );
+      m_streamCount = stream_count ;
+      for ( size_type i = 0 ; i < m_streamCount ; ++i ) m_stream[i] = 0 ;
+    }
+  }
+  else {
+
+    std::ostringstream msg ;
+    msg << "Kokkos::Cuda::initialize(" << cuda_device_id << ") FAILED" ;
+
+    if ( ! ok_init ) {
+      msg << " : Already initialized" ;
+    }
+    if ( ! ok_id ) {
+      msg << " : Device identifier out of range "
+          << "[0.." << dev_info.m_cudaDevCount << "]" ;
+    }
+    else if ( ! ok_dev ) {
+      msg << " : Device " ;
+      msg << dev_info.m_cudaProp[ cuda_device_id ].major ;
+      msg << "." ;
+      msg << dev_info.m_cudaProp[ cuda_device_id ].minor ;
+      msg << " has insufficient capability, required 3.0 or better" ;
+    }
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  #ifdef KOKKOS_ENABLE_CUDA_UVM
+    if( Kokkos::show_warnings() && !cuda_launch_blocking() ) {
+      std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
+      std::cout << "                                  without setting CUDA_LAUNCH_BLOCKING=1." << std::endl;
+      std::cout << "                                  The code must call Cuda::fence() after each kernel" << std::endl;
+      std::cout << "                                  or will likely crash when accessing data on the host." << std::endl;
+    }
+
+    const char * env_force_device_alloc = getenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC");
+    bool force_device_alloc;
+    if (env_force_device_alloc == 0) force_device_alloc=false;
+    else force_device_alloc=atoi(env_force_device_alloc)!=0;
+
+    const char * env_visible_devices = getenv("CUDA_VISIBLE_DEVICES");
+    bool visible_devices_one=true;
+    if (env_visible_devices == 0) visible_devices_one=false;
+
+    if( Kokkos::show_warnings() && (!visible_devices_one && !force_device_alloc) ) {
+      std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
+      std::cout << "                                  without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or " << std::endl;
+      std::cout << "                                  setting CUDA_VISIBLE_DEVICES." << std::endl;
+      std::cout << "                                  This could on multi GPU systems lead to severe performance" << std::endl;
+      std::cout << "                                  penalties." << std::endl;
+    }
+  #endif
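+
+  // Illustrative environment setup for the UVM case warned about above (the
+  // variable names are the ones checked in that block; the values are a
+  // sketch, not a requirement):
+  //   export CUDA_LAUNCH_BLOCKING=1
+  //   export CUDA_VISIBLE_DEVICES=0
+  //   # or: export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1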
+
+  cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
+
+  // Init the array used for arbitrarily sized atomics
+  Impl::initialize_host_cuda_lock_arrays();
+}
+
+//----------------------------------------------------------------------------
+
+typedef Cuda::size_type ScratchGrain[ Impl::CudaTraits::WarpSize ] ;
+enum { sizeScratchGrain = sizeof(ScratchGrain) };
+
+
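+// Scratch allocations below are rounded up to whole grains: a request of
+// `size` bytes becomes ( size + sizeScratchGrain - 1 ) / sizeScratchGrain
+// grains, i.e. WarpSize size_type words per grain.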
+Cuda::size_type *
+CudaInternal::scratch_flags( const Cuda::size_type size )
+{
+  if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) {
+
+    m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+
+    typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
+
+    Record * const r = Record::allocate( Kokkos::CudaSpace()
+                                       , "InternalScratchFlags"
+                                       , ( sizeof( ScratchGrain ) * m_scratchFlagsCount ) );
+
+    Record::increment( r );
+
+    m_scratchFlags = reinterpret_cast<size_type *>( r->data() );
+
+    CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
+  }
+
+  return m_scratchFlags ;
+}
+
+Cuda::size_type *
+CudaInternal::scratch_space( const Cuda::size_type size )
+{
+  if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) {
+
+    m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+
+    typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
+
+    Record * const r = Record::allocate( Kokkos::CudaSpace()
+                                       , "InternalScratchSpace"
+                                       , ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) );
+
+    Record::increment( r );
+
+    m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
+  }
+
+  return m_scratchSpace ;
+}
+
+Cuda::size_type *
+CudaInternal::scratch_unified( const Cuda::size_type size )
+{
+  if ( verify_is_initialized("scratch_unified") &&
+       m_scratchUnifiedSupported && m_scratchUnifiedCount * sizeScratchGrain < size ) {
+
+    m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+
+    typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > Record ;
+
+    Record * const r = Record::allocate( Kokkos::CudaHostPinnedSpace()
+                                       , "InternalScratchUnified"
+                                       , ( sizeof( ScratchGrain ) * m_scratchUnifiedCount ) );
+
+    Record::increment( r );
+
+    m_scratchUnified = reinterpret_cast<size_type *>( r->data() );
+  }
+
+  return m_scratchUnified ;
+}
+
+//----------------------------------------------------------------------------
+
+void CudaInternal::finalize()
+{
+  was_finalized = 1;
+  if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
+
+    Impl::finalize_host_cuda_lock_arrays();
+
+    if ( m_stream ) {
+      for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
+        cudaStreamDestroy( m_stream[i] );
+        m_stream[i] = 0 ;
+      }
+      ::free( m_stream );
+    }
+
+    typedef Kokkos::Impl::SharedAllocationRecord< CudaSpace > RecordCuda ;
+    typedef Kokkos::Impl::SharedAllocationRecord< CudaHostPinnedSpace > RecordHost ;
+
+    RecordCuda::decrement( RecordCuda::get_record( m_scratchFlags ) );
+    RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
+    RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
+    RecordCuda::decrement( RecordCuda::get_record( m_scratchConcurrentBitset ) );
+
+    m_cudaDev             = -1 ;
+    m_multiProcCount      = 0 ;
+    m_maxWarpCount        = 0 ;
+    m_maxBlock            = 0 ;
+    m_maxSharedWords      = 0 ;
+    m_scratchSpaceCount   = 0 ;
+    m_scratchFlagsCount   = 0 ;
+    m_scratchUnifiedCount = 0 ;
+    m_streamCount         = 0 ;
+    m_scratchSpace        = 0 ;
+    m_scratchFlags        = 0 ;
+    m_scratchUnified      = 0 ;
+    m_scratchConcurrentBitset = 0 ;
+    m_stream              = 0 ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+Cuda::size_type cuda_internal_multiprocessor_count()
+{ return CudaInternal::singleton().m_multiProcCount ; }
+
+CudaSpace::size_type cuda_internal_maximum_concurrent_block_count()
+{
+  // Compute capability 5.0 through 6.2
+  enum : int { max_resident_blocks_per_multiprocessor = 32 };
+
+  return CudaInternal::singleton().m_multiProcCount
+         * max_resident_blocks_per_multiprocessor ;
+}
+
+Cuda::size_type cuda_internal_maximum_warp_count()
+{ return CudaInternal::singleton().m_maxWarpCount ; }
+
+Cuda::size_type cuda_internal_maximum_grid_count()
+{ return CudaInternal::singleton().m_maxBlock ; }
+
+Cuda::size_type cuda_internal_maximum_shared_words()
+{ return CudaInternal::singleton().m_maxSharedWords ; }
+
+Cuda::size_type * cuda_internal_scratch_space( const Cuda::size_type size )
+{ return CudaInternal::singleton().scratch_space( size ); }
+
+Cuda::size_type * cuda_internal_scratch_flags( const Cuda::size_type size )
+{ return CudaInternal::singleton().scratch_flags( size ); }
+
+Cuda::size_type * cuda_internal_scratch_unified( const Cuda::size_type size )
+{ return CudaInternal::singleton().scratch_unified( size ); }
+
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+Cuda::size_type Cuda::detect_device_count()
+{ return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; }
+
+int Cuda::concurrency()
+{ return Impl::CudaInternal::singleton().m_maxConcurrency ; }
+
+int Cuda::is_initialized()
+{ return Impl::CudaInternal::singleton().is_initialized(); }
+
+void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
+{
+  Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
+
+  #if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+  #endif
+}
+
+std::vector<unsigned>
+Cuda::detect_device_arch()
+{
+  const Impl::CudaInternalDevices & s = Impl::CudaInternalDevices::singleton();
+
+  std::vector<unsigned> output( s.m_cudaDevCount );
+
+  for ( int i = 0 ; i < s.m_cudaDevCount ; ++i ) {
+    output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor ;
+  }
+
+  return output ;
+}
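+
+// Note: detect_device_arch() and device_arch() encode capability X.Y as
+// X * 100 + Y, e.g. a compute capability 6.1 device is reported as 601.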
+
+Cuda::size_type Cuda::device_arch()
+{
+  const int dev_id = Impl::CudaInternal::singleton().m_cudaDev ;
+
+  int dev_arch = 0 ;
+
+  if ( 0 <= dev_id ) {
+    const struct cudaDeviceProp & cudaProp =
+      Impl::CudaInternalDevices::singleton().m_cudaProp[ dev_id ] ;
+
+    dev_arch = cudaProp.major * 100 + cudaProp.minor ;
+  }
+
+  return dev_arch ;
+}
+
+void Cuda::finalize()
+{
+  Impl::CudaInternal::singleton().finalize();
+
+  #if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+  #endif
+}
+
+Cuda::Cuda()
+  : m_device( Impl::CudaInternal::singleton().m_cudaDev )
+  , m_stream( 0 )
+{
+  Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" );
+}
+
+Cuda::Cuda( const int instance_id )
+  : m_device( Impl::CudaInternal::singleton().m_cudaDev )
+  , m_stream(
+      Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" )
+        ? Impl::CudaInternal::singleton().m_stream[ instance_id % Impl::CudaInternal::singleton().m_streamCount ]
+        : 0 )
+{}
+
+void Cuda::print_configuration( std::ostream & s , const bool )
+{ Impl::CudaInternal::singleton().print_configuration( s ); }
+
+bool Cuda::sleep() { return false ; }
+
+bool Cuda::wake() { return true ; }
+
+void Cuda::fence()
+{
+  Kokkos::Impl::cuda_device_synchronize();
+}
+
+const char* Cuda::name() { return "Cuda"; }
+
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+
+UniqueToken< Kokkos::Cuda , Kokkos::Experimental::UniqueTokenScope::Global >::
+UniqueToken( Kokkos::Cuda const & )
+  : m_buffer( Kokkos::Impl::CudaInternal::singleton().m_scratchConcurrentBitset )
+  , m_count(  Kokkos::Impl::CudaInternal::singleton().m_maxConcurrency )
+  {}
+
+} // namespace Experimental
+} // namespace Kokkos
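+
+// A minimal usage sketch (illustrative only; the acquire()/release()/size()
+// members are assumed here and are not defined in this file):
+//
+//   Kokkos::Experimental::UniqueToken< Kokkos::Cuda > token( Kokkos::Cuda() );
+//   int id = token.acquire();     // unique id in [ 0 , token.size() )
+//   /* ... use a per-token resource ... */
+//   token.release( id );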
+
+#else
+
+void KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR() {}
+
+#endif // KOKKOS_ENABLE_CUDA
+
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..31f405dd840f2a49e784d1ec0653a67a593a13f9
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp
@@ -0,0 +1,201 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_INTERNAL_HPP
+#define KOKKOS_CUDA_INTERNAL_HPP
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include<iostream>
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+
+namespace Kokkos { namespace Impl {
+
+template<class DriverType, bool Large>
+struct CudaGetMaxBlockSize;
+
+template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
+int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
+  return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
+}
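+
+// The two specializations below differ only in which launch stub they pass to
+// cudaOccupancyMaxActiveBlocksPerMultiprocessor: driver objects larger than
+// CudaTraits::ConstantMemoryUseThreshold use the constant-memory launch stub,
+// smaller ones the local-memory stub.  Each starts at a block size of 32 and
+// doubles it (up to 1024) while at least one block stays resident, returning
+// the largest block size that still fit.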
+
+
+template<class DriverType>
+struct CudaGetMaxBlockSize<DriverType,true> {
+  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
+    int numBlocks;
+    int blockSize=32;
+    int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                    FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &numBlocks,
+        cuda_parallel_launch_constant_memory<DriverType>,
+        blockSize,
+        sharedmem);
+
+    while (blockSize<1024 && numBlocks>0) {
+      blockSize*=2;
+      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &numBlocks,
+          cuda_parallel_launch_constant_memory<DriverType>,
+          blockSize,
+          sharedmem);
+    }
+    if(numBlocks>0) return blockSize;
+    else return blockSize/2;
+  }
+};
+
+template<class DriverType>
+struct CudaGetMaxBlockSize<DriverType,false> {
+  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
+    int numBlocks;
+
+    int blockSize=32;
+    int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                    FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &numBlocks,
+        cuda_parallel_launch_local_memory<DriverType>,
+        blockSize,
+        sharedmem);
+
+    while (blockSize<1024 && numBlocks>0) {
+      blockSize*=2;
+      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &numBlocks,
+          cuda_parallel_launch_local_memory<DriverType>,
+          blockSize,
+          sharedmem);
+    }
+    if(numBlocks>0) return blockSize;
+    else return blockSize/2;
+  }
+};
+
+
+
+template<class DriverType, bool Large>
+struct CudaGetOptBlockSize;
+
+template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
+int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
+  return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
+}
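+
+// The "opt" variants below sweep block sizes from 32 up to 1024 and keep the
+// size that maximizes resident threads per multiprocessor
+// ( numBlocks * blockSize ), rather than simply the largest size that fits.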
+
+template<class DriverType>
+struct CudaGetOptBlockSize<DriverType,true> {
+  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
+    int blockSize=16;
+    int numBlocks;
+    int sharedmem;
+    int maxOccupancy=0;
+    int bestBlockSize=0;
+
+    while(blockSize<1024) {
+      blockSize*=2;
+
+      //calculate the occupancy with that optBlockSize and check whether its larger than the largest one found so far
+      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+              &numBlocks,
+              cuda_parallel_launch_constant_memory<DriverType>,
+              blockSize,
+              sharedmem);
+      if(maxOccupancy < numBlocks*blockSize) {
+         maxOccupancy = numBlocks*blockSize;
+         bestBlockSize = blockSize;
+      }
+    }
+    return bestBlockSize;
+  }
+};
+
+template<class DriverType>
+struct CudaGetOptBlockSize<DriverType,false> {
+  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
+    int blockSize=16;
+    int numBlocks;
+    int sharedmem;
+    int maxOccupancy=0;
+    int bestBlockSize=0;
+
+    while(blockSize<1024) {
+      blockSize*=2;
+      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+              &numBlocks,
+              cuda_parallel_launch_local_memory<DriverType>,
+              blockSize,
+              sharedmem);
+
+      if(maxOccupancy < numBlocks*blockSize) {
+        maxOccupancy = numBlocks*blockSize;
+        bestBlockSize = blockSize;
+      }
+    }
+    return bestBlockSize;
+  }
+};
+
+}} // namespace Kokkos::Impl
+
+#endif // KOKKOS_ENABLE_CUDA
+#endif /* #ifndef KOKKOS_CUDA_INTERNAL_HPP */
+
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f1828ea2de75926a912ed2dea8a8aaa8ea515543
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
@@ -0,0 +1,119 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+#include <Kokkos_Cuda.hpp>
+
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+namespace Kokkos {
+namespace Impl {
+__device__ __constant__
+CudaLockArrays g_device_cuda_lock_arrays = { nullptr, nullptr, 0 };
+}
+}
+#endif
+
+namespace Kokkos {
+
+namespace {
+
+__global__ void init_lock_array_kernel_atomic() {
+  unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
+  if(i<CUDA_SPACE_ATOMIC_MASK+1) {
+    Kokkos::Impl::g_device_cuda_lock_arrays.atomic[i] = 0;
+  }
+}
+
+__global__ void init_lock_array_kernel_threadid(int N) {
+  unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
+  if(i<(unsigned)N) {
+    Kokkos::Impl::g_device_cuda_lock_arrays.scratch[i] = 0;
+  }
+}
+
+} // namespace
+
+namespace Impl {
+
+CudaLockArrays g_host_cuda_lock_arrays = { nullptr, nullptr, 0 };
+
+void initialize_host_cuda_lock_arrays() {
+  if (g_host_cuda_lock_arrays.atomic != nullptr) return;
+  CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic,
+                 sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1)));
+  CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
+                 sizeof(int)*(Cuda::concurrency())));
+  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  g_host_cuda_lock_arrays.n = Cuda::concurrency();
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+  init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+1+255)/256,256>>>();
+  init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
+  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+}
+
+void finalize_host_cuda_lock_arrays() {
+  if (g_host_cuda_lock_arrays.atomic == nullptr) return;
+  cudaFree(g_host_cuda_lock_arrays.atomic);
+  g_host_cuda_lock_arrays.atomic = nullptr;
+  cudaFree(g_host_cuda_lock_arrays.scratch);
+  g_host_cuda_lock_arrays.scratch = nullptr;
+  g_host_cuda_lock_arrays.n = 0;
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+#endif
+}
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+#else
+
+void KOKKOS_CORE_SRC_CUDA_CUDA_LOCKS_PREVENT_LINK_ERROR() {}
+
+#endif
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..80192bf338a9cfef23729de5b2c6896cc8bb7663
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
@@ -0,0 +1,179 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_LOCKS_HPP
+#define KOKKOS_CUDA_LOCKS_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <cstdint>
+
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+struct CudaLockArrays {
+  std::int32_t* atomic;
+  std::int32_t* scratch;
+  std::int32_t n;
+};
+
+/// \brief This global variable in Host space is the central definition
+///        of these arrays.
+extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays ;
+
+/// \brief After this call, the g_host_cuda_lock_arrays variable has
+///        valid, initialized arrays.
+///
+/// This call is idempotent.
+void initialize_host_cuda_lock_arrays();
+
+/// \brief After this call, the g_host_cuda_lock_arrays variable has
+///        all null pointers, and all array memory has been freed.
+///
+/// This call is idempotent.
+void finalize_host_cuda_lock_arrays();
+
+} // namespace Impl
+} // namespace Kokkos
+
+#if defined( __CUDACC__ )
+
+namespace Kokkos {
+namespace Impl {
+
+/// \brief This global variable in CUDA space is what kernels use
+///        to get access to the lock arrays.
+///
+/// When relocatable device code is enabled, there can be one single
+/// instance of this global variable for the entire executable,
+/// whose definition will be in Kokkos_Cuda_Locks.cpp (and whose declaration
+/// here must then be extern).
+/// This one instance will be initialized by initialize_host_cuda_lock_arrays
+/// and need not be modified afterwards.
+///
+/// When relocatable device code is disabled, an instance of this variable
+/// will be created in every translation unit that sees this header file
+/// (we make this clear by marking it static, meaning no other translation
+///  unit can link to it).
+/// Since the Kokkos_Cuda_Locks.cpp translation unit cannot initialize the
+/// instances in other translation units, we must update this CUDA global
+/// variable based on the Host global variable prior to running any kernels
+/// that will use it.
+/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
+__device__
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+__constant__
+extern
+#endif
+Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays ;
+
+#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+__device__ inline
+bool lock_address_cuda_space(void* ptr) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  return (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset],0,1));
+}
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully acquiring the lock with
+/// lock_address_cuda_space.
+__device__ inline
+void unlock_address_cuda_space(void* ptr) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  atomicExch( &Kokkos::Impl::g_device_cuda_lock_arrays.atomic[ offset ], 0);
+}
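+
+// A minimal usage sketch (illustrative only; 'T' and 'locked_store' are
+// hypothetical names, not part of this header): device code guards an
+// arbitrarily sized store by acquiring the hashed lock, performing the
+// update, and releasing the lock.
+//
+//   template< class T >
+//   __device__ void locked_store( T * ptr , T const & val ) {
+//     bool done = false ;
+//     while ( ! done ) {
+//       if ( Kokkos::Impl::lock_address_cuda_space( (void*) ptr ) ) {
+//         *ptr = val ;   // critical section, protected by the per-hash lock
+//         Kokkos::Impl::unlock_address_cuda_space( (void*) ptr );
+//         done = true ;
+//       }
+//     }
+//   }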
+
+} // namespace Impl
+} // namespace Kokkos
+
+// Give lock_array_copied explicit translation-unit scope
+namespace Kokkos {
+namespace Impl {
+namespace {
+  static int lock_array_copied = 0;
+}
+}
+}
+/* Dan Ibanez: it is critical that this code be a macro, so that it will
+   capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
+   putting this in an inline function will NOT do the right thing! */
+#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
+{ \
+  if(::Kokkos::Impl::lock_array_copied == 0) { \
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol( \
+        Kokkos::Impl::g_device_cuda_lock_arrays , \
+        & Kokkos::Impl::g_host_cuda_lock_arrays , \
+        sizeof(Kokkos::Impl::CudaLockArrays) ) ); \
+  } \
+  lock_array_copied = 1; \
+  \
+}
+
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+#else
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
+#endif
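+
+// A minimal sketch of the intended call pattern (illustrative only;
+// 'launch_uses_lock_arrays' and 'some_kernel' are hypothetical names, not
+// part of Kokkos): each translation unit refreshes its own __device__ copy
+// of the lock arrays before launching a kernel that relies on them.
+//
+//   void launch_uses_lock_arrays( dim3 grid , dim3 block ) {
+//     KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); // no-op when RDC is enabled
+//     some_kernel<<< grid , block >>>();
+//   }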
+
+#endif /* defined( __CUDACC__ ) */
+
+#endif /* defined( KOKKOS_ENABLE_CUDA ) */
+
+#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e417544fd8a599477d9828a871846e68dcf240f3
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@@ -0,0 +1,2213 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_PARALLEL_HPP
+#define KOKKOS_CUDA_PARALLEL_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
+
+#include <iostream>
+#include <algorithm>
+#include <cstdio>
+#include <cstdint>
+
+#include <utility>
+#include <Kokkos_Parallel.hpp>
+
+#include <Cuda/Kokkos_CudaExec.hpp>
+#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
+#include <Cuda/Kokkos_Cuda_Internal.hpp>
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
+#include <Kokkos_Vectorization.hpp>
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <typeinfo>
+#endif
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::Cuda , Properties ... >: public PolicyTraits<Properties ... >
+{
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicyInternal      execution_policy ;
+
+  typedef PolicyTraits<Properties ... > traits;
+
+private:
+
+  enum { MAX_WARP = 8 };
+
+  int m_league_size ;
+  int m_team_size ;
+  int m_vector_length ;
+  int m_team_scratch_size[2] ;
+  int m_thread_scratch_size[2] ;
+  int m_chunk_size;
+
+public:
+
+  //! Execution space of this execution policy
+  typedef Kokkos::Cuda  execution_space ;
+
+  TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
+    m_league_size = p.m_league_size;
+    m_team_size = p.m_team_size;
+    m_vector_length = p.m_vector_length;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
+    m_chunk_size = p.m_chunk_size;
+    return *this;
+  }
+
+  //----------------------------------------
+
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & functor )
+    {
+      int n = MAX_WARP * Impl::CudaTraits::WarpSize ;
+
+      for ( ; n ; n >>= 1 ) {
+        const int shmem_size =
+          /* for global reduce */ Impl::cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,typename traits::work_tag>( functor , n )
+          /* for team   reduce */ + ( n + 2 ) * sizeof(double)
+          /* for team   shared */ + Impl::FunctorTeamShmemSize< FunctorType >::value( functor , n );
+
+        if ( shmem_size < Impl::CudaTraits::SharedMemoryCapacity ) break ;
+      }
+
+      return n ;
+    }
+
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & functor )
+    { return team_size_max( functor ); }
+
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & functor , const int vector_length)
+    {
+      int max = team_size_max( functor )/vector_length;
+      if(max<1) max = 1;
+      return max;
+    }
+
+  inline static
+  int vector_length_max()
+    { return Impl::CudaTraits::WarpSize; }
+
+  //----------------------------------------
+
+  inline int vector_length()   const { return m_vector_length ; }
+  inline int team_size()   const { return m_team_size ; }
+  inline int league_size() const { return m_league_size ; }
+  inline int scratch_size(int level, int team_size_ = -1) const {
+    if(team_size_<0) team_size_ = m_team_size;
+    return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level];
+  }
+  inline int team_scratch_size(int level) const {
+    return m_team_scratch_size[level];
+  }
+  inline int thread_scratch_size(int level) const {
+    return m_thread_scratch_size[level];
+  }
+
+  TeamPolicyInternal()
+    : m_league_size( 0 )
+    , m_team_size( 0 )
+    , m_vector_length( 0 )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 32 )
+   {}
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal( execution_space &
+            , int league_size_
+            , int team_size_request
+            , int vector_length_request = 1 )
+    : m_league_size( league_size_ )
+    , m_team_size( team_size_request )
+    , m_vector_length( vector_length_request )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 32 )
+    {
+      // Allow only power-of-two vector_length
+      if ( ! Kokkos::Impl::is_integral_power_of_two( vector_length_request ) ) {
+        Impl::throw_runtime_exception( "Requested non-power-of-two vector length for TeamPolicy.");
+      }
+
+      // Make sure league size is permissible
+      if(league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
+        Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
+
+      // Make sure total block size is permissible
+      if ( m_team_size * m_vector_length > 1024 ) {
+        Impl::throw_runtime_exception(std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. Team size x vector length must be smaller than 1024."));
+      }
+    }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal( execution_space &
+            , int league_size_
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int vector_length_request = 1 )
+    : m_league_size( league_size_ )
+    , m_team_size( -1 )
+    , m_vector_length( vector_length_request )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 32 )
+    {
+      // Allow only power-of-two vector_length
+      if ( ! Kokkos::Impl::is_integral_power_of_two( vector_length_request ) ) {
+        Impl::throw_runtime_exception( "Requested non-power-of-two vector length for TeamPolicy.");
+      }
+
+      // Make sure league size is permissible
+      if(league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
+        Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
+    }
+
+  TeamPolicyInternal( int league_size_
+            , int team_size_request
+            , int vector_length_request = 1 )
+    : m_league_size( league_size_ )
+    , m_team_size( team_size_request )
+    , m_vector_length ( vector_length_request )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 32 )
+    {
+      // Allow only power-of-two vector_length
+      if ( ! Kokkos::Impl::is_integral_power_of_two( vector_length_request ) ) {
+        Impl::throw_runtime_exception( "Requested non-power-of-two vector length for TeamPolicy.");
+      }
+
+      // Make sure league size is permissible
+      if(league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
+        Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
+
+      // Make sure total block size is permissible
+      if ( m_team_size * m_vector_length > 1024 ) {
+        Impl::throw_runtime_exception(std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. Team size x vector length must be smaller than 1024."));
+      }
+    }
+
+  TeamPolicyInternal( int league_size_
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int vector_length_request = 1 )
+    : m_league_size( league_size_ )
+    , m_team_size( -1 )
+    , m_vector_length ( vector_length_request )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 32 )
+    {
+      // Allow only power-of-two vector_length
+      if ( ! Kokkos::Impl::is_integral_power_of_two( vector_length_request ) ) {
+        Impl::throw_runtime_exception( "Requested non-power-of-two vector length for TeamPolicy.");
+      }
+
+      // Make sure league size is permissible
+      if(league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
+        Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
+    }
+
+  inline int chunk_size() const { return m_chunk_size ; }
+
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
+    TeamPolicyInternal p = *this;
+    p.m_chunk_size = chunk_size_;
+    return p;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
+    TeamPolicyInternal p = *this;
+    p.m_team_scratch_size[level] = per_team.value;
+    return p;
+  };
+
+  /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal p = *this;
+    p.m_thread_scratch_size[level] = per_thread.value;
+    return p;
+  };
+
+  /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal p = *this;
+    p.m_team_scratch_size[level] = per_team.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
+    return p;
+  };
+
+  typedef Kokkos::Impl::CudaTeamMember member_type ;
+
+protected:
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal internal_set_chunk_size(typename traits::index_type chunk_size_) {
+    m_chunk_size = chunk_size_;
+    return *this;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerTeamValue& per_team) {
+    m_team_scratch_size[level] = per_team.value;
+    return *this;
+  };
+
+  /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerThreadValue& per_thread) {
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  };
+
+  /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) {
+    m_team_scratch_size[level] = per_team.value;
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  };
+};
+
+} // namespace Impl
+} // namespace Kokkos
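+
+// A minimal construction sketch (illustrative only; 'league_size' is a
+// hypothetical variable) of the user-facing policy whose checks the
+// constructors above implement: the vector length must be a power of two
+// and team_size * vector_length may not exceed 1024.
+//
+//   Kokkos::TeamPolicy< Kokkos::Cuda > policy( league_size , Kokkos::AUTO , 4 );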
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Traits ... >
+                 , Kokkos::Cuda
+                 >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::launch_bounds LaunchBounds ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+
+  ParallelFor() = delete ;
+  ParallelFor & operator = ( const ParallelFor & ) = delete ;
+
+  template< class TagType >
+  inline __device__
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const Member i ) const
+    { m_functor( i ); }
+
+  template< class TagType >
+  inline __device__
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const Member i ) const
+    { m_functor( TagType() , i ); }
+
+public:
+
+  typedef FunctorType functor_type ;
+
+  inline
+  __device__
+  void operator()(void) const
+    {
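+      // Grid-stride loop: each thread starts at begin() + its global index
+      // ( threadIdx.y within the block, blocks along gridDim.x ) and advances
+      // by the total number of launched threads, blockDim.y * gridDim.x .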
+      const Member work_stride = blockDim.y * gridDim.x ;
+      const Member work_end    = m_policy.end();
+
+      for ( Member
+              iwork =  m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x ;
+              iwork <  work_end ;
+              iwork += work_stride ) {
+        this-> template exec_range< WorkTag >( iwork );
+      }
+    }
+
+  inline
+  void execute() const
+    {
+      const int nwork = m_policy.end() - m_policy.begin();
+      const int block_size = Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( m_functor , 1, 0 , 0 );
+      const dim3 block(  1 , block_size , 1);
+      const dim3 grid( std::min( ( nwork + block.y - 1 ) / block.y , cuda_internal_maximum_grid_count() ) , 1 , 1);
+
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+
+  ParallelFor( const FunctorType  & arg_functor ,
+               const Policy       & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    { }
+};
+
+
+// MDRangePolicy impl
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::MDRangePolicy< Traits ... >
+                 , Kokkos::Cuda
+                 >
+{
+private:
+  typedef Kokkos::MDRangePolicy< Traits ...  > Policy ;
+  using RP = Policy;
+  typedef typename Policy::array_index_type array_index_type;
+  typedef typename Policy::index_type index_type;
+  typedef typename Policy::launch_bounds LaunchBounds;
+
+
+  const FunctorType m_functor ;
+  const Policy      m_rp ;
+
+public:
+
+  inline
+  __device__
+  void operator()(void) const
+    {
+      Kokkos::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
+    }
+
+
+  inline
+  void execute() const
+  {
+    const array_index_type maxblocks = static_cast<array_index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+    if ( RP::rank == 2 )
+    {
+      const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , 1);
+      const dim3 grid(
+            std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
+          , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
+          , 1
+          );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+    else if ( RP::rank == 3 )
+    {
+      const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , m_rp.m_tile[2] );
+      const dim3 grid(
+          std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
+        , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
+        , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+    else if ( RP::rank == 4 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2] , m_rp.m_tile[3] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1 ) / block.y , maxblocks )
+        , std::min( ( m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+    else if ( RP::rank == 5 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( ( m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+    else if ( RP::rank == 6 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4]*m_rp.m_tile[5] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        ,  std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( static_cast<index_type>( m_rp.m_tile_end[4] * m_rp.m_tile_end[5] )
+                  , static_cast<index_type>(maxblocks) )
+        );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+    else
+    {
+      printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
+      Kokkos::abort("Aborting");
+    }
+
+  } //end execute
+
+//  inline
+  ParallelFor( const FunctorType & arg_functor
+             , Policy arg_policy )
+    : m_functor( arg_functor )
+    , m_rp(  arg_policy )
+    {}
+};
+
+
+template< class FunctorType , class ... Properties >
+class ParallelFor< FunctorType
+                 , Kokkos::TeamPolicy< Properties ... >
+                 , Kokkos::Cuda
+                 >
+{
+private:
+
+  typedef TeamPolicyInternal< Kokkos::Cuda , Properties ... >   Policy ;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::launch_bounds  LaunchBounds ;
+
+public:
+
+  typedef FunctorType      functor_type ;
+  typedef Cuda::size_type  size_type ;
+
+private:
+
+  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y == blockDim.z == 1
+  // shared memory utilization:
+  //
+  //  [ team   reduce space ]
+  //  [ team   shared space ]
+  //
+
+  const FunctorType  m_functor ;
+  const size_type    m_league_size ;
+  const size_type    m_team_size ;
+  const size_type    m_vector_size ;
+  const int m_shmem_begin ;
+  const int m_shmem_size ;
+  void*              m_scratch_ptr[2] ;
+  const int m_scratch_size[2] ;
+
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_team( const Member & member ) const
+    { m_functor( member ); }
+
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_team( const Member & member ) const
+    { m_functor( TagType() , member ); }
+
+public:
+
+  __device__ inline
+  void operator()(void) const
+  {
+    // Iterate this block through the league
+    int threadid = 0;
+    if ( m_scratch_size[1]>0 ) {
+      __shared__ int base_thread_id;
+      if (threadIdx.x==0 && threadIdx.y==0 ) {
+        threadid = (blockIdx.x*blockDim.z + threadIdx.z) %
+          (Kokkos::Impl::g_device_cuda_lock_arrays.n / (blockDim.x * blockDim.y));
+        threadid *= blockDim.x * blockDim.y;
+        int done = 0;
+        while (!done) {
+          done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
+          if(!done) {
+            threadid += blockDim.x * blockDim.y;
+            if(threadid+blockDim.x * blockDim.y >= Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
+          }
+        }
+        base_thread_id = threadid;
+      }
+      __syncthreads();
+      threadid = base_thread_id;
+    }
+
+
+    const int int_league_size = (int)m_league_size;
+    for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
+
+      this-> template exec_team< WorkTag >(
+        typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
+                                    , m_shmem_begin
+                                    , m_shmem_size
+                                    , (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
+                                    , m_scratch_size[1]
+                                    , league_rank
+                                    , m_league_size ) );
+    }
+    if ( m_scratch_size[1]>0 ) {
+      __syncthreads();
+      if (threadIdx.x==0 && threadIdx.y==0 )
+        Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
+    }
+  }
+
+  inline
+  void execute() const
+    {
+      const int64_t shmem_size_total = m_shmem_begin + m_shmem_size ;
+      const dim3 grid( int(m_league_size) , 1 , 1 );
+      const dim3 block( int(m_vector_size) , int(m_team_size) , 1 );
+
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
+
+    }
+
+  ParallelFor( const FunctorType  & arg_functor
+             , const Policy       & arg_policy
+             )
+    : m_functor( arg_functor )
+    , m_league_size( arg_policy.league_size() )
+    , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
+        Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length() )
+    , m_vector_size( arg_policy.vector_length() )
+    , m_shmem_begin( sizeof(double) * ( m_team_size + 2 ) )
+    , m_shmem_size( arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( m_functor , m_team_size ) )
+    , m_scratch_ptr{NULL,NULL}
+    , m_scratch_size{arg_policy.scratch_size(0,m_team_size),arg_policy.scratch_size(1,m_team_size)}
+    {
+      // Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
+      m_scratch_ptr[1] = cuda_resize_scratch_space(m_scratch_size[1]*(Cuda::concurrency()/(m_team_size*m_vector_size)));
+
+      const int shmem_size_total = m_shmem_begin + m_shmem_size ;
+      if ( CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
+        Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory"));
+      }
+
+      if ( int(m_team_size) >
+           int(Kokkos::Impl::cuda_get_max_block_size< ParallelFor >
+                 ( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
+        Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
+      }
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
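+//----------------------------------------------------------------------------
+// Illustrative usage: a minimal host-side sketch (guarded out of compilation)
+// of a TeamPolicy parallel_for that would dispatch to the specialization
+// above.  The functor name "ExampleTeamFor" and the 1 KB per-team level-1
+// scratch request are assumptions for illustration only; the level-1 scratch
+// request is what exercises the lock-array claiming logic in operator().
+#if 0
+struct ExampleTeamFor {
+  typedef Kokkos::TeamPolicy< Kokkos::Cuda >::member_type member_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const member_type & team ) const {
+    // Each team sees its own league_rank and per-team scratch allocations.
+    (void) team.league_rank();
+  }
+};
+
+inline void example_team_parallel_for()
+{
+  Kokkos::parallel_for(
+    Kokkos::TeamPolicy< Kokkos::Cuda >( /* league_size = */ 128 , Kokkos::AUTO )
+      .set_scratch_size( 1 , Kokkos::PerTeam( 1024 ) )
+    , ExampleTeamFor() );
+}
+#endif
+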
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ReducerType, class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::RangePolicy< Traits ... >
+                    , ReducerType
+                    , Kokkos::Cuda
+                    >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... >         Policy ;
+
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::launch_bounds LaunchBounds ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, WorkTag, void>::type WorkTagFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTagFwd > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTagFwd > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTagFwd > ValueJoin ;
+
+public:
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::value_type      value_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef FunctorType                           functor_type ;
+  typedef Cuda::size_type                       size_type ;
+
+  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.x == blockDim.z == 1
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ;
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+  size_type *         m_scratch_space ;
+  size_type *         m_scratch_flags ;
+  size_type *         m_unified_space ;
+
+  // Use the shfl-based reduction only for statically sized value types of more than 128 bits.
+  enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
+  // Tag types used to overload run() for the two reduction implementations.
+private:
+  typedef double DummyShflReductionType;
+  typedef int DummySHMEMReductionType;
+
+public:
+  // Make the exec_range calls call to Reduce::DeviceIterateTile
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const Member & i , reference_type update ) const
+    { m_functor( i , update ); }
+
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const Member & i , reference_type update ) const
+    { m_functor( TagType() , i , update ); }
+
+  __device__ inline
+  void operator() () const {
+    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
+  }
+
+  __device__ inline
+  void run(const DummySHMEMReductionType& ) const
+  {
+    const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+      word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
+
+    {
+      reference_type value =
+        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
+
+      // Number of blocks is bounded so that the reduction can be limited to two passes.
+      // Each thread block is given an approximately equal amount of work to perform.
+      // Accumulate the values for this block.
+      // The accumulation ordering does not match the final pass, but is arithmetically equivalent.
+
+      const WorkRange range( m_policy , blockIdx.x , gridDim.x );
+
+      for ( Member iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
+            iwork < iwork_end ; iwork += blockDim.y ) {
+        this-> template exec_range< WorkTag >( iwork , value );
+      }
+    }
+
+    // Reduce with final value at blockDim.y - 1 location.
+    if ( cuda_single_inter_block_reduce_scan<false,ReducerTypeFwd,WorkTagFwd>(
+           ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
+           kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
+
+      // This is the final block with the final result at the final thread's location
+
+      size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
+      size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
+
+      if ( threadIdx.y == 0 ) {
+        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
+      }
+
+      if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
+
+      for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
+    }
+  }
+
+  __device__ inline
+   void run(const DummyShflReductionType&) const
+   {
+
+     value_type value;
+     ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
+     // Number of blocks is bounded so that the reduction can be limited to two passes.
+     // Each thread block is given an approximately equal amount of work to perform.
+     // Accumulate the values for this block.
+     // The accumulation ordering does not match the final pass, but is arithmetically equivalent.
+
+     const WorkRange range( m_policy , blockIdx.x , gridDim.x );
+
+     for ( Member iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
+           iwork < iwork_end ; iwork += blockDim.y ) {
+       this-> template exec_range< WorkTag >( iwork , value );
+     }
+
+     pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
+
+     int max_active_thread = range.end()-range.begin() < blockDim.y ? range.end() - range.begin():blockDim.y;
+
+     max_active_thread = (max_active_thread == 0)?blockDim.y:max_active_thread;
+
+    value_type init;
+    ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
+     if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTagFwd>
+            (value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
+       const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
+       if(id==0) {
+         Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
+         *result = value;
+       }
+     }
+   }
+
+  // Determine block size constrained by shared memory:
+  static inline
+  unsigned local_block_size( const FunctorType & f )
+    {
+      unsigned n = CudaTraits::WarpSize * 8 ;
+      while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; }
+      return n ;
+    }
+
+  inline
+  void execute()
+    {
+      const int nwork = m_policy.end() - m_policy.begin();
+      if ( nwork ) {
+        const int block_size = local_block_size( m_functor );
+
+        m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ );
+        m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
+        m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
+
+        // REQUIRED ( 1 , N , 1 )
+        const dim3 block( 1 , block_size , 1 );
+        // Required grid.x <= block.y
+        const dim3 grid( std::min( int(block.y) , int( ( nwork + block.y - 1 ) / block.y ) ) , 1 , 1 );
+
+        const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
+
+        CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
+
+        Cuda::fence();
+
+        if ( m_result_ptr ) {
+          if ( m_unified_space ) {
+            const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer)  );
+            for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
+          }
+          else {
+            const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer)  );
+            DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
+          }
+        }
+      }
+      else {
+        if (m_result_ptr) {
+          ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
+        }
+      }
+    }
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const HostViewType & arg_result
+                , typename std::enable_if<
+                   Kokkos::is_view< HostViewType >::value
+                ,void*>::type = NULL)
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( InvalidType() )
+  , m_result_ptr( arg_result.data() )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  { }
+
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const ReducerType & reducer)
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr( reducer.view().data() )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  { }
+};
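+
+//----------------------------------------------------------------------------
+// Illustrative usage: a minimal host-side sketch (guarded out of compilation)
+// of a RangePolicy parallel_reduce that would dispatch to the specialization
+// above.  The functor name "ExampleRangeSum" and the view argument are
+// assumptions for illustration only; reducing into a host scalar blocks until
+// the device result has been copied back.
+#if 0
+struct ExampleRangeSum {
+  Kokkos::View< const double * , Kokkos::CudaSpace > x ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i , double & update ) const { update += x(i); }
+};
+
+inline double example_range_reduce( Kokkos::View< const double * , Kokkos::CudaSpace > x )
+{
+  double sum = 0 ;
+  Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda >( 0 , x.extent(0) )
+                         , ExampleRangeSum{ x } , sum );
+  return sum ;
+}
+#endif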
+
+
+// MDRangePolicy impl
+template< class FunctorType , class ReducerType, class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::MDRangePolicy< Traits ... >
+                    , ReducerType
+                    , Kokkos::Cuda
+                    >
+{
+private:
+
+  typedef Kokkos::MDRangePolicy< Traits ... > Policy ;
+  typedef typename Policy::array_index_type                 array_index_type;
+  typedef typename Policy::index_type                       index_type;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::launch_bounds LaunchBounds;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, WorkTag, void>::type WorkTagFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTagFwd > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTagFwd > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTagFwd > ValueJoin ;
+
+public:
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::value_type      value_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef FunctorType                           functor_type ;
+  typedef Cuda::size_type                       size_type ;
+
+  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.x == blockDim.z == 1
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ; // used for workrange and nwork
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+  size_type *         m_scratch_space ;
+  size_type *         m_scratch_flags ;
+  size_type *         m_unified_space ;
+
+  typedef typename Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
+
+  // Use the shfl-based reduction only for statically sized value types of more than 128 bits.
+  enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
+  // Tag types used to overload run() for the two reduction implementations.
+private:
+  typedef double DummyShflReductionType;
+  typedef int DummySHMEMReductionType;
+
+public:
+  inline
+  __device__
+  void
+  exec_range( reference_type update ) const
+  {
+    Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
+  }
+
+  inline
+  __device__
+  void operator() (void) const {
+    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
+  }
+
+  __device__ inline
+  void run(const DummySHMEMReductionType& ) const
+  {
+    const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+      word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
+
+    {
+      reference_type value =
+        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
+
+      // Number of blocks is bounded so that the reduction can be limited to two passes.
+      // Each thread block is given an approximately equal amount of work to perform.
+      // Accumulate the values for this block.
+      // The accumulation ordering does not match the final pass, but is arithmetically equivalent.
+
+      this-> exec_range( value );
+    }
+
+    // Reduce with final value at blockDim.y - 1 location.
+    // Problem: non power-of-two blockDim
+    if ( cuda_single_inter_block_reduce_scan<false,ReducerTypeFwd,WorkTagFwd>(
+           ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
+           kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
+
+      // This is the final block with the final result at the final thread's location
+      size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
+      size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
+
+      if ( threadIdx.y == 0 ) {
+        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
+      }
+
+      if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
+
+      for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
+    }
+  }
+
+  __device__ inline
+   void run(const DummyShflReductionType&) const
+   {
+
+     value_type value;
+     ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
+     // Number of blocks is bounded so that the reduction can be limited to two passes.
+     // Each thread block is given an approximately equal amount of work to perform.
+     // Accumulate the values for this block.
+     // The accumulation ordering does not match the final pass, but is arithmetically equivalent.
+
+     const Member work_part =
+       ( ( m_policy.m_num_tiles + ( gridDim.x - 1 ) ) / gridDim.x ); //portion of tiles handled by each block
+
+     this-> exec_range( value );
+
+     pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
+
+     int max_active_thread = work_part < blockDim.y ? work_part:blockDim.y;
+     max_active_thread = (max_active_thread == 0)?blockDim.y:max_active_thread;
+
+     value_type init;
+     ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
+     if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTagFwd>
+         (value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
+       const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
+       if(id==0) {
+         Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
+         *result = value;
+       }
+     }
+   }
+
+  // Determine block size constrained by shared memory:
+  static inline
+  unsigned local_block_size( const FunctorType & f )
+    {
+      unsigned n = CudaTraits::WarpSize * 8 ;
+      while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; }
+      return n ;
+    }
+
+  inline
+  void execute()
+    {
+      const int nwork = m_policy.m_num_tiles;
+      if ( nwork ) {
+        int block_size = m_policy.m_prod_tile_dims;
+        // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
+        // Nearest power of two
+        int exponent_pow_two = std::ceil( std::log2(block_size) );
+        block_size = std::pow(2, exponent_pow_two);
+        int suggested_blocksize = local_block_size( m_functor );
+
+        block_size = (block_size > suggested_blocksize) ? block_size : suggested_blocksize ; //Note: block_size must be less than or equal to 512
+
+
+        m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ );
+        m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
+        m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
+
+        // REQUIRED ( 1 , N , 1 )
+        const dim3 block( 1 , block_size , 1 );
+        // Required grid.x <= block.y
+        const dim3 grid( std::min( int(block.y) , int( nwork ) ) , 1 , 1 );
+
+        const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
+
+        CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
+
+        Cuda::fence();
+
+        if ( m_result_ptr ) {
+          if ( m_unified_space ) {
+            const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer)  );
+            for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
+          }
+          else {
+            const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer)  );
+            DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
+          }
+        }
+      }
+      else {
+        if (m_result_ptr) {
+          ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
+        }
+      }
+    }
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const HostViewType & arg_result
+                , typename std::enable_if<
+                   Kokkos::is_view< HostViewType >::value
+                ,void*>::type = NULL)
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( InvalidType() )
+  , m_result_ptr( arg_result.data() )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  {}
+
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const ReducerType & reducer)
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr( reducer.view().data() )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  {}
+};
+
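+//----------------------------------------------------------------------------
+// Illustrative usage: a minimal host-side sketch (guarded out of compilation)
+// of an MDRangePolicy parallel_reduce that would dispatch to the
+// specialization above.  The functor name "ExampleMDRangeSum", the view
+// argument, and the lower/upper-bound construction of the policy are
+// assumptions for illustration only.
+#if 0
+struct ExampleMDRangeSum {
+  Kokkos::View< const double ** , Kokkos::CudaSpace > a ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i , const int j , double & update ) const { update += a(i,j); }
+};
+
+inline double example_mdrange_reduce( Kokkos::View< const double ** , Kokkos::CudaSpace > a )
+{
+  double sum = 0 ;
+  Kokkos::MDRangePolicy< Kokkos::Cuda , Kokkos::Rank<2> >
+    policy( {{ 0 , 0 }} , {{ (int) a.extent(0) , (int) a.extent(1) }} );
+  Kokkos::parallel_reduce( policy , ExampleMDRangeSum{ a } , sum );
+  return sum ;
+}
+#endif
+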
+
+//----------------------------------------------------------------------------
+
+#if 1
+
+template< class FunctorType , class ReducerType, class ... Properties >
+class ParallelReduce< FunctorType
+                    , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
+                    , Kokkos::Cuda
+                    >
+{
+private:
+
+  typedef TeamPolicyInternal< Kokkos::Cuda, Properties ... >  Policy ;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::launch_bounds     LaunchBounds ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, WorkTag, void>::type WorkTagFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTagFwd > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTagFwd > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTagFwd > ValueJoin ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename ValueTraits::value_type      value_type ;
+
+public:
+
+  typedef FunctorType      functor_type ;
+  typedef Cuda::size_type  size_type ;
+
+  enum { UseShflReduction = (true && ValueTraits::StaticValueSize) };
+
+private:
+  typedef double DummyShflReductionType;
+  typedef int DummySHMEMReductionType;
+
+  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.z == 1
+  // shared memory utilization:
+  //
+  //  [ global reduce space ]
+  //  [ team   reduce space ]
+  //  [ team   shared space ]
+  //
+
+  const FunctorType   m_functor ;
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+  size_type *         m_scratch_space ;
+  size_type *         m_scratch_flags ;
+  size_type *         m_unified_space ;
+  size_type           m_team_begin ;
+  size_type           m_shmem_begin ;
+  size_type           m_shmem_size ;
+  void*               m_scratch_ptr[2] ;
+  int                 m_scratch_size[2] ;
+  const size_type     m_league_size ;
+  const size_type     m_team_size ;
+  const size_type     m_vector_size ;
+
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_team( const Member & member , reference_type update ) const
+    { m_functor( member , update ); }
+
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_team( const Member & member , reference_type update ) const
+    { m_functor( TagType() , member , update ); }
+
+public:
+
+  __device__ inline
+  void operator() () const {
+    int threadid = 0;
+    if ( m_scratch_size[1]>0 ) {
+      __shared__ int base_thread_id;
+      if (threadIdx.x==0 && threadIdx.y==0 ) {
+        threadid = (blockIdx.x*blockDim.z + threadIdx.z) %
+          (Kokkos::Impl::g_device_cuda_lock_arrays.n / (blockDim.x * blockDim.y));
+        threadid *= blockDim.x * blockDim.y;
+        int done = 0;
+        while (!done) {
+          done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
+          if(!done) {
+            threadid += blockDim.x * blockDim.y;
+            if(threadid + blockDim.x * blockDim.y >= Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
+          }
+        }
+        base_thread_id = threadid;
+      }
+      __syncthreads();
+      threadid = base_thread_id;
+    }
+
+    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0), threadid );
+    if ( m_scratch_size[1]>0 ) {
+      __syncthreads();
+      if (threadIdx.x==0 && threadIdx.y==0 )
+        Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
+    }
+  }
+
+  __device__ inline
+  void run(const DummySHMEMReductionType&, const int& threadid) const
+  {
+    const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+      word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
+
+    reference_type value =
+      ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
+
+    // Iterate this block through the league
+    const int int_league_size = (int)m_league_size;
+    for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
+      this-> template exec_team< WorkTag >
+        ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
+                                        , m_shmem_begin
+                                        , m_shmem_size
+                                        , (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
+                                        , m_scratch_size[1]
+                                        , league_rank
+                                        , m_league_size )
+        , value );
+    }
+
+    // Reduce with final value at blockDim.y - 1 location.
+    if ( cuda_single_inter_block_reduce_scan<false,FunctorType,WorkTag>(
+           ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
+           kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
+
+      // This is the final block with the final result at the final thread's location
+
+      size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
+      size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
+
+      if ( threadIdx.y == 0 ) {
+        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
+      }
+
+      if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
+
+      for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
+    }
+
+  }
+
+  __device__ inline
+  void run(const DummyShflReductionType&, const int& threadid) const
+  {
+    value_type value;
+    ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
+
+    // Iterate this block through the league
+    const int int_league_size = (int)m_league_size;
+    for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
+      this-> template exec_team< WorkTag >
+        ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
+                                        , m_shmem_begin
+                                        , m_shmem_size
+                                        , (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
+                                        , m_scratch_size[1]
+                                        , league_rank
+                                        , m_league_size )
+        , value );
+    }
+
+    pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
+
+    value_type init;
+    ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
+    if(Impl::cuda_inter_block_reduction<FunctorType,ValueJoin,WorkTag>
+           (value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,blockDim.y)) {
+      const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
+      if(id==0) {
+        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
+        *result = value;
+      }
+    }
+  }
+
+  inline
+  void execute()
+    {
+      const int nwork = m_league_size * m_team_size ;
+      if ( nwork ) {
+        const int block_count = UseShflReduction? std::min( m_league_size , size_type(1024) )
+          :std::min( m_league_size , m_team_size );
+
+        m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_count );
+        m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
+        m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
+
+        const dim3 block( m_vector_size , m_team_size , 1 );
+        const dim3 grid( block_count , 1 , 1 );
+        const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
+
+        CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
+
+        Cuda::fence();
+
+        if ( m_result_ptr ) {
+          if ( m_unified_space ) {
+            const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+            for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
+          }
+          else {
+            const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
+            DeepCopy<HostSpace,CudaSpace>( m_result_ptr, m_scratch_space, size );
+          }
+        }
+      }
+      else {
+        if (m_result_ptr) {
+          ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
+        }
+      }
+    }
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const HostViewType & arg_result
+                , typename std::enable_if<
+                                   Kokkos::is_view< HostViewType >::value
+                                ,void*>::type = NULL)
+  : m_functor( arg_functor )
+  , m_reducer( InvalidType() )
+  , m_result_ptr( arg_result.data() )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  , m_team_begin( 0 )
+  , m_shmem_begin( 0 )
+  , m_shmem_size( 0 )
+  , m_scratch_ptr{NULL,NULL}
+  , m_scratch_size{
+    arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
+        Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
+                                                                 arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
+                                                                 arg_policy.vector_length() )
+    ), arg_policy.scratch_size(1,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
+        Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
+                                                                 arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
+                                                                 arg_policy.vector_length() )
+        )}
+  , m_league_size( arg_policy.league_size() )
+  , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
+      Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
+                                                               arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
+                                                               arg_policy.vector_length() )
+  , m_vector_size( arg_policy.vector_length() )
+  {
+    // Return Init value if the number of worksets is zero
+    if( arg_policy.league_size() == 0) {
+      ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , arg_result.data() );
+      return ;
+    }
+
+    m_team_begin = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( arg_functor , m_team_size );
+    m_shmem_begin = sizeof(double) * ( m_team_size + 2 );
+    m_shmem_size = arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , m_team_size );
+    m_scratch_ptr[1] = cuda_resize_scratch_space(static_cast<std::int64_t>(m_scratch_size[1])*(static_cast<std::int64_t>(Cuda::concurrency()/(m_team_size*m_vector_size))));
+    m_scratch_size[0] = m_shmem_size;
+    m_scratch_size[1] = arg_policy.scratch_size(1,m_team_size);
+
+    // The global parallel_reduce does not support vector_length other than 1 at the moment
+    if( (arg_policy.vector_length() > 1) && !UseShflReduction )
+      Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of greater than 1 is not currently supported for CUDA for dynamic sized reduction types.");
+
+    if( (m_team_size < 32) && !UseShflReduction )
+      Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller than 32 is not currently supported with CUDA for dynamic sized reduction types.");
+
+    // Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
+
+    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
+
+    if (! Kokkos::Impl::is_integral_power_of_two( m_team_size )  && !UseShflReduction ) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
+    }
+
+    if ( CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much L0 scratch memory"));
+    }
+
+    if ( unsigned(m_team_size) >
+         unsigned(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
+               ( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
+    }
+
+  }
+
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const ReducerType & reducer)
+  : m_functor( arg_functor )
+  , m_reducer( reducer )
+  , m_result_ptr( reducer.view().data() )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  , m_team_begin( 0 )
+  , m_shmem_begin( 0 )
+  , m_shmem_size( 0 )
+  , m_scratch_ptr{NULL,NULL}
+  , m_league_size( arg_policy.league_size() )
+  , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
+      Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
+                                                               arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
+      arg_policy.vector_length() )
+  , m_vector_size( arg_policy.vector_length() )
+  {
+    // Return Init value if the number of worksets is zero
+    if( arg_policy.league_size() == 0) {
+      ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
+      return ;
+    }
+
+    m_team_begin = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( arg_functor , m_team_size );
+    m_shmem_begin = sizeof(double) * ( m_team_size + 2 );
+    m_shmem_size = arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , m_team_size );
+    m_scratch_ptr[1] = cuda_resize_scratch_space(m_scratch_size[1]*(Cuda::concurrency()/(m_team_size*m_vector_size)));
+    m_scratch_size[0] = m_shmem_size;
+    m_scratch_size[1] = arg_policy.scratch_size(1,m_team_size);
+
+    // The global parallel_reduce does not support vector_length other than 1 at the moment
+    if( (arg_policy.vector_length() > 1) && !UseShflReduction )
+      Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of greater than 1 is not currently supported for CUDA for dynamic sized reduction types.");
+
+    if( (m_team_size < 32) && !UseShflReduction )
+      Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller than 32 is not currently supported with CUDA for dynamic sized reduction types.");
+
+    // Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
+
+    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
+
+    if ( (! Kokkos::Impl::is_integral_power_of_two( m_team_size )  && !UseShflReduction ) ||
+         CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
+    }
+
+    if ( int(m_team_size) >
+         int(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
+               ( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
+    }
+
+  }
+};
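+
+//----------------------------------------------------------------------------
+// Illustrative usage: a minimal host-side sketch (guarded out of compilation)
+// of a TeamPolicy parallel_reduce that would dispatch to the specialization
+// above.  The functor name "ExampleTeamSum" and the view argument are
+// assumptions for illustration only; only one thread per team contributes so
+// the per-thread contributions are not double counted.
+#if 0
+struct ExampleTeamSum {
+  typedef Kokkos::TeamPolicy< Kokkos::Cuda >::member_type member_type ;
+  Kokkos::View< const double * , Kokkos::CudaSpace > x ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const member_type & team , double & update ) const {
+    if ( team.team_rank() == 0 ) update += x( team.league_rank() );
+  }
+};
+
+inline double example_team_reduce( Kokkos::View< const double * , Kokkos::CudaSpace > x )
+{
+  double sum = 0 ;
+  Kokkos::parallel_reduce( Kokkos::TeamPolicy< Kokkos::Cuda >( x.extent(0) , Kokkos::AUTO )
+                         , ExampleTeamSum{ x } , sum );
+  return sum ;
+}
+#endif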
+
+//----------------------------------------------------------------------------
+#else
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class ReducerType, class ... Properties >
+class ParallelReduce< FunctorType
+                    , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
+                    , Kokkos::Cuda
+                    >
+{
+private:
+
+  enum : int { align_scratch_value = 0x0100 /* 256 */ };
+  enum : int { align_scratch_mask  = align_scratch_value - 1 };
+
+  KOKKOS_INLINE_FUNCTION static constexpr
+  int align_scratch( const int n )
+    {
+      return ( n & align_scratch_mask )
+             ? n + align_scratch_value - ( n & align_scratch_mask ) : n ;
+    }
+
+  //----------------------------------------
+  // Reducer does not wrap a functor
+  template< class R = ReducerType , class F = void >
+  struct reducer_type : public R {
+
+    template< class S >
+    using rebind = reducer_type< typename R::rebind<S> , void > ;
+
+    KOKKOS_INLINE_FUNCTION
+    reducer_type( FunctorType const *
+                , ReducerType const * arg_reducer
+                , typename R::value_type * arg_value )
+      : R( *arg_reducer , arg_value ) {}
+  };
+
+  // Reducer does wrap a functor
+  template< class R >
+  struct reducer_type< R , FunctorType > : public R {
+
+    template< class S >
+    using rebind = reducer_type< typename R::rebind<S> , FunctorType > ;
+
+    KOKKOS_INLINE_FUNCTION
+    reducer_type( FunctorType const * arg_functor
+                , ReducerType const *
+                , typename R::value_type * arg_value )
+      : R( arg_functor , arg_value ) {}
+  };
+
+  //----------------------------------------
+
+  typedef TeamPolicyInternal< Kokkos::Cuda, Properties ... >  Policy ;
+  typedef CudaTeamMember                           Member ;
+  typedef typename Policy::work_tag                WorkTag ;
+  typedef typename reducer_type<>::pointer_type    pointer_type ;
+  typedef typename reducer_type<>::reference_type  reference_type ;
+  typedef typename reducer_type<>::value_type      value_type ;
+  typedef typename Policy::launch_bounds           LaunchBounds ;
+
+  typedef Kokkos::Impl::FunctorAnalysis
+    < Kokkos::Impl::FunctorPatternInterface::REDUCE
+    , Policy
+    , FunctorType
+    > Analysis ;
+
+public:
+
+  typedef FunctorType      functor_type ;
+  typedef Cuda::size_type  size_type ;
+
+private:
+
+  const FunctorType     m_functor ;
+  const reducer_type<>  m_reducer ;
+  size_type *           m_scratch_space ;
+  size_type *           m_unified_space ;
+  size_type             m_team_begin ;
+  size_type             m_shmem_begin ;
+  size_type             m_shmem_size ;
+  void*                 m_scratch_ptr[2] ;
+  int                   m_scratch_size[2] ;
+  const size_type       m_league_size ;
+  const size_type       m_team_size ;
+  const size_type       m_vector_size ;
+
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_team( const Member & member , reference_type update ) const
+    { m_functor( member , update ); }
+
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_team( const Member & member , reference_type update ) const
+    { m_functor( TagType() , member , update ); }
+
+
+public:
+
+  __device__ inline
+  void operator() () const
+    {
+      void * const shmem = kokkos_impl_cuda_shared_memory<char>();
+
+      const bool reduce_to_host =
+        std::is_same< typename reducer_type<>::memory_space
+                    , Kokkos::HostSpace >::value &&
+        m_reducer.data();
+
+      value_type value ;
+
+      typename reducer_type<>::rebind< CudaSpace >
+        reduce( & m_functor , & m_reducer , & value );
+
+      reduce.init( reduce.data() );
+
+      // Iterate this block through the league
+
+      for ( int league_rank = blockIdx.x
+          ; league_rank < m_league_size
+          ; league_rank += gridDim.x ) {
+
+        // Initialization of team member data:
+
+        const Member member
+          ( shmem
+          , m_shmem_team_begin
+          , m_shmem_team_size
+          , reinterpret_cast<char*>(m_scratch_space) + m_global_team_begin
+          , m_global_team_size
+          , league_rank
+          , m_league_size );
+
+        ParallelReduce::template
+          exec_team< WorkTag >( member , reduce.reference() );
+      }
+
+      if ( Member::global_reduce( reduce
+                                , m_scratch_space
+                                , reinterpret_cast<char*>(m_scratch_space)
+                                  + aligned_flag_size
+                                , shmem
+                                , m_shmem_size ) ) {
+
+        // Single thread with data in value
+
+        reduce.final( reduce.data() );
+
+        if ( reduce_to_host ) {
+          reducer.copy( m_unified_space , reduce.data() );
+        }
+      }
+    }
+
+
+  inline
+  void execute()
+    {
+      const bool reduce_to_host =
+        std::is_same< typename reducer_type<>::memory_space
+                    , Kokkos::HostSpace >::value &&
+        m_reducer.data();
+
+      const bool reduce_to_gpu =
+        std::is_same< typename reducer_type<>::memory_space
+                    , Kokkos::CudaSpace >::value &&
+        m_reducer.data();
+
+      if ( m_league_size && m_team_size ) {
+
+        const int value_size = Analysis::value_size( m_functor );
+
+        m_scratch_space = cuda_internal_scratch_space( m_scratch_size );
+        m_unified_space = cuda_internal_scratch_unified( value_size );
+
+        const dim3 block( m_vector_size , m_team_size , m_team_per_block );
+        const dim3 grid( m_league_size , 1 , 1 );
+        const int  shmem = m_shmem_team_begin + m_shmem_team_size ;
+
+        // copy to device and execute
+        CudaParallelLaunch<ParallelReduce,LaunchBounds>( *this, grid, block, shmem );
+
+        Cuda::fence();
+
+        if ( reduce_to_host ) {
+          m_reducer.copy( m_reducer.data() , pointer_type(m_unified_space) );
+        }
+      }
+      else if ( reduce_to_host ) {
+        m_reducer.init( m_reducer.data() );
+      }
+      else if ( reduce_to_gpu ) {
+        value_type tmp ;
+        m_reducer.init( & tmp );
+        cudaMemcpy( m_reducer.data() , & tmp , sizeof(value_type) , cudaMemcpyHostToDevice );
+      }
+    }
+
+
+  /**\brief  Set up parameters and allocations for kernel launch.
+   *
+   *  block = { vector_size , team_size , team_per_block }
+   *  grid  = { number_of_teams , 1 , 1 }
+   *
+   *  shmem = shared memory for:
+   *    [ team_reduce_buffer
+   *    , team_scratch_buffer_level_0 ]
+   *  reused by:
+   *    [ global_reduce_buffer ]
+   *
+   *  global_scratch for:
+   *    [ global_reduce_flag_buffer
+   *    , global_reduce_value_buffer
+   *    , team_scratch_buffer_level_1 * max_concurrent_team ]
+   */
+
+  ParallelReduce( FunctorType && arg_functor
+                , Policy      && arg_policy
+                , ReducerType const & arg_reducer
+                )
+  : m_functor( arg_functor )
+    // the input reducer may wrap the input functor so must
+    // generate a reducer bound to the copied functor.
+  , m_reducer( & m_functor , & arg_reducer , arg_reducer.data() )
+  , m_scratch_space( 0 )
+  , m_unified_space( 0 )
+  , m_team_begin( 0 )
+  , m_shmem_begin( 0 )
+  , m_shmem_size( 0 )
+  , m_scratch_ptr{NULL,NULL}
+  , m_league_size( arg_policy.league_size() )
+  , m_team_per_block( 0 )
+  , m_team_size( arg_policy.team_size() )
+  , m_vector_size( arg_policy.vector_length() )
+  {
+    if ( 0 == m_league_size ) return ;
+
+    const int value_size = Analysis::value_size( m_functor );
+
+    //----------------------------------------
+    // Vector length must be <= WarpSize and power of two
+
+    const bool ok_vector = m_vector_size < CudaTraits::WarpSize &&
+      Kokkos::Impl::is_integral_power_of_two( m_vector_size );
+
+    //----------------------------------------
+
+    if ( 0 == m_team_size ) {
+      // Team size is AUTO, use a whole block per team.
+      // Calculate the block size using the occupancy calculator.
+      // Occupancy calculator assumes whole block.
+
+      m_team_size =
+        Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >
+          ( arg_functor
+          , arg_policy.vector_length()
+          , arg_policy.team_scratch_size(0)
+          , arg_policy.thread_scratch_size(0) / arg_policy.vector_length() );
+
+      m_team_per_block = 1 ;
+    }
+
+    //----------------------------------------
+    // How many CUDA threads per team.
+    // If a team needs more than one warp, or teams cannot exactly fill a warp,
+    // then use only one team per block.
+
+    const int team_threads = m_team_size * m_vector_size ;
+
+    if ( ( CudaTraits::WarpSize < team_threads ) ||
+         ( CudaTraits::WarpSize % team_threads ) ) {
+      m_team_per_block = 1 ;
+    }
+
+    //----------------------------------------
+    // How much team scratch shared memory determined from
+    // either the functor or the policy:
+
+    if ( CudaTraits::WarpSize < team_threads ) {
+      // Need inter-warp team reduction (collectives) shared memory
+      // Speculate an upper bound for the value size
+
+      m_shmem_team_begin =
+        align_scratch( CudaTraits::warp_count(team_threads) * sizeof(double) );
+    }
+
+    m_shmem_team_size = arg_policy.scratch_size(0,m_team_size);
+
+    if ( 0 == m_shmem_team_size ) {
+      m_shmem_team_size = Analysis::team_shmem_size( m_functor , m_team_size );
+    }
+
+    m_shmem_team_size = align_scratch( m_shmem_team_size );
+
+    // Can fit a team in a block:
+
+    const bool ok_shmem_team =
+      ( m_shmem_team_begin + m_shmem_team_size )
+      < CudaTraits::SharedMemoryCapacity ;
+
+    //----------------------------------------
+
+    if ( 0 == m_team_per_block ) {
+      // Potentially more than one team per block.
+      // Determine number of teams per block based upon
+      // how much team scratch can fit and exactly filling each warp.
+
+      const int team_per_warp = team_threads / CudaTraits::WarpSize ;
+
+      const int max_team_per_block =
+        Kokkos::Impl::CudaTraits::SharedMemoryCapacity
+        / shmem_team_scratch_size ;
+
+      for ( m_team_per_block = team_per_warp ;
+            m_team_per_block + team_per_warp < max_team_per_block ;
+            m_team_per_block += team_per_warp );
+    }
+
+    //----------------------------------------
+    // How much global reduce scratch shared memory.
+
+    int shmem_global_reduce_size = 8 * value_size ;
+
+    //----------------------------------------
+    // Global scratch memory requirements.
+
+    const int aligned_flag_size = align_scratch( sizeof(int) );
+
+    const int max_concurrent_block =
+      cuda_internal_maximum_concurrent_block_count();
+
+    // Reduce space has claim flag followed by value buffer
+    const int global_reduce_value_size =
+      max_concurrent_block *
+      ( aligned_flag_size + align_scratch( value_size ) );
+
+    // Scratch space has claim flag followed by scratch buffer
+    const int global_team_scratch_size =
+      max_concurrent_block * m_team_per_block *
+      ( aligned_flag_size +
+        align_scratch( arg_policy.scratch_size(1,m_team_size) / m_vector_size )
+      );
+
+    const int global_size = aligned_flag_size
+                          + global_reduce_value_size
+                          + global_team_scratch_size ;
+
+    m_global_reduce_begin = aligned_flag_size ;
+    m_global_team_begin   = m_global_reduce_begin + global_reduce_value_size ;
+    m_global_size         = m_global_team_begin + global_team_scratch_size ;
+  }
+};
+
+#endif
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelScan< FunctorType
+                  , Kokkos::RangePolicy< Traits ... >
+                  , Kokkos::Cuda
+                  >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... >  Policy ;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::launch_bounds  LaunchBounds ;
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueOps<    FunctorType, WorkTag > ValueOps ;
+
+public:
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef FunctorType                           functor_type ;
+  typedef Cuda::size_type                       size_type ;
+
+private:
+
+  // Algorithmic constraints:
+  //  (a) blockDim.y is a power of two
+  //  (b) blockDim.y == blockDim.z == 1
+  //  (c) gridDim.x  <= blockDim.y * blockDim.y
+  //  (d) gridDim.y  == gridDim.z == 1
+
+  const FunctorType m_functor ;
+  const Policy      m_policy ;
+  size_type *       m_scratch_space ;
+  size_type *       m_scratch_flags ;
+  size_type         m_final ;
+
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const Member & i , reference_type update , const bool final_result ) const
+    { m_functor( i , update , final_result ); }
+
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const Member & i , reference_type update , const bool final_result ) const
+    { m_functor( TagType() , i , update , final_result ); }
+
+  //----------------------------------------
+
+  __device__ inline
+  void initial(void) const
+  {
+    const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+      word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
+
+    size_type * const shared_value = kokkos_impl_cuda_shared_memory<size_type>() + word_count.value * threadIdx.y ;
+
+    ValueInit::init( m_functor , shared_value );
+
+    // Number of blocks is bounded so that the reduction can be limited to two passes.
+    // Each thread block is given an approximately equal amount of work to perform.
+    // Accumulate the values for this block.
+    // The accumulation ordering does not match the final pass, but is arithmetically equivalent.
+
+    const WorkRange range( m_policy , blockIdx.x , gridDim.x );
+
+    for ( Member iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
+          iwork < iwork_end ; iwork += blockDim.y ) {
+      this-> template exec_range< WorkTag >( iwork , ValueOps::reference( shared_value ) , false );
+    }
+
+    // Reduce and scan, writing out scan of blocks' totals and block-groups' totals.
+    // Blocks' scan values are written to 'blockIdx.x' location.
+    // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i < gridDim.x
+    cuda_single_inter_block_reduce_scan<true,FunctorType,WorkTag>( m_functor , blockIdx.x , gridDim.x , kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags );
+  }
+
+  //----------------------------------------
+
+  __device__ inline
+  void final(void) const
+  {
+    const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+      word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
+
+    // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , value[2] , ... }
+    size_type * const shared_data   = kokkos_impl_cuda_shared_memory<size_type>();
+    size_type * const shared_prefix = shared_data + word_count.value * threadIdx.y ;
+    size_type * const shared_accum  = shared_data + word_count.value * ( blockDim.y + 1 );
+
+    // Starting value for this thread block is the previous block's total.
+    if ( blockIdx.x ) {
+      size_type * const block_total = m_scratch_space + word_count.value * ( blockIdx.x - 1 );
+      for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i] ; }
+    }
+    else if ( 0 == threadIdx.y ) {
+      ValueInit::init( m_functor , shared_accum );
+    }
+
+    const WorkRange range( m_policy , blockIdx.x , gridDim.x );
+
+    for ( typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end() ; iwork_base += blockDim.y ) {
+
+      const typename Policy::member_type iwork = iwork_base + threadIdx.y ;
+
+      __syncthreads(); // Don't overwrite previous iteration values until they are used
+
+      ValueInit::init( m_functor , shared_prefix + word_count.value );
+
+      // Copy previous block's accumulation total into thread[0] prefix and inclusive scan value of this block
+      for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) {
+        shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ;
+      }
+
+      if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
+
+      // Call functor to accumulate inclusive scan value for this work item
+      if ( iwork < range.end() ) {
+        this-> template exec_range< WorkTag >( iwork , ValueOps::reference( shared_prefix + word_count.value ) , false );
+      }
+
+      // Scan block values into locations shared_data[1..blockDim.y]
+      cuda_intra_block_reduce_scan<true,FunctorType,WorkTag>( m_functor , typename ValueTraits::pointer_type(shared_data+word_count.value) );
+
+      {
+        size_type * const block_total = shared_data + word_count.value * blockDim.y ;
+        for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i]; }
+      }
+
+      // Call functor with exclusive scan value
+      if ( iwork < range.end() ) {
+        this-> template exec_range< WorkTag >( iwork , ValueOps::reference( shared_prefix ) , true );
+      }
+    }
+  }
+
+public:
+
+  //----------------------------------------
+
+  __device__ inline
+  void operator()(void) const
+  {
+    if ( ! m_final ) {
+      initial();
+    }
+    else {
+      final();
+    }
+  }
+
+  // Determine block size constrained by shared memory:
+  static inline
+  unsigned local_block_size( const FunctorType & f )
+    {
+      // blockDim.y must be a power of two: 128 (4 warps), 256 (8 warps), or 512 (16 warps)
+      // gridDim.x <= blockDim.y * blockDim.y
+      //
+      // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit testing
+
+      unsigned n = CudaTraits::WarpSize * 4 ;
+      while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; }
+      return n ;
+    }
+
+  inline
+  void execute()
+    {
+      const int nwork    = m_policy.end() - m_policy.begin();
+      if ( nwork ) {
+        enum { GridMaxComputeCapability_2x = 0x0ffff };
+
+        const int block_size = local_block_size( m_functor );
+
+        const int grid_max =
+          ( block_size * block_size ) < GridMaxComputeCapability_2x ?
+          ( block_size * block_size ) : GridMaxComputeCapability_2x ;
+
+        // At most 'max_grid' blocks:
+        const int max_grid = std::min( int(grid_max) , int(( nwork + block_size - 1 ) / block_size ));
+
+        // How much work per block:
+        const int work_per_block = ( nwork + max_grid - 1 ) / max_grid ;
+
+        // How many blocks are really needed for this much work:
+        const int grid_x = ( nwork + work_per_block - 1 ) / work_per_block ;
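+        // Illustrative arithmetic (not from the original source): with nwork == 10000
+        // and block_size == 128, grid_max == 16384, max_grid == min( 16384 , 79 ) == 79,
+        // work_per_block == ( 10000 + 78 ) / 79 == 127, and grid_x == ( 10000 + 126 ) / 127 == 79.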
+
+        m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( m_functor ) * grid_x );
+        m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) * 1 );
+
+        const dim3 grid( grid_x , 1 , 1 );
+        const dim3 block( 1 , block_size , 1 ); // REQUIRED DIMENSIONS ( 1 , N , 1 )
+        const int shmem = ValueTraits::value_size( m_functor ) * ( block_size + 2 );
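+        // Note (added for clarity): '( block_size + 2 )' provides one value slot per thread
+        // plus two extra slots used by final() above: the exclusive-scan seed (shared slot 0)
+        // and the running block accumulation ('shared_accum').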
+
+        m_final = false ;
+        CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
+
+        m_final = true ;
+        CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
+      }
+    }
+
+  ParallelScan( const FunctorType  & arg_functor ,
+                const Policy       & arg_policy )
+  : m_functor( arg_functor )
+  , m_policy( arg_policy )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_final( false )
+  { }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+  template< class FunctorType, class ExecPolicy, class ValueType , class Tag = typename ExecPolicy::work_tag>
+  struct CudaFunctorAdapter {
+    const FunctorType f;
+    typedef ValueType value_type;
+    CudaFunctorAdapter(const FunctorType& f_):f(f_) {}
+
+    __device__ inline
+    void operator() (typename ExecPolicy::work_tag, const typename ExecPolicy::member_type& i, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals third argument type of FunctorType::operator()
+      f(typename ExecPolicy::work_tag(), i, val);
+    }
+
+    __device__ inline
+    void operator() (typename ExecPolicy::work_tag, const typename ExecPolicy::member_type& i, const typename ExecPolicy::member_type& j, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals third argument type of FunctorType::operator()
+      f(typename ExecPolicy::work_tag(), i, j, val);
+    }
+
+    __device__ inline
+    void operator() (typename ExecPolicy::work_tag, const typename ExecPolicy::member_type& i, const typename ExecPolicy::member_type& j, const typename ExecPolicy::member_type& k, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals third argument type of FunctorType::operator()
+      f(typename ExecPolicy::work_tag(), i, j, k, val);
+    }
+
+    __device__ inline
+    void operator() (typename ExecPolicy::work_tag, const typename ExecPolicy::member_type& i, const typename ExecPolicy::member_type& j, const typename ExecPolicy::member_type& k, const typename ExecPolicy::member_type& l, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals third argument type of FunctorType::operator()
+      f(typename ExecPolicy::work_tag(), i, j, k, l, val);
+    }
+
+    __device__ inline
+    void operator() (typename ExecPolicy::work_tag, const typename ExecPolicy::member_type& i, const typename ExecPolicy::member_type& j, const typename ExecPolicy::member_type& k, const typename ExecPolicy::member_type& l, const typename ExecPolicy::member_type& m, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals third argument type of FunctorType::operator()
+      f(typename ExecPolicy::work_tag(), i, j, k, l, m, val);
+    }
+
+    __device__ inline
+    void operator() (typename ExecPolicy::work_tag, const typename ExecPolicy::member_type& i, const typename ExecPolicy::member_type& j, const typename ExecPolicy::member_type& k, const typename ExecPolicy::member_type& l, const typename ExecPolicy::member_type& m, const typename ExecPolicy::member_type& n, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals third argument type of FunctorType::operator()
+      f(typename ExecPolicy::work_tag(), i, j, k, l, m, n, val);
+    }
+
+  };
+
+  template< class FunctorType, class ExecPolicy, class ValueType >
+  struct CudaFunctorAdapter<FunctorType,ExecPolicy,ValueType,void> {
+    const FunctorType f;
+    typedef ValueType value_type;
+    CudaFunctorAdapter(const FunctorType& f_):f(f_) {}
+
+    __device__ inline
+    void operator() (const typename ExecPolicy::member_type& i, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i,val);
+    }
+
+    __device__ inline
+    void operator() (const typename ExecPolicy::member_type& i, const typename ExecPolicy::member_type& j, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i,j,val);
+    }
+
+    __device__ inline
+    void operator() (const typename ExecPolicy::member_type& i, const typename ExecPolicy::member_type& j, const typename ExecPolicy::member_type& k, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i,j,k,val);
+    }
+
+    __device__ inline
+    void operator() (const typename ExecPolicy::member_type& i, const typename ExecPolicy::member_type& j, const typename ExecPolicy::member_type& k, const typename ExecPolicy::member_type& l, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i,j,k,l,val);
+    }
+
+    __device__ inline
+    void operator() (const typename ExecPolicy::member_type& i, const typename ExecPolicy::member_type& j, const typename ExecPolicy::member_type& k, const typename ExecPolicy::member_type& l, const typename ExecPolicy::member_type& m, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i,j,k,l,m,val);
+    }
+
+    __device__ inline
+    void operator() (const typename ExecPolicy::member_type& i, const typename ExecPolicy::member_type& j, const typename ExecPolicy::member_type& k, const typename ExecPolicy::member_type& l, const typename ExecPolicy::member_type& m, const typename ExecPolicy::member_type& n, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i,j,k,l,m,n,val);
+    }
+
+
+    __device__ inline
+    void operator() (typename ExecPolicy::member_type& i, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i,val);
+    }
+
+    __device__ inline
+    void operator() (typename ExecPolicy::member_type& i, typename ExecPolicy::member_type& j, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i,j,val);
+    }
+
+    __device__ inline
+    void operator() (typename ExecPolicy::member_type& i, typename ExecPolicy::member_type& j, typename ExecPolicy::member_type& k, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i,j,k,val);
+    }
+
+    __device__ inline
+    void operator() (typename ExecPolicy::member_type& i, typename ExecPolicy::member_type& j, typename ExecPolicy::member_type& k, typename ExecPolicy::member_type& l, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i,j,k,l,val);
+    }
+
+    __device__ inline
+    void operator() (typename ExecPolicy::member_type& i, typename ExecPolicy::member_type& j, typename ExecPolicy::member_type& k, typename ExecPolicy::member_type& l, typename ExecPolicy::member_type& m, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i,j,k,l,m,val);
+    }
+
+    __device__ inline
+    void operator() (typename ExecPolicy::member_type& i, typename ExecPolicy::member_type& j, typename ExecPolicy::member_type& k, typename ExecPolicy::member_type& l, typename ExecPolicy::member_type& m, typename ExecPolicy::member_type& n, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i,j,k,l,m,n,val);
+    }
+
+  };
+
+  template<class FunctorType, class ResultType, class Tag, bool Enable = IsNonTrivialReduceFunctor<FunctorType>::value >
+  struct FunctorReferenceType {
+    typedef ResultType& reference_type;
+  };
+
+  template<class FunctorType, class ResultType, class Tag>
+  struct FunctorReferenceType<FunctorType, ResultType, Tag, true> {
+    typedef typename Kokkos::Impl::FunctorValueTraits< FunctorType ,Tag >::reference_type reference_type;
+  };
+
+  template< class FunctorTypeIn, class ExecPolicy, class ValueType>
+  struct ParallelReduceFunctorType<FunctorTypeIn,ExecPolicy,ValueType,Cuda> {
+
+    enum {FunctorHasValueType = IsNonTrivialReduceFunctor<FunctorTypeIn>::value };
+    typedef typename Kokkos::Impl::if_c<FunctorHasValueType, FunctorTypeIn, Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,ValueType> >::type functor_type;
+    static functor_type functor(const FunctorTypeIn& functor_in) {
+      return Impl::if_c<FunctorHasValueType,FunctorTypeIn,functor_type>::select(functor_in,functor_type(functor_in));
+    }
+  };
+
+}
+
+} // namespace Kokkos
+
+#endif /* defined( __CUDACC__ ) */
+#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */
+
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cd4d6303f6d572d401e11d21ca42425518515476
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -0,0 +1,699 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_REDUCESCAN_HPP
+#define KOKKOS_CUDA_REDUCESCAN_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
+
+#include <utility>
+
+#include <Kokkos_Parallel.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+// Shuffle operations require input to be a register (stack) variable
+
+template< typename T >
+__device__ inline
+void cuda_shfl( T & out , T const & in , int lane ,
+  typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width
+  , unsigned mask = 0xffffffff )
+{
+  *reinterpret_cast<int*>(&out) =
+    KOKKOS_IMPL_CUDA_SHFL_MASK( mask , *reinterpret_cast<int const *>(&in) , lane , width );
+}
+
+template< typename T >
+__device__ inline
+void cuda_shfl( T & out , T const & in , int lane ,
+  typename std::enable_if
+    < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
+    , int >::type width, unsigned mask = 0xffffffff )
+{
+  enum : int { N = sizeof(T) / sizeof(int) };
+
+  for ( int i = 0 ; i < N ; ++i ) {
+    reinterpret_cast<int*>(&out)[i] =
+      KOKKOS_IMPL_CUDA_SHFL_MASK( mask , reinterpret_cast<int const *>(&in)[i] , lane , width );
+  }
+}
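+// Illustrative use (not from the original source): broadcast a multi-word value,
+// e.g. a 16-byte double2, from lane 0 across a full warp:
+//   double2 in = make_double2( 1.0 , 2.0 ) , out ;
+//   cuda_shfl( out , in , 0 , 32 );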
+
+//----------------------------------------------------------------------------
+
+template< typename T >
+__device__ inline
+void cuda_shfl_down( T & out , T const & in , int delta ,
+  typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width , unsigned mask = 0xffffffff )
+{
+  *reinterpret_cast<int*>(&out) =
+    KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK( mask , *reinterpret_cast<int const *>(&in) , delta , width );
+}
+
+template< typename T >
+__device__ inline
+void cuda_shfl_down( T & out , T const & in , int delta ,
+  typename std::enable_if
+    < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
+    , int >::type width , unsigned mask = 0xffffffff )
+{
+  enum : int { N = sizeof(T) / sizeof(int) };
+
+  for ( int i = 0 ; i < N ; ++i ) {
+    reinterpret_cast<int*>(&out)[i] =
+      KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK( mask , reinterpret_cast<int const *>(&in)[i] , delta , width );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename T >
+__device__ inline
+void cuda_shfl_up( T & out , T const & in , int delta ,
+  typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width , unsigned mask = 0xffffffff )
+{
+  *reinterpret_cast<int*>(&out) =
+    KOKKOS_IMPL_CUDA_SHFL_UP_MASK( mask , *reinterpret_cast<int const *>(&in) , delta , width );
+}
+
+template< typename T >
+__device__ inline
+void cuda_shfl_up( T & out , T const & in , int delta ,
+  typename std::enable_if
+    < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
+    , int >::type width , unsigned mask = 0xffffffff )
+{
+  enum : int { N = sizeof(T) / sizeof(int) };
+
+  for ( int i = 0 ; i < N ; ++i ) {
+    reinterpret_cast<int*>(&out)[i] =
+      KOKKOS_IMPL_CUDA_SHFL_UP_MASK( mask , reinterpret_cast<int const *>(&in)[i] , delta , width );
+  }
+}
+
+//----------------------------------------------------------------------------
+/*
+ *  Algorithmic constraints:
+ *   (a) threads with same threadIdx.y have same value
+ *   (b) blockDim.x == power of two
+ *   (c) blockDim.z == 1
+ */
+
+template< class ValueType , class JoinOp>
+__device__
+inline void cuda_intra_warp_reduction( ValueType& result,
+                                       const JoinOp& join,
+                                       const uint32_t max_active_thread = blockDim.y) {
+
+  unsigned int shift = 1;
+
+  //Reduce over values from threads with different threadIdx.y
+  while(blockDim.x * shift < 32 ) {
+    const ValueType tmp = shfl_down(result, blockDim.x*shift,32u);
+    //Only join if the upper thread is active (this allows a non-power-of-two blockDim.y)
+    if(threadIdx.y + shift < max_active_thread)
+      join(result , tmp);
+    shift*=2;
+  }
+
+  result = shfl(result,0,32);
+}
+
+template< class ValueType , class JoinOp>
+__device__
+inline void cuda_inter_warp_reduction( ValueType& value,
+                                       const JoinOp& join,
+                                       const int max_active_thread = blockDim.y) {
+
+  #define STEP_WIDTH 4
+  // Depending on the ValueType, __shared__ memory must be aligned to 8-byte boundaries.
+  // The reason not to use ValueType directly is that, for types with constructors, it
+  // could lead to race conditions.
+  __shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
+  ValueType* result = (ValueType*) & sh_result;
+  const int step = 32 / blockDim.x;
+  int shift = STEP_WIDTH;
+  const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
+  if(id < STEP_WIDTH ) {
+    result[id] = value;
+  }
+  __syncthreads();
+  while (shift<=max_active_thread/step) {
+    if(shift<=id && shift+STEP_WIDTH>id && threadIdx.x==0) {
+      join(result[id%STEP_WIDTH],value);
+    }
+    __syncthreads();
+    shift+=STEP_WIDTH;
+  }
+
+
+  value = result[0];
+  for(int i = 1; (i*step<max_active_thread) && i<STEP_WIDTH; i++)
+    join(value,result[i]);
+}
+
+template< class ValueType , class JoinOp>
+__device__
+inline void cuda_intra_block_reduction( ValueType& value,
+                                        const JoinOp& join,
+                                        const int max_active_thread = blockDim.y) {
+  cuda_intra_warp_reduction(value,join,max_active_thread);
+  cuda_inter_warp_reduction(value,join,max_active_thread);
+}
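+// Illustrative use (not from the original source): a block-wide sum where every thread
+// holds a partial result in 'my_val' and 'join' is a functor (or extended lambda)
+// implementing 'dst += src':
+//   cuda_intra_block_reduction( my_val , join , blockDim.y );
+//   // afterwards every thread's 'my_val' holds the block-wide sum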
+
+template< class FunctorType , class JoinOp , class ArgTag = void >
+__device__
+bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgTag >::reference_type  value,
+                                 typename FunctorValueTraits< FunctorType , ArgTag >::reference_type  neutral,
+                                 const JoinOp& join,
+                                 Cuda::size_type * const m_scratch_space,
+                                 typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type const result,
+                                 Cuda::size_type * const m_scratch_flags,
+                                 const int max_active_thread = blockDim.y) {
+#ifdef __CUDA_ARCH__
+  typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type;
+  typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type;
+
+  //Do the intra-block reduction with shfl operations and static shared memory
+  cuda_intra_block_reduction(value,join,max_active_thread);
+
+  const int id = threadIdx.y*blockDim.x + threadIdx.x;
+
+  //One thread in the block writes the block result to global scratch memory
+  if(id == 0 ) {
+    pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x;
+    *global = value;
+  }
+
+  //One warp of the last block performs the inter-block reduction by loading the block values from global scratch memory
+  bool last_block = false;
+
+  __syncthreads();
+  if ( id < 32 ) {
+    Cuda::size_type count;
+
+    //Figure out whether this is the last block
+    if(id == 0)
+      count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
+    count = Kokkos::shfl(count,0,32);
+
+    //Last block does the inter block reduction
+    if( count == gridDim.x - 1) {
+      //set flag back to zero
+      if(id == 0)
+        *m_scratch_flags = 0;
+      last_block = true;
+      value = neutral;
+
+      pointer_type const volatile global = (pointer_type) m_scratch_space ;
+
+      //Reduce all global values, splitting the work over the threads of one warp
+      const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32;
+      for(int i=id; i<(int)gridDim.x; i+=step_size) {
+        value_type tmp = global[i];
+        join(value, tmp);
+      }
+
+      //Perform shfl reductions within the warp; only join if the contribution is valid (allows gridDim.x to be non-power-of-two and < 32)
+      if (int(blockDim.x*blockDim.y) > 1) {
+        value_type tmp = Kokkos::shfl_down(value, 1,32);
+        if( id + 1 < int(gridDim.x) )
+          join(value, tmp);
+      }
+      int active = KOKKOS_IMPL_CUDA_BALLOT(1);
+      if (int(blockDim.x*blockDim.y) > 2) {
+        value_type tmp = Kokkos::shfl_down(value, 2,32);
+        if( id + 2 < int(gridDim.x) )
+          join(value, tmp);
+      }
+      active += KOKKOS_IMPL_CUDA_BALLOT(1);
+      if (int(blockDim.x*blockDim.y) > 4) {
+        value_type tmp = Kokkos::shfl_down(value, 4,32);
+        if( id + 4 < int(gridDim.x) )
+          join(value, tmp);
+      }
+      active += KOKKOS_IMPL_CUDA_BALLOT(1);
+      if (int(blockDim.x*blockDim.y) > 8) {
+        value_type tmp = Kokkos::shfl_down(value, 8,32);
+        if( id + 8 < int(gridDim.x) )
+          join(value, tmp);
+      }
+      active += KOKKOS_IMPL_CUDA_BALLOT(1);
+      if (int(blockDim.x*blockDim.y) > 16) {
+        value_type tmp = Kokkos::shfl_down(value, 16,32);
+        if( id + 16 < int(gridDim.x) )
+          join(value, tmp);
+      }
+      active += KOKKOS_IMPL_CUDA_BALLOT(1);
+    }
+  }
+  //Thread 0 of the last block holds the global reduction value in "value"
+  return last_block;
+#else
+  return true;
+#endif
+}
+
+template< class ReducerType >
+__device__ inline
+typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
+cuda_intra_warp_reduction( const ReducerType& reducer,
+                           const uint32_t max_active_thread = blockDim.y) {
+
+  typedef typename ReducerType::value_type ValueType;
+
+  unsigned int shift = 1;
+
+  ValueType result = reducer.reference();
+  //Reduce over values from threads with different threadIdx.y
+  while(blockDim.x * shift < 32 ) {
+    const ValueType tmp = shfl_down(result, blockDim.x*shift,32u);
+    //Only join if the upper thread is active (this allows a non-power-of-two blockDim.y)
+    if(threadIdx.y + shift < max_active_thread)
+      reducer.join(result , tmp);
+    shift*=2;
+  }
+
+  result = shfl(result,0,32);
+  reducer.reference() = result;
+}
+
+template< class ReducerType >
+__device__ inline
+typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
+cuda_inter_warp_reduction( const ReducerType& reducer,
+                           const int max_active_thread = blockDim.y) {
+
+  typedef typename ReducerType::value_type ValueType;
+
+  #define STEP_WIDTH 4
+  // Depending on the ValueType, __shared__ memory must be aligned to 8-byte boundaries.
+  // The reason not to use ValueType directly is that, for types with constructors, it
+  // could lead to race conditions.
+  __shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
+  ValueType* result = (ValueType*) & sh_result;
+  ValueType value = reducer.reference();
+  const int step = 32 / blockDim.x;
+  int shift = STEP_WIDTH;
+  const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
+  if(id < STEP_WIDTH ) {
+    result[id] = value;
+  }
+  __syncthreads();
+  while (shift<=max_active_thread/step) {
+    if(shift<=id && shift+STEP_WIDTH>id && threadIdx.x==0) {
+      reducer.join(result[id%STEP_WIDTH],value);
+    }
+    __syncthreads();
+    shift+=STEP_WIDTH;
+  }
+
+
+  value = result[0];
+  for(int i = 1; (i*step<max_active_thread) && i<STEP_WIDTH; i++)
+    reducer.join(value,result[i]);
+
+  reducer.reference() = value;
+}
+
+template< class ReducerType >
+__device__ inline
+typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
+cuda_intra_block_reduction( const ReducerType& reducer,
+                            const int max_active_thread = blockDim.y) {
+  cuda_intra_warp_reduction(reducer,max_active_thread);
+  cuda_inter_warp_reduction(reducer,max_active_thread);
+}
+
+template< class ReducerType>
+__device__ inline
+typename std::enable_if< Kokkos::is_reducer<ReducerType>::value , bool >::type
+cuda_inter_block_reduction( const ReducerType& reducer,
+                            Cuda::size_type * const m_scratch_space,
+                            Cuda::size_type * const m_scratch_flags,
+                            const int max_active_thread = blockDim.y) {
+#ifdef __CUDA_ARCH__
+  typedef typename ReducerType::value_type* pointer_type;
+  typedef typename ReducerType::value_type value_type;
+
+  //Do the intra-block reduction with shfl operations and static shared memory
+  cuda_intra_block_reduction(reducer,max_active_thread);
+
+  value_type value = reducer.reference();
+
+  const int id = threadIdx.y*blockDim.x + threadIdx.x;
+
+  //One thread in the block writes the block result to global scratch memory
+  if(id == 0 ) {
+    pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x;
+    *global = value;
+  }
+
+  //One warp of the last block performs the inter-block reduction by loading the block values from global scratch memory
+  bool last_block = false;
+
+  __syncthreads();
+  if ( id < 32 ) {
+    Cuda::size_type count;
+
+    //Figure out whether this is the last block
+    if(id == 0)
+      count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
+    count = Kokkos::shfl(count,0,32);
+
+    //Last block does the inter block reduction
+    if( count == gridDim.x - 1) {
+      //set flag back to zero
+      if(id == 0)
+        *m_scratch_flags = 0;
+      last_block = true;
+      reducer.init(value);
+
+      pointer_type const volatile global = (pointer_type) m_scratch_space ;
+
+      //Reduce all global values, splitting the work over the threads of one warp
+      const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32;
+      for(int i=id; i<(int)gridDim.x; i+=step_size) {
+        value_type tmp = global[i];
+        reducer.join(value, tmp);
+      }
+
+      //Perform shfl reductions within the warp; only join if the contribution is valid (allows gridDim.x to be non-power-of-two and < 32)
+      if (int(blockDim.x*blockDim.y) > 1) {
+        value_type tmp = Kokkos::shfl_down(value, 1,32);
+        if( id + 1 < int(gridDim.x) )
+          reducer.join(value, tmp);
+      }
+      int active = KOKKOS_IMPL_CUDA_BALLOT(1);
+      if (int(blockDim.x*blockDim.y) > 2) {
+        value_type tmp = Kokkos::shfl_down(value, 2,32);
+        if( id + 2 < int(gridDim.x) )
+          reducer.join(value, tmp);
+      }
+      active += KOKKOS_IMPL_CUDA_BALLOT(1);
+      if (int(blockDim.x*blockDim.y) > 4) {
+        value_type tmp = Kokkos::shfl_down(value, 4,32);
+        if( id + 4 < int(gridDim.x) )
+          reducer.join(value, tmp);
+      }
+      active += KOKKOS_IMPL_CUDA_BALLOT(1);
+      if (int(blockDim.x*blockDim.y) > 8) {
+        value_type tmp = Kokkos::shfl_down(value, 8,32);
+        if( id + 8 < int(gridDim.x) )
+          reducer.join(value, tmp);
+      }
+      active += KOKKOS_IMPL_CUDA_BALLOT(1);
+      if (int(blockDim.x*blockDim.y) > 16) {
+        value_type tmp = Kokkos::shfl_down(value, 16,32);
+        if( id + 16 < int(gridDim.x) )
+          reducer.join(value, tmp);
+      }
+      active += KOKKOS_IMPL_CUDA_BALLOT(1);
+    }
+  }
+
+  //Thread 0 of the last block holds the global reduction value in "value"
+  return last_block;
+#else
+  return true;
+#endif
+}
+
+//----------------------------------------------------------------------------
+// See section B.17 of Cuda C Programming Guide Version 3.2
+// for discussion of
+//   __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
+// function qualifier which could be used to improve performance.
+//----------------------------------------------------------------------------
+// Maximize shared memory and minimize L1 cache:
+//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
+// For 2.0 capability: 48 KB shared and 16 KB L1
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/*
+ *  Algorithmic constraints:
+ *   (a) blockDim.y is a power of two
+ *   (b) blockDim.y <= 512
+ *   (c) blockDim.x == blockDim.z == 1
+ */
+
+template< bool DoScan , class FunctorType , class ArgTag >
+__device__
+void cuda_intra_block_reduce_scan( const FunctorType & functor ,
+                                   const typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type base_data )
+{
+  typedef FunctorValueTraits< FunctorType , ArgTag >  ValueTraits ;
+  typedef FunctorValueJoin<   FunctorType , ArgTag >  ValueJoin ;
+
+  typedef typename ValueTraits::pointer_type  pointer_type ;
+
+  const unsigned value_count   = ValueTraits::value_count( functor );
+  const unsigned BlockSizeMask = blockDim.y - 1 ;
+
+  // Must have power of two thread count
+
+  if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_intra_block_scan requires power-of-two blockDim"); }
+
+#define BLOCK_REDUCE_STEP( R , TD , S )  \
+  if ( ! ( R & ((1<<(S+1))-1) ) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S)) ); }
+
+#define BLOCK_SCAN_STEP( TD , N , S )  \
+  if ( N == (1<<S) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S))); }
+
+  const unsigned     rtid_intra = threadIdx.y ^ BlockSizeMask ;
+  const pointer_type tdata_intra = base_data + value_count * threadIdx.y ;
+
+  { // Intra-warp reduction:
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,0)
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,1)
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,2)
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,3)
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,4)
+    KOKKOS_IMPL_CUDA_SYNCWARP;
+ }
+
+  __syncthreads(); // Wait for all warps to reduce
+
+  { // Inter-warp reduce-scan by a single warp to avoid extra synchronizations
+    const unsigned rtid_inter = ( threadIdx.y ^ BlockSizeMask ) << CudaTraits::WarpIndexShift ;
+
+    if ( rtid_inter < blockDim.y ) {
+
+      const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask );
+
+      if ( (1<<5) < BlockSizeMask ) {                        BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
+      if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
+      if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
+      if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
+
+      if ( DoScan ) {
+
+        int n = ( rtid_inter &  32 ) ?  32 : (
+                ( rtid_inter &  64 ) ?  64 : (
+                ( rtid_inter & 128 ) ? 128 : (
+                ( rtid_inter & 256 ) ? 256 : 0 )));
+
+        if ( ! ( rtid_inter + n < blockDim.y ) ) n = 0 ;
+
+        __threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,8)
+        __threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,7)
+        __threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,6)
+        __threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,5)
+      }
+    }
+  }
+
+  __syncthreads(); // Wait for inter-warp reduce-scan to complete
+
+  if ( DoScan ) {
+    int n = ( rtid_intra &  1 ) ?  1 : (
+            ( rtid_intra &  2 ) ?  2 : (
+            ( rtid_intra &  4 ) ?  4 : (
+            ( rtid_intra &  8 ) ?  8 : (
+            ( rtid_intra & 16 ) ? 16 : 0 ))));
+
+    if ( ! ( rtid_intra + n < blockDim.y ) ) n = 0 ;
+    #ifdef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
+    BLOCK_SCAN_STEP(tdata_intra,n,4) __syncthreads();//__threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,3) __syncthreads();//__threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,2) __syncthreads();//__threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,1) __syncthreads();//__threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,0) __syncthreads();
+    #else
+    BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,0) __threadfence_block();
+    #endif
+  }
+
+#undef BLOCK_SCAN_STEP
+#undef BLOCK_REDUCE_STEP
+}
+
+//----------------------------------------------------------------------------
+/**\brief  Input value-per-thread starting at 'shared_data'.
+ *         Reduction value at last thread's location.
+ *
+ *  If 'DoScan' then write blocks' scan values and block-groups' scan values.
+ *
+ *  Global reduce result is in the last threads' 'shared_data' location.
+ */
+template< bool DoScan , class FunctorType , class ArgTag >
+__device__
+bool cuda_single_inter_block_reduce_scan( const FunctorType     & functor ,
+                                          const Cuda::size_type   block_id ,
+                                          const Cuda::size_type   block_count ,
+                                          Cuda::size_type * const shared_data ,
+                                          Cuda::size_type * const global_data ,
+                                          Cuda::size_type * const global_flags )
+{
+  typedef Cuda::size_type                  size_type ;
+  typedef FunctorValueTraits< FunctorType , ArgTag >  ValueTraits ;
+  typedef FunctorValueJoin<   FunctorType , ArgTag >  ValueJoin ;
+  typedef FunctorValueInit<   FunctorType , ArgTag >  ValueInit ;
+  typedef FunctorValueOps<    FunctorType , ArgTag >  ValueOps ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  //typedef typename ValueTraits::reference_type  reference_type ;
+
+  // '__ffs' = position of the least significant bit set to 1.
+  // 'blockDim.y' is guaranteed to be a power of two so this
+  // is the integral shift value that can replace an integral divide.
+  const unsigned BlockSizeShift = __ffs( blockDim.y ) - 1 ;
+  const unsigned BlockSizeMask  = blockDim.y - 1 ;
+
+  // Must have power of two thread count
+  if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_single_inter_block_reduce_scan requires power-of-two blockDim"); }
+
+  const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+    word_count( ValueTraits::value_size( functor ) / sizeof(size_type) );
+
+  // Reduce the accumulation for the entire block.
+  cuda_intra_block_reduce_scan<false,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
+
+  {
+    // Write accumulation total to global scratch space.
+    // Accumulation total is the last thread's data.
+    size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
+    size_type * const global = global_data + word_count.value * block_id ;
+
+//#if (__CUDA_ARCH__ < 500)
+    for ( int i = int(threadIdx.y) ; i < int(word_count.value) ; i += int(blockDim.y) ) { global[i] = shared[i] ; }
+//#else
+//    for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
+//#endif
+
+  }
+
+  // Contributing blocks note that their contribution has been completed via an atomic-increment flag
+  // If this block is not the last block to contribute to this group then the block is done.
+  const bool is_last_block =
+    ! __syncthreads_or( threadIdx.y ? 0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) );
+
+  if ( is_last_block ) {
+
+    const size_type b = ( long(block_count) * long(threadIdx.y) ) >> BlockSizeShift ;
+    const size_type e = ( long(block_count) * long( threadIdx.y + 1 ) ) >> BlockSizeShift ;
+
+    {
+      void * const shared_ptr = shared_data + word_count.value * threadIdx.y ;
+      /* reference_type shared_value = */ ValueInit::init( functor , shared_ptr );
+
+      for ( size_type i = b ; i < e ; ++i ) {
+        ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );
+      }
+    }
+
+    cuda_intra_block_reduce_scan<DoScan,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
+
+    if ( DoScan ) {
+
+      size_type * const shared_value = shared_data + word_count.value * ( threadIdx.y ? threadIdx.y - 1 : blockDim.y );
+
+      if ( ! threadIdx.y ) { ValueInit::init( functor , shared_value ); }
+
+      // Join previous inclusive scan value to each member
+      for ( size_type i = b ; i < e ; ++i ) {
+        size_type * const global_value = global_data + word_count.value * i ;
+        ValueJoin::join( functor , shared_value , global_value );
+        ValueOps ::copy( functor , global_value , shared_value );
+      }
+    }
+  }
+
+  return is_last_block ;
+}
+
+// Size in bytes required for inter block reduce or scan
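+// Illustrative arithmetic (not from the original source): for a single 'double' value
+// (value_size == 8 bytes) and BlockSize == 256 this is ( 256 + 2 ) * 8 == 2064 bytes
+// of shared memory.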
+template< bool DoScan , class FunctorType , class ArgTag >
+inline
+unsigned cuda_single_inter_block_reduce_scan_shmem( const FunctorType & functor , const unsigned BlockSize )
+{
+  return ( BlockSize + 2 ) * Impl::FunctorValueTraits< FunctorType , ArgTag >::value_size( functor );
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( __CUDACC__ ) */
+#endif /* KOKKOS_CUDA_REDUCESCAN_HPP */
+
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ee949583f1363a6f8dd03dd636c9da62cf98e3d6
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
@@ -0,0 +1,252 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG )
+
+#include <Kokkos_Core.hpp>
+
+#include <impl/Kokkos_TaskQueue_impl.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template class TaskQueue< Kokkos::Cuda > ;
+
+//----------------------------------------------------------------------------
+
+__device__
+void TaskQueueSpecialization< Kokkos::Cuda >::driver
+  ( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue 
+  , int32_t shmem_per_warp )
+{
+  using Member = TaskExec< Kokkos::Cuda > ;
+  using Queue  = TaskQueue< Kokkos::Cuda > ;
+  using task_root_type = TaskBase< void , void , void > ;
+
+  extern __shared__ int32_t shmem_all[];
+
+  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+  int32_t * const warp_shmem =
+    shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t);
+
+  task_root_type * const task_shmem = (task_root_type *) warp_shmem ;
+
+  const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
+
+  Member single_exec( warp_shmem , 1 );
+  Member team_exec( warp_shmem , blockDim.y );
+
+  task_root_type * task_ptr ;
+
+  // Loop until all queues are empty and no tasks in flight
+
+  do {
+
+    // Each team lead attempts to acquire either a thread team task
+    // or collection of single thread tasks for the team.
+
+    if ( 0 == warp_lane ) {
+
+      task_ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+
+      // Loop by priority and then type
+      for ( int i = 0 ; i < Queue::NumQueue && end == task_ptr ; ++i ) {
+        for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) {
+          task_ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
+        }
+      }
+
+#if 0
+printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
+      , uintptr_t(task_ptr));
+#endif
+
+    }
+
+    // Synchronize warp with memory fence before broadcasting task pointer:
+
+    // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" );
+    KOKKOS_IMPL_CUDA_SYNCWARP ;
+
+    // Broadcast task pointer:
+
+    ((int*) & task_ptr )[0] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[0] , 0 , 32 );
+    ((int*) & task_ptr )[1] = KOKKOS_IMPL_CUDA_SHFL( ((int*) & task_ptr )[1] , 0 , 32 );
+
+#if defined( KOKKOS_DEBUG )
+    KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "TaskQueue CUDA task_ptr" );
+#endif
+
+    if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count
+
+    if ( end != task_ptr ) {
+
+      // The whole warp copies the task's closure to/from shared memory.
+      // Use all threads of the warp for coalesced read/write.
+
+      int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
+      int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t);
+
+      int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ;
+
+      // copy task closure from global to shared memory:
+
+      for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
+        warp_shmem[i] = task_mem[i] ;
+      }
+
+      // Synchronize threads of the warp and ensure memory
+      // writes are visible to all threads in the warp.
+
+      // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" );
+      KOKKOS_IMPL_CUDA_SYNCWARP ;
+
+      if ( task_root_type::TaskTeam == task_shmem->m_task_type ) {
+        // Thread Team Task
+        (*task_shmem->m_apply)( task_shmem , & team_exec );
+      }
+      else if ( 0 == threadIdx.y ) {
+        // Single Thread Task
+        (*task_shmem->m_apply)( task_shmem , & single_exec );
+      }
+
+      // Synchronize threads of the warp and ensure memory
+      // writes are visible to all threads in the warp.
+
+      // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" );
+      KOKKOS_IMPL_CUDA_SYNCWARP ;
+
+      // copy task closure from shared to global memory:
+
+      for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
+        task_mem[i] = warp_shmem[i] ;
+      }
+
+      // Synchronize threads of the warp and ensure memory
+      // writes are visible to the root thread of the warp for
+      // respawn or completion.
+
+      // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" );
+      KOKKOS_IMPL_CUDA_SYNCWARP ;
+
+      // If respawn requested copy respawn data back to main memory
+
+      if ( 0 == warp_lane ) {
+
+        if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) {
+          ( (volatile task_root_type *) task_ptr )->m_next = task_shmem->m_next ;
+          ( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ;
+        }
+
+        queue->complete( task_ptr );
+      }
+    }
+  } while(1);
+}
+
+namespace {
+
+__global__
+void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue 
+                            , int32_t shmem_size )
+{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue , shmem_size ); }
+
+}
+
+void TaskQueueSpecialization< Kokkos::Cuda >::execute
+  ( TaskQueue< Kokkos::Cuda > * const queue )
+{
+  const int shared_per_warp = 2048 ;
+  const int warps_per_block = 4 ;
+  const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
+  const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
+  const int shared_total = shared_per_warp * warps_per_block ;
+  const cudaStream_t stream = 0 ;
+
+  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+#if 0
+printf("cuda_task_queue_execute before\n");
+#endif
+
+  // Query the stack size, in bytes:
+
+  size_t previous_stack_size = 0 ;
+  CUDA_SAFE_CALL( cudaDeviceGetLimit( & previous_stack_size , cudaLimitStackSize ) );
+
+  // If not large enough then set the stack size, in bytes:
+
+  const size_t larger_stack_size = 2048 ;
+
+  if ( previous_stack_size < larger_stack_size ) {
+    CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , larger_stack_size ) );
+  }
+
+  cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( queue , shared_per_warp );
+
+  CUDA_SAFE_CALL( cudaGetLastError() );
+
+  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+  if ( previous_stack_size < larger_stack_size ) {
+    CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , previous_stack_size ) );
+  }
+
+#if 0
+printf("cuda_task_queue_execute after\n");
+#endif
+
+}
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+#else
+void KOKKOS_CORE_SRC_CUDA_KOKKOS_CUDA_TASK_PREVENT_LINK_ERROR() {}
+#endif /* #if defined( KOKKOS_ENABLE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG ) */
+
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a8b96ea6fe0fe551d98d6eb2ba43fa011c3abf83
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -0,0 +1,761 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
+#define KOKKOS_IMPL_CUDA_TASK_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_TASKDAG )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+template< typename TaskType >
+__global__
+void set_cuda_task_base_apply_function_pointer
+  ( TaskBase<void,void,void>::function_type * ptr )
+{ *ptr = TaskType::apply ; }
+
+}
+
+template< class > class TaskExec ;
+
+template<>
+class TaskQueueSpecialization< Kokkos::Cuda >
+{
+public:
+
+  using execution_space = Kokkos::Cuda ;
+  using memory_space    = Kokkos::CudaUVMSpace ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using member_type     = TaskExec< Kokkos::Cuda > ;
+
+  static
+  void iff_single_thread_recursive_execute( queue_type * const ) {}
+
+  __device__
+  static void driver( queue_type * const , int32_t );
+
+  static
+  void execute( queue_type * const );
+
+  template< typename TaskType >
+  static
+  typename TaskType::function_type
+  get_function_pointer()
+    {
+      using function_type = typename TaskType::function_type ;
+
+      function_type * const ptr =
+        (function_type*) cuda_internal_scratch_unified( sizeof(function_type) );
+
+      CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+      set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr);
+
+      CUDA_SAFE_CALL( cudaGetLastError() );
+      CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+      return *ptr ;
+    }
+};
+
+extern template class TaskQueue< Kokkos::Cuda > ;
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/**\brief  Impl::TaskExec<Cuda> is the TaskScheduler<Cuda>::member_type
+ *         passed to tasks running in a Cuda space.
+ *
+ *  Cuda thread blocks for tasking are dimensioned:
+ *    blockDim.x == vector length
+ *    blockDim.y == team size
+ *    blockDim.z == number of teams
+ *  where
+ *    blockDim.x * blockDim.y == WarpSize
+ *
+ *  Current implementation requires blockDim.x == 1.
+ *  Vector-level parallelism with blockDim.x > 1 on Volta will
+ *  require a vector-level synchronization mask for vector-level
+ *  collective operations.
+ *
+ *  Both single thread and thread team tasks are run by a full Cuda warp.
+ *  A single thread task is called by warp lane #0 and the remaining
+ *  lanes of the warp are idle.
+ *
+ *  When executing a single thread task the syncwarp or other
+ *  warp synchronizing functions must not be called.
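+ *
+ *  Illustrative dimensions (not from the original source): with WarpSize == 32 and
+ *  blockDim.x == 1, each team is blockDim.y == 32 lanes and blockDim.z is the
+ *  number of warps (teams) per block, matching the ( 1 , WarpSize , warps_per_block )
+ *  launch in TaskQueueSpecialization< Kokkos::Cuda >::execute.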
+ */
+template<>
+class TaskExec< Kokkos::Cuda >
+{
+private:
+
+  enum : int { WarpSize = Kokkos::Impl::CudaTraits::WarpSize };
+
+  TaskExec( TaskExec && ) = delete ;
+  TaskExec( TaskExec const & ) = delete ;
+  TaskExec & operator = ( TaskExec && ) = delete ;
+  TaskExec & operator = ( TaskExec const & ) = delete ;
+
+  friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
+  friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
+
+  int32_t * m_team_shmem ;
+  const int m_team_size ;
+
+  // If constructed with arg_team_size == 1 the object
+  // can only be used by the thread with threadIdx.y == 0.
+  __device__
+  TaskExec( int32_t * arg_team_shmem , int arg_team_size = blockDim.y )
+    : m_team_shmem( arg_team_shmem )
+    , m_team_size( arg_team_size ) {}
+
+public:
+
+#if defined( __CUDA_ARCH__ )
+  __device__ int  team_rank() const { return threadIdx.y ; }
+  __device__ int  team_size() const { return m_team_size ; }
+
+  __device__ void team_barrier() const
+    {
+      if ( 1 < m_team_size ) {
+        KOKKOS_IMPL_CUDA_SYNCWARP ;
+      }
+    }
+
+  template< class ValueType >
+  __device__ void team_broadcast( ValueType & val , const int thread_id ) const
+    {
+      if ( 1 < m_team_size ) {
+        // WarpSize = blockDim.x * blockDim.y
+        // thread_id < blockDim.y
+        ValueType tmp( val ); // input might not be register variable
+        cuda_shfl( val, tmp, blockDim.x * thread_id, WarpSize );
+      }
+    }
+
+#else
+  __host__ int  team_rank() const { return 0 ; }
+  __host__ int  team_size() const { return 0 ; }
+  __host__ void team_barrier() const {}
+  template< class ValueType >
+  __host__ void team_broadcast( ValueType & , const int ) const {}
+#endif
+
+};
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<typename iType>
+struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
+{
+  typedef iType index_type;
+  const iType start ;
+  const iType end ;
+  const iType increment ;
+  const TaskExec< Kokkos::Cuda > & thread;
+
+#if defined( __CUDA_ARCH__ )
+
+  __device__ inline
+  TeamThreadRangeBoundariesStruct
+    ( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count)
+    : start( threadIdx.y )
+    , end(arg_count)
+    , increment( blockDim.y )
+    , thread(arg_thread)
+    {}
+
+  __device__ inline
+  TeamThreadRangeBoundariesStruct
+    ( const TaskExec< Kokkos::Cuda > & arg_thread
+    , const iType & arg_start
+    , const iType & arg_end
+    )
+    : start( arg_start + threadIdx.y )
+    , end(   arg_end)
+    , increment( blockDim.y )
+    , thread( arg_thread )
+    {}
+
+#else
+
+  TeamThreadRangeBoundariesStruct
+    ( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count);
+
+  TeamThreadRangeBoundariesStruct
+    ( const TaskExec< Kokkos::Cuda > & arg_thread
+    , const iType & arg_start
+    , const iType & arg_end
+    );
+
+#endif
+
+};
+
+//----------------------------------------------------------------------------
+
+template<typename iType>
+struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
+{
+  typedef iType index_type;
+  const iType start ;
+  const iType end ;
+  const iType increment ;
+  const TaskExec< Kokkos::Cuda > & thread;
+
+#if defined( __CUDA_ARCH__ )
+
+  __device__ inline
+  ThreadVectorRangeBoundariesStruct
+    ( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count)
+    : start( threadIdx.x )
+    , end(arg_count)
+    , increment( blockDim.x )
+    , thread(arg_thread)
+    {}
+
+#else
+
+  ThreadVectorRangeBoundariesStruct
+    ( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count);
+
+#endif
+
+};
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >
+TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & count )
+{
+  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >( thread, count );
+}
+
+template<typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct
+  < typename std::common_type<iType1,iType2>::type
+  , Impl::TaskExec< Kokkos::Cuda > >
+TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
+               , const iType1 & begin, const iType2 & end )
+{
+  typedef typename std::common_type< iType1, iType2 >::type iType;
+  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Cuda > >(
+           thread, iType(begin), iType(end) );
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
+ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
+               , const iType & count )
+{
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
+PerTeam(const Impl::TaskExec< Kokkos::Cuda >& thread)
+{
+  return Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >
+PerThread(const Impl::TaskExec< Kokkos::Cuda >& thread)
+{
+  return Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >(thread);
+}
+
+/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team.
+ * This functionality requires C++11 support.
+ */
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >& loop_boundaries
+  , const Lambda& lambda
+  )
+{
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i);
+  }
+}
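+
+// Usage sketch (illustrative only): the task-team parallel_for above is
+// typically invoked from a task's operator(), where 'member' is the team
+// handle passed in by the scheduler and 'N' is a hypothetical count.
+//
+//   Kokkos::parallel_for( Kokkos::TeamThreadRange( member, N ),
+//     [&]( const int i ) { /* work item i, spread over the task's team */ } );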
+
+template< typename iType, class Lambda >
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda) {
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i);
+  }
+}
+
+// reduce across corresponding lanes between team members within warp
+// assume stride*team_size == warp_size
+template< typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void strided_shfl_warp_reduction
+  (const JoinType& join,
+   ValueType& val,
+   int team_size,
+   int stride)
+{
+  for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
+    join(val, Kokkos::shfl_down(val, lane_delta, team_size*stride));
+  }
+}
+
+// multiple within-warp non-strided reductions
+template< typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void multi_shfl_warp_reduction
+  (const JoinType& join,
+   ValueType& val,
+   int vec_length)
+{
+  for (int lane_delta=vec_length>>1; lane_delta; lane_delta>>=1) {
+    join(val, Kokkos::shfl_down(val, lane_delta, vec_length));
+  }
+}
+
+// broadcast within warp
+template< class ValueType >
+KOKKOS_INLINE_FUNCTION
+ValueType shfl_warp_broadcast
+  (ValueType& val,
+   int src_lane,
+   int width)
+{
+  if ( 1 < width ) {
+    return Kokkos::shfl(val, src_lane, width);
+  }
+  else {
+    return val ;
+  }
+}
+
+/*// all-reduce across corresponding vector lanes between team members within warp
+// assume vec_length*team_size == warp_size
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType& join,
+   ValueType& initialized_result) {
+
+  ValueType result = initialized_result;
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+  initialized_result = result;
+
+  strided_shfl_warp_reduction<ValueType, JoinType>(
+                          join,
+                          initialized_result,
+                          loop_boundaries.thread.team_size(),
+                          blockDim.x);
+  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
+}*/
+
+// all-reduce across corresponding vector lanes between team members within warp
+// if no join() provided, use sum
+// assume vec_length*team_size == warp_size
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& initialized_result) {
+
+  //TODO what is the point of creating this temporary?
+  ValueType result = initialized_result;
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+  initialized_result = result;
+
+  if ( 1 < loop_boundaries.thread.team_size() ) {
+
+    strided_shfl_warp_reduction(
+      [&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
+      initialized_result,
+      loop_boundaries.thread.team_size(),
+      blockDim.x);
+
+    initialized_result =
+      shfl_warp_broadcast<ValueType>(
+        initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
+  }
+}
+
+template< typename iType, class Lambda, typename ReducerType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   const ReducerType& reducer) {
+
+  typedef typename ReducerType::value_type ValueType;
+  //TODO what is the point of creating this temporary?
+  ValueType result = ValueType();
+  reducer.init(result);
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+
+  if ( 1 < loop_boundaries.thread.team_size() ) {
+    strided_shfl_warp_reduction(
+      [&] (ValueType& val1, const ValueType& val2) { reducer.join(val1,val2); },
+      result,
+      loop_boundaries.thread.team_size(),
+      blockDim.x);
+
+    reducer.reference() =
+      shfl_warp_broadcast<ValueType>(
+        result, threadIdx.x, Impl::CudaTraits::WarpSize );
+  }
+  else {
+    reducer.reference() = result ;
+  }
+}
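+
+// Usage sketch (illustrative only): calling the reducer overload above from a
+// task; 'vals' is a hypothetical rank-1 View and 'N' a hypothetical extent.
+// The Sum reducer is the same one used elsewhere in these headers.
+//
+//   double team_sum = 0 ;
+//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member, N ),
+//     [&]( const int i, double & partial ) { partial += vals(i); },
+//     Kokkos::Experimental::Sum<double>( team_sum ) );
+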
+// all-reduce within team members within warp
+// assume vec_length*team_size == warp_size
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+/*template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType& join,
+   ValueType& initialized_result) {
+
+  ValueType result = initialized_result;
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+  initialized_result = result;
+
+  multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result, blockDim.x);
+  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
+}*/
+
+// all-reduce within team members within warp
+// if no join() provided, use sum
+// assume vec_length*team_size == warp_size
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& initialized_result) {
+
+  ValueType result = initialized_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+
+  initialized_result = result;
+
+  if ( 1 < loop_boundaries.thread.team_size() ) {
+    //initialized_result = multi_shfl_warp_reduction(
+    multi_shfl_warp_reduction(
+      [&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
+      initialized_result,
+      blockDim.x);
+
+    initialized_result =
+      shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
+  }
+}
+
+template< typename iType, class Lambda, typename ReducerType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   const ReducerType& reducer) {
+
+  typedef typename ReducerType::value_type ValueType;
+
+  ValueType result = ValueType();
+  reducer.init(result);
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+
+  if ( 1 < loop_boundaries.thread.team_size() ) {
+    multi_shfl_warp_reduction(
+      [&] (ValueType& val1, const ValueType& val2) { reducer.join(val1,val2); },
+      result,
+      blockDim.x);
+
+    reducer.reference() =
+      shfl_warp_broadcast<ValueType>( result, 0, blockDim.x );
+  }
+  else {
+    reducer.reference() = result ;
+  }
+}
+// scan across corresponding vector lanes between team members within warp
+// assume vec_length*team_size == warp_size
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template< typename iType, class Closure >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Closure & closure )
+{
+  // Extract value_type from closure
+
+  using value_type =
+    typename Kokkos::Impl::FunctorAnalysis
+      < Kokkos::Impl::FunctorPatternInterface::SCAN
+      , void
+      , Closure >::value_type ;
+
+  if ( 1 < loop_boundaries.thread.team_size() ) {
+
+    // make sure all threads perform all loop iterations
+    const iType bound = loop_boundaries.end + loop_boundaries.start ;
+    const int lane = threadIdx.y * blockDim.x ;
+
+    value_type accum = 0 ;
+    value_type val, y, local_total;
+
+    for( iType i = loop_boundaries.start; i < bound; i+=loop_boundaries.increment) {
+      val = 0;
+      if ( i < loop_boundaries.end ) closure(i,val,false);
+
+      // intra-blockDim.y exclusive scan on 'val'
+      // accum = running total from previous iterations; local_total = total for this iteration
+
+      // INCLUSIVE scan
+      for( int offset = blockDim.x ; offset < Impl::CudaTraits::WarpSize ; offset <<= 1 ) {
+        y = Kokkos::shfl_up(val, offset, Impl::CudaTraits::WarpSize);
+        if(lane >= offset) { val += y; }
+      }
+
+      // pass accum to all threads
+      local_total = shfl_warp_broadcast<value_type>(
+         val,
+         threadIdx.x+Impl::CudaTraits::WarpSize-blockDim.x,
+         Impl::CudaTraits::WarpSize);
+
+      // make EXCLUSIVE scan by shifting values over one
+      val = Kokkos::shfl_up(val, blockDim.x, Impl::CudaTraits::WarpSize);
+      if ( threadIdx.y == 0 ) { val = 0 ; }
+
+      val += accum;
+      if ( i < loop_boundaries.end ) closure(i,val,true);
+      accum += local_total;
+    }
+  }
+  else {
+    value_type accum = 0 ;
+    for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+      closure(i,accum,true);
+    }
+  }
+}
+
+// scan within team member (vector) within warp
+// assume vec_length*team_size == warp_size
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template< typename iType, class Closure >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Closure & closure )
+{
+  // Extract value_type from closure
+
+  using value_type =
+    typename Kokkos::Impl::FunctorAnalysis
+      < Kokkos::Impl::FunctorPatternInterface::SCAN
+      , void
+      , Closure >::value_type ;
+
+  if ( 1 < loop_boundaries.thread.team_size() ) {
+
+    // make sure all threads perform all loop iterations
+    const iType bound = loop_boundaries.end + loop_boundaries.start ;
+
+    value_type accum = 0 ;
+    value_type val, y, local_total;
+
+    for( iType i = loop_boundaries.start; i < bound; i+=loop_boundaries.increment) {
+      val = 0;
+      if ( i < loop_boundaries.end ) closure(i,val,false);
+
+      // intra-blockDim.x exclusive scan on 'val'
+      // accum = running total from previous iterations; local_total = total for this iteration
+
+      // INCLUSIVE scan
+      for( int offset = 1 ; offset < blockDim.x ; offset <<= 1 ) {
+        y = Kokkos::shfl_up(val, offset, blockDim.x);
+        if(threadIdx.x >= offset) { val += y; }
+      }
+
+      // pass accum to all threads
+      local_total = shfl_warp_broadcast<value_type>(val, blockDim.x-1, blockDim.x);
+
+      // make EXCLUSIVE scan by shifting values over one
+      val = Kokkos::shfl_up(val, 1, blockDim.x);
+      if ( threadIdx.x == 0 ) { val = 0 ; }
+
+      val += accum;
+      if ( i < loop_boundaries.end ) closure(i,val,true);
+      accum += local_total;
+    }
+  }
+  else {
+    value_type accum = 0 ;
+    for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+      closure(i,accum,true);
+    }
+  }
+}
+
+} /* namespace Kokkos */
+
+namespace Kokkos {
+
+  template<class FunctorType>
+  KOKKOS_INLINE_FUNCTION
+  void single(const Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >& , const FunctorType& lambda) {
+#ifdef __CUDA_ARCH__
+    if(threadIdx.x == 0) lambda();
+#endif
+  }
+  
+  template<class FunctorType>
+  KOKKOS_INLINE_FUNCTION
+  void single(const Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >& , const FunctorType& lambda) {
+#ifdef __CUDA_ARCH__
+    if(threadIdx.x == 0 && threadIdx.y == 0) lambda();
+#endif
+  }
+  
+  template<class FunctorType, class ValueType>
+  KOKKOS_INLINE_FUNCTION
+  void single(const Impl::VectorSingleStruct<Impl::TaskExec< Kokkos::Cuda > >& s , const FunctorType& lambda, ValueType& val) {
+#ifdef __CUDA_ARCH__
+    if(threadIdx.x == 0) lambda(val);
+    if ( 1 < s.team_member.team_size() ) {
+      val = shfl(val,0,blockDim.x);
+    }
+#endif
+  }
+  
+  template<class FunctorType, class ValueType>
+  KOKKOS_INLINE_FUNCTION
+  void single(const Impl::ThreadSingleStruct<Impl::TaskExec< Kokkos::Cuda > >& single_struct, const FunctorType& lambda, ValueType& val) {
+#ifdef __CUDA_ARCH__
+    if(threadIdx.x == 0 && threadIdx.y == 0) {
+      lambda(val);
+    }
+    single_struct.team_member.team_broadcast(val,0);
+#endif
+  }
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
+#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */
+
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4fd5e783411f8089b636a69b1c6706d212fcb841
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
@@ -0,0 +1,996 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_TEAM_HPP
+#define KOKKOS_CUDA_TEAM_HPP
+
+#include <iostream>
+#include <algorithm>
+#include <stdio.h>
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
+
+#include <utility>
+#include <Kokkos_Parallel.hpp>
+
+#include <Cuda/Kokkos_CudaExec.hpp>
+#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
+#include <Cuda/Kokkos_Cuda_Internal.hpp>
+#include <Kokkos_Vectorization.hpp>
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <typeinfo>
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< typename Type >
+struct CudaJoinFunctor {
+  typedef Type value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    volatile const value_type & input )
+    { update += input ; }
+};
+
+/**\brief  Team member_type passed to TeamPolicy or TeamTask closures.
+ *
+ *  Cuda thread blocks for team closures are dimensioned as:
+ *    blockDim.x == number of "vector lanes" per "thread"
+ *    blockDim.y == number of "threads" per team
+ *    blockDim.z == number of teams in a block
+ *  where
+ *    A set of teams exactly fill a warp OR a team is the whole block
+ *      ( 0 == WarpSize % ( blockDim.x * blockDim.y ) )
+ *      OR
+ *      ( 1 == blockDim.z )
+ *
+ *  Thus when 1 < blockDim.z the team is warp-synchronous
+ *  and __syncthreads should not be called in team collectives.
+ *
+ *  When multiple teams are mapped onto a single block then the
+ *  total available shared memory must be partitioned among teams.
+ */
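+// Worked example of the mapping above (illustrative numbers only):
+//   vector length 4, team size 8   ->  blockDim.x * blockDim.y == 32 == WarpSize,
+//     so several warp-synchronous teams may share one block ( 1 < blockDim.z ).
+//   vector length 1, team size 128 ->  the team is the whole block ( 1 == blockDim.z )
+//     and team collectives may use __syncthreads.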
+class CudaTeamMember {
+private:
+
+  typedef Kokkos::Cuda                           execution_space ;
+  typedef execution_space::scratch_memory_space  scratch_memory_space ;
+
+  mutable void        * m_team_reduce ;
+  scratch_memory_space  m_team_shared ;
+  int                   m_team_reduce_size ;
+  int                   m_league_rank ;
+  int                   m_league_size ;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space & team_shmem() const
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space &
+    team_scratch(const int& level) const
+      { return m_team_shared.set_team_thread_mode(level,1,0) ; }
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space &
+    thread_scratch(const int& level) const
+      { return m_team_shared.set_team_thread_mode(level,team_size(),team_rank()) ; }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const
+    {
+      #ifdef __CUDA_ARCH__
+      return threadIdx.y ;
+      #else
+      return 0;
+      #endif
+    }
+
+  KOKKOS_INLINE_FUNCTION int team_size() const
+    {
+      #ifdef __CUDA_ARCH__
+      return blockDim.y ;
+      #else
+      return 1;
+      #endif
+    }
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const
+    {
+      #ifdef __CUDA_ARCH__
+      if ( 1 == blockDim.z ) __syncthreads();       // team == block
+      else                   __threadfence_block(); // team <= warp
+      #endif
+    }
+
+  //--------------------------------------------------------------------------
+
+  template<class ValueType>
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast( ValueType & val, const int& thread_id) const
+    {
+      #ifdef __CUDA_ARCH__
+      if ( 1 == blockDim.z ) { // team == block
+        __syncthreads();
+        // Wait until all threads arrive here before the shared data write
+        if ( threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id ) {
+          *((ValueType*) m_team_reduce) = val ;
+        }
+        __syncthreads(); // Wait until the root thread's write is visible before reading
+        val = *((ValueType*) m_team_reduce);
+      }
+      else { // team <= warp
+        ValueType tmp( val ); // input might not be a register variable
+        cuda_shfl( val, tmp, blockDim.x * thread_id, blockDim.x * blockDim.y );
+      }
+      #endif
+    }
+
+  //--------------------------------------------------------------------------
+  /**\brief  Reduction across a team
+   *
+   *  Mapping of teams onto blocks:
+   *    blockDim.x  is "vector lanes"
+   *    blockDim.y  is team "threads"
+   *    blockDim.z  is number of teams per block
+   *
+   *  Requires:
+   *    blockDim.x is a power of two
+   *    blockDim.x <= CudaTraits::WarpSize
+   *    ( 0 == CudaTraits::WarpSize % ( blockDim.x * blockDim.y ) )
+   *      OR
+   *    ( 1 == blockDim.z )
+   */
+  template< typename ReducerType >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< is_reducer< ReducerType >::value >::type
+  team_reduce( ReducerType const & reducer ) const noexcept
+    {
+      #ifdef __CUDA_ARCH__
+
+      typedef typename ReducerType::value_type value_type ;
+
+      value_type tmp( reducer.reference() );
+
+      // reduce within the warp using shuffle
+
+      const int wx =
+        ( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ;
+
+      for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
+
+        cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize );
+
+        // Root of each vector lane reduces:
+        if ( 0 == threadIdx.x && wx < i ) {
+          reducer.join( tmp , reducer.reference() );
+        }
+      }
+
+      if ( 1 < blockDim.z ) { // team <= warp
+        // broadcast result from root vector lane of root thread
+
+        cuda_shfl( reducer.reference() , tmp
+                 , blockDim.x * threadIdx.y , CudaTraits::WarpSize );
+
+      }
+      else { // team == block
+        // Reduce across warps using shared memory
+        // Broadcast result within block
+
+        // Number of warps; blockDim.y may not be a power of two:
+        const int nw  = ( blockDim.x * blockDim.y + CudaTraits::WarpIndexMask ) >> CudaTraits::WarpIndexShift ;
+
+        // Warp index:
+        const int wy = ( blockDim.x * threadIdx.y ) >> CudaTraits::WarpIndexShift ;
+
+        // Number of shared memory entries for the reduction:
+        int nsh = m_team_reduce_size / sizeof(value_type);
+
+        // Using at most one entry per warp:
+        if ( nw < nsh ) nsh = nw ;
+
+        __syncthreads(); // Wait before shared data write
+
+        if ( 0 == wx && wy < nsh ) {
+          ((value_type*) m_team_reduce)[wy] = tmp ;
+        }
+
+        // When more warps than shared entries:
+        for ( int i = nsh ; i < nw ; i += nsh ) {
+
+          __syncthreads();
+
+          if ( 0 == wx && i <= wy ) {
+            const int k = wy - i ;
+            if ( k < nsh ) {
+              reducer.join( *((value_type*) m_team_reduce + k) , tmp );
+            }
+          }
+        }
+
+        __syncthreads();
+
+        // One warp performs the inter-warp reduction:
+
+        if ( 0 == wy ) {
+
+          // Start at power of two covering nsh
+
+          for ( int i = 1 << ( 32 - __clz(nsh-1) ) ; ( i >>= 1 ) ; ) {
+            const int k = wx + i ;
+            if ( wx < i && k < nsh ) {
+              reducer.join( ((value_type*)m_team_reduce)[wx]
+                          , ((value_type*)m_team_reduce)[k] );
+              __threadfence_block();
+            }
+          }
+        }
+
+        __syncthreads(); // Wait for reduction
+
+        // Broadcast result to all threads
+        reducer.reference() = *((value_type*)m_team_reduce);
+      }
+
+      #endif /* #ifdef __CUDA_ARCH__ */
+    }
+
+  //--------------------------------------------------------------------------
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION
+  Type team_scan( const Type & value , Type * const global_accum ) const
+    {
+      #ifdef __CUDA_ARCH__
+      Type * const base_data = (Type *) m_team_reduce ;
+
+      __syncthreads(); // Don't write into shared data until all threads have entered this function
+
+      if ( 0 == threadIdx.y ) { base_data[0] = 0 ; }
+
+      base_data[ threadIdx.y + 1 ] = value ;
+
+      Impl::cuda_intra_block_reduce_scan<true,Impl::CudaJoinFunctor<Type>,void>( Impl::CudaJoinFunctor<Type>() , base_data + 1 );
+
+      if ( global_accum ) {
+        if ( blockDim.y == threadIdx.y + 1 ) {
+          base_data[ blockDim.y ] = atomic_fetch_add( global_accum , base_data[ blockDim.y ] );
+        }
+        __syncthreads(); // Wait for atomic
+        base_data[ threadIdx.y ] += base_data[ blockDim.y ] ;
+      }
+
+      return base_data[ threadIdx.y ];
+      #else
+      return Type();
+      #endif
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const {
+    return this->template team_scan<Type>( value , 0 );
+  }
+
+  //----------------------------------------
+
+  template< typename ReducerType >
+  KOKKOS_INLINE_FUNCTION static
+  typename std::enable_if< is_reducer< ReducerType >::value >::type
+  vector_reduce( ReducerType const & reducer )
+    {
+
+      #ifdef __CUDA_ARCH__
+      if(blockDim.x == 1) return;
+
+      // Intra vector lane shuffle reduction:
+      typename ReducerType::value_type tmp ( reducer.reference() );
+
+      unsigned mask = blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x;
+
+      for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
+        cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x , mask );
+        if ( (int)threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
+      }
+
+      // Broadcast from root lane to all other lanes.
+      // Cannot use "butterfly" algorithm to avoid the broadcast
+      // because floating point summation is not associative
+      // and thus different threads could have different results.
+
+      cuda_shfl( reducer.reference() , tmp , 0 , blockDim.x , mask );
+      #endif
+    }
+
+  //--------------------------------------------------------------------------
+  /**\brief  Global reduction across all blocks
+   *
+   *  Return !0 if reducer contains the final value
+   */
+  template< typename ReducerType >
+  KOKKOS_INLINE_FUNCTION static
+  typename std::enable_if< is_reducer< ReducerType >::value , int >::type
+  global_reduce( ReducerType const & reducer
+               , int  * const global_scratch_flags
+               , void * const global_scratch_space
+               , void * const shmem
+               , int    const shmem_size
+               )
+    {
+    #ifdef __CUDA_ARCH__
+
+      typedef typename ReducerType::value_type value_type ;
+      typedef value_type volatile * pointer_type ;
+
+      // Number of shared memory entries for the reduction:
+      const int nsh = shmem_size / sizeof(value_type);
+
+      // Number of CUDA threads in the block, rank within the block
+      const int nid = blockDim.x * blockDim.y * blockDim.z ;
+      const int tid = threadIdx.x + blockDim.x * (
+                      threadIdx.y + blockDim.y * threadIdx.z );
+
+      // Reduces within block using all available shared memory
+      // Contributes if it is the root "vector lane"
+
+      // wn == number of warps in the block
+      // wx == which lane within the warp
+      // wy == which warp within the block
+
+      const int wn = ( nid + CudaTraits::WarpIndexMask ) >> CudaTraits::WarpIndexShift ;
+      const int wx = tid &  CudaTraits::WarpIndexMask ;
+      const int wy = tid >> CudaTraits::WarpIndexShift ;
+
+      //------------------------
+      { // Intra warp shuffle reduction from contributing CUDA threads
+
+        value_type tmp( reducer.reference() );
+
+        for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
+
+          cuda_shfl_down( reducer.reference(), tmp, i, CudaTraits::WarpSize );
+
+          // Root of each vector lane reduces "thread" contribution
+          if ( 0 == threadIdx.x && wx < i ) {
+            reducer.join( & tmp , reducer.data() );
+          }
+        }
+
+        // Reduce across warps using shared memory.
+        // Number of warps may not be a power of two.
+
+        __syncthreads(); // Wait before shared data write
+
+        // Number of shared memory entries for the reduction
+        // is at most one per warp
+        const int nentry = wn < nsh ? wn : nsh ;
+
+        if ( 0 == wx && wy < nentry ) {
+          // Root thread of warp 'wy' has warp's value to contribute
+          ((value_type*) shmem)[wy] = tmp ;
+        }
+
+        __syncthreads(); // Wait for write to be visible to block
+
+        // When more warps than shared entries
+        // then warps must take turns joining their contribution
+        // to the designated shared memory entry.
+        for ( int i = nentry ; i < wn ; i += nentry ) {
+
+          const int k = wy - i ;
+
+          if ( 0 == wx && i <= wy && k < nentry ) {
+            // Root thread of warp 'wy' has warp's value to contribute
+            reducer.join( ((value_type*) shmem) + k , & tmp );
+          }
+
+          __syncthreads(); // Wait for write to be visible to block
+        }
+
+        // One warp performs the inter-warp reduction:
+
+        if ( 0 == wy ) {
+
+          // Start fan-in at power of two covering nentry
+
+          for ( int i = ( 1 << ( 32 - __clz(nentry-1) ) ) ; ( i >>= 1 ) ; ) {
+            const int k = wx + i ;
+            if ( wx < i && k < nentry ) {
+              reducer.join( ((pointer_type)shmem) + wx
+                          , ((pointer_type)shmem) + k );
+              __threadfence_block(); // Wait for write to be visible to warp
+            }
+          }
+        }
+      }
+      //------------------------
+      { // Write block's value to global_scratch_memory
+
+        int last_block = 0 ;
+
+        if ( 0 == wx ) {
+          reducer.copy( ((pointer_type)global_scratch_space)
+                        + blockIdx.x * reducer.length()
+                      , reducer.data() );
+
+          __threadfence(); // Wait until global write is visible.
+
+          last_block = (int)gridDim.x ==
+                       1 + Kokkos::atomic_fetch_add(global_scratch_flags,1);
+
+          // If last block then reset count
+          if ( last_block ) *global_scratch_flags = 0 ;
+        }
+
+        last_block = __syncthreads_or( last_block );
+
+        if ( ! last_block ) return 0 ;
+
+      }
+      //------------------------
+      // Last block reads global_scratch_memory into shared memory.
+
+      const int nentry = nid < gridDim.x ?
+                       ( nid       < nsh ? nid       : nsh ) :
+                       ( gridDim.x < nsh ? gridDim.x : nsh ) ;
+
+      // nentry = min( nid , nsh , gridDim.x )
+
+      // whole block reads global memory into shared memory:
+
+      if ( tid < nentry ) {
+
+        const int offset = tid * reducer.length();
+
+        reducer.copy( ((pointer_type)shmem) + offset
+                    , ((pointer_type)global_scratch_space) + offset );
+
+        for ( int i = nentry + tid ; i < (int)gridDim.x ; i += nentry ) {
+          reducer.join( ((pointer_type)shmem) + offset
+                      , ((pointer_type)global_scratch_space)
+                        + i * reducer.length() );
+        }
+      }
+
+      __syncthreads(); // Wait for writes to be visible to block
+
+      if ( 0 == wy ) {
+
+        // Iterate to reduce shared memory to single warp fan-in size
+
+        const int nreduce = CudaTraits::WarpSize < nentry
+                          ? CudaTraits::WarpSize : nentry ;
+
+        // nreduce = min( CudaTraits::WarpSize , nsh , gridDim.x )
+
+        if ( wx < nreduce && nreduce < nentry ) {
+          for ( int i = nreduce + wx ; i < nentry ; i += nreduce ) {
+            reducer.join( ((pointer_type)shmem) + wx
+                        , ((pointer_type)shmem) + i );
+          }
+          __threadfence_block(); // Wait for writes to be visible to warp
+        }
+
+        // Start fan-in at power of two covering nentry
+
+        for ( int i = ( 1 << ( 32 - __clz(nreduce-1) ) ) ; ( i >>= 1 ) ; ) {
+          const int k = wx + i ;
+          if ( wx < i && k < nreduce ) {
+            reducer.join( ((pointer_type)shmem) + wx
+                        , ((pointer_type)shmem) + k );
+            __threadfence_block(); // Wait for writes to be visible to warp
+          }
+        }
+
+        if ( 0 == wx ) {
+          reducer.copy( reducer.data() , (pointer_type)shmem );
+          return 1 ;
+        }
+      }
+      return 0 ;
+
+    #else
+      return 0 ;
+    #endif
+    }
+
+  //----------------------------------------
+  // Private for the driver
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTeamMember( void * shared
+                , const int shared_begin
+                , const int shared_size
+                , void*     scratch_level_1_ptr
+                , const int scratch_level_1_size
+                , const int arg_league_rank
+                , const int arg_league_size )
+    : m_team_reduce( shared )
+    , m_team_shared( ((char *)shared) + shared_begin , shared_size,  scratch_level_1_ptr, scratch_level_1_size)
+    , m_team_reduce_size( shared_begin )
+    , m_league_rank( arg_league_rank )
+    , m_league_size( arg_league_size )
+    {}
+
+public:
+  // Declared to avoid unused-private-member warnings, which are triggered
+  // when SFINAE excludes the member function that uses these variables.
+  // Making another class a friend also suppresses these warnings.
+  bool impl_avoid_sfinae_warning() const noexcept
+  {
+    return m_team_reduce_size > 0 && m_team_reduce != nullptr;
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<typename iType>
+struct TeamThreadRangeBoundariesStruct<iType,CudaTeamMember> {
+  typedef iType index_type;
+  const CudaTeamMember& member;
+  const iType start;
+  const iType end;
+
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count)
+    : member(thread_)
+    , start( 0 )
+    , end( count ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_,  const iType& begin_, const iType& end_)
+    : member(thread_)
+    , start( begin_ )
+    , end( end_ ) {}
+};
+
+
+
+template<typename iType>
+struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {
+  typedef iType index_type;
+  const iType start;
+  const iType end;
+
+  KOKKOS_INLINE_FUNCTION
+  ThreadVectorRangeBoundariesStruct (const CudaTeamMember, const iType& count)
+    : start( 0 ), end( count ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  ThreadVectorRangeBoundariesStruct (const iType& count)
+    : start( 0 ), end( count ) {}
+};
+
+} // namespace Impl
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >
+TeamThreadRange( const Impl::CudaTeamMember & thread, const iType & count ) {
+  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, count );
+}
+
+template< typename iType1, typename iType2 >
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
+                                       Impl::CudaTeamMember >
+TeamThreadRange( const Impl::CudaTeamMember & thread, const iType1 & begin, const iType2 & end ) {
+  typedef typename std::common_type< iType1, iType2 >::type iType;
+  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, iType(begin), iType(end) );
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
+ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >(thread,count);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::CudaTeamMember> PerTeam(const Impl::CudaTeamMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::CudaTeamMember>(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::CudaTeamMember> PerThread(const Impl::CudaTeamMember& thread) {
+  return Impl::VectorSingleStruct<Impl::CudaTeamMember>(thread);
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Inter-thread parallel_for.
+ *
+ *  Executes closure(iType i) for each i=[0..N).
+ *
+ * The range [0..N) is mapped to all threads of the calling thread team.
+ */
+template<typename iType, class Closure >
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>&
+      loop_boundaries
+  , const Closure & closure
+  )
+{
+  #ifdef __CUDA_ARCH__
+  for( iType i = loop_boundaries.start + threadIdx.y
+     ; i < loop_boundaries.end
+     ; i += blockDim.y )
+    closure(i);
+  #endif
+}
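+
+// Usage sketch (illustrative only): nesting the inter-thread parallel_for
+// above inside a TeamPolicy functor; 'data' is a hypothetical rank-2 View,
+// 'league_size', 'team_size', and 'N' are hypothetical sizes.
+//
+//   using member_type = Kokkos::TeamPolicy< Kokkos::Cuda >::member_type ;
+//   Kokkos::parallel_for( Kokkos::TeamPolicy< Kokkos::Cuda >( league_size, team_size ),
+//     KOKKOS_LAMBDA( const member_type & member ) {
+//       Kokkos::parallel_for( Kokkos::TeamThreadRange( member, N ),
+//         [&]( const int i ) { data( member.league_rank(), i ) = i ; } );
+//     });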
+
+//----------------------------------------------------------------------------
+
+/** \brief  Inter-thread parallel_reduce with a reducer.
+ *
+ *  Executes closure(iType i, ValueType & val) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all threads of the
+ *  calling thread team and a summation of val is
+ *  performed and put into result.
+ */
+template< typename iType, class Closure, class ReducerType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
+parallel_reduce
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
+      loop_boundaries
+  , const Closure & closure
+  , const ReducerType & reducer
+  )
+{
+#ifdef __CUDA_ARCH__
+
+  reducer.init( reducer.reference() );
+
+  for( iType i = loop_boundaries.start + threadIdx.y
+     ; i < loop_boundaries.end
+     ; i += blockDim.y ) {
+    closure(i,reducer.reference());
+  }
+
+  loop_boundaries.member.team_reduce( reducer );
+
+#endif
+}
+
+
+/** \brief  Inter-thread parallel_reduce assuming summation.
+ *
+ *  Executes closure(iType i, ValueType & val) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all threads of the
+ *  calling thread team and a summation of val is
+ *  performed and put into result.
+ */
+template< typename iType, class Closure, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< ! Kokkos::is_reducer< ValueType >::value >::type
+parallel_reduce
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
+      loop_boundaries
+  , const Closure & closure
+  , ValueType & result
+  )
+{
+#ifdef __CUDA_ARCH__
+
+  Kokkos::Experimental::Sum<ValueType> reducer(result);
+
+  reducer.init( reducer.reference() );
+
+  for( iType i = loop_boundaries.start + threadIdx.y
+     ; i < loop_boundaries.end
+     ; i += blockDim.y ) {
+    closure(i,result);
+  }
+
+  loop_boundaries.member.team_reduce( reducer );
+
+#endif
+}
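+
+// Usage sketch (illustrative only): the summation form of the inter-thread
+// parallel_reduce above, called from inside a team functor; 'row' is a
+// hypothetical rank-1 View and 'N' a hypothetical extent.
+//
+//   double sum = 0 ;
+//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member, N ),
+//     [&]( const int i, double & partial ) { partial += row(i); }, sum );
+//   // every thread of the team holds the reduced 'sum' afterwards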
+
+//----------------------------------------------------------------------------
+
+/** \brief  Intra-thread vector parallel_for.
+ *
+ *  Executes closure(iType i) for each i=[0..N)
+ *
+ * The range [0..N) is mapped to all vector lanes of the calling thread.
+ */
+template<typename iType, class Closure >
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>&
+    loop_boundaries
+  , const Closure & closure
+  )
+{
+#ifdef __CUDA_ARCH__
+  for ( iType i = loop_boundaries.start + threadIdx.x
+      ; i < loop_boundaries.end
+      ; i += blockDim.x ) {
+    closure(i);
+  }
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Intra-thread vector parallel_reduce.
+ *
+ *  Calls closure(iType i, ValueType & val) for each i=[0..N).
+ *
+ *  The range [0..N) is mapped to all vector lanes of
+ *  the calling thread and a reduction of val is performed using +=
+ *  and output into result.
+ *
+ *  The identity value for the += operator is assumed to be the default
+ *  constructed value.
+ */
+template< typename iType, class Closure, class ReducerType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< is_reducer< ReducerType >::value >::type
+parallel_reduce
+  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>
+      const & loop_boundaries
+  , Closure const & closure
+  , ReducerType const & reducer )
+{
+#ifdef __CUDA_ARCH__
+
+  reducer.init( reducer.reference() );
+
+  for ( iType i = loop_boundaries.start + threadIdx.x
+      ; i < loop_boundaries.end
+      ; i += blockDim.x ) {
+    closure(i,reducer.reference());
+  }
+
+  Impl::CudaTeamMember::vector_reduce( reducer );
+
+#endif
+}
+
+/** \brief  Intra-thread vector parallel_reduce.
+ *
+ *  Calls closure(iType i, ValueType & val) for each i=[0..N).
+ *
+ *  The range [0..N) is mapped to all vector lanes of
+ *  the calling thread and a reduction of val is performed using +=
+ *  and output into result.
+ *
+ *  The identity value for the += operator is assumed to be the default
+ *  constructed value.
+ */
+template< typename iType, class Closure, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< ! is_reducer< ValueType >::value >::type
+parallel_reduce
+  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>
+      const & loop_boundaries
+  , Closure const & closure
+  , ValueType & result )
+{
+#ifdef __CUDA_ARCH__
+  result = ValueType();
+
+  for ( iType i = loop_boundaries.start + threadIdx.x
+      ; i < loop_boundaries.end
+      ; i += blockDim.x ) {
+    closure(i,result);
+  }
+
+  Impl::CudaTeamMember::vector_reduce(
+    Kokkos::Experimental::Sum<ValueType>(result ) );
+
+#endif
+}
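+
+// Usage sketch (illustrative only): the vector-level reduction above is
+// usually nested inside a TeamThreadRange loop; 'A' is a hypothetical
+// rank-2 View, 'nrow' and 'M' hypothetical extents.
+//
+//   Kokkos::parallel_for( Kokkos::TeamThreadRange( member, nrow ),
+//     [&]( const int i ) {
+//       double row_sum = 0 ;
+//       Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( member, M ),
+//         [&]( const int j, double & partial ) { partial += A(i,j); }, row_sum );
+//     });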
+
+//----------------------------------------------------------------------------
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum.
+ *
+ *  Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all vector lanes in the
+ *  thread and a scan operation is performed.
+ *  The last call to closure has final == true.
+ */
+template< typename iType, class Closure >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  ( const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
+      loop_boundaries
+  , const Closure & closure
+  )
+{
+
+#ifdef __CUDA_ARCH__
+
+  // Extract value_type from closure
+
+  using value_type =
+    typename Kokkos::Impl::FunctorAnalysis
+      < Kokkos::Impl::FunctorPatternInterface::SCAN
+      , void
+      , Closure >::value_type ;
+
+  // Loop through boundaries by vector-length chunks
+  // must scan at each iteration
+
+  value_type accum = 0 ;
+
+  // All thread "lanes" must loop the same number of times.
+  // Determine a loop end for all thread "lanes."
+  // Requires:
+  //   blockDim.x is power of two and thus
+  //     ( end % blockDim.x ) == ( end & ( blockDim.x - 1 ) )
+  //   1 <= blockDim.x <= CudaTraits::WarpSize
+
+  const int mask = blockDim.x - 1 ;
+  const unsigned active_mask = blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x;
+  const int rem  = loop_boundaries.end & mask ; // == end % blockDim.x
+  const int end  = loop_boundaries.end + ( rem ? blockDim.x - rem : 0 );
+
+  for ( int i = threadIdx.x ; i < end ; i += blockDim.x ) {
+
+    value_type val = 0 ;
+
+    // First acquire per-lane contributions:
+    if ( i < loop_boundaries.end ) closure( i , val , false );
+
+    value_type sval = val ;
+
+    // Bottom up inclusive scan in triangular pattern
+    // where each CUDA thread is the root of a reduction tree
+    // from the zeroth "lane" to itself.
+    //  [t] += [t-1] if t >= 1
+    //  [t] += [t-2] if t >= 2
+    //  [t] += [t-4] if t >= 4
+    //  ...
+
+    for ( int j = 1 ; j < (int)blockDim.x ; j <<= 1 ) {
+      value_type tmp = 0 ;
+      Impl::cuda_shfl_up(tmp, sval , j , blockDim.x, active_mask );
+      if ( j <= (int)threadIdx.x ) { sval += tmp ; }
+    }
+
+    // Include accumulation and remove value for exclusive scan:
+    val = accum + sval - val ;
+
+    // Provide exclusive scan value:
+    if ( i < loop_boundaries.end ) closure( i , val , true );
+
+    // Accumulate the last value in the inclusive scan:
+    Impl::cuda_shfl( sval , sval , mask , blockDim.x, active_mask );
+
+    accum += sval ;
+  }
+
+#endif
+}
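+
+// Usage sketch (illustrative only): using the vector-level exclusive scan
+// above to compute offsets; 'counts' and 'offsets' are hypothetical Views
+// of length 'M'.
+//
+//   Kokkos::parallel_scan( Kokkos::ThreadVectorRange( member, M ),
+//     [&]( const int j, int & update, const bool final ) {
+//       if ( final ) offsets(j) = update ;  // exclusive prefix value
+//       update += counts(j);
+//     });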
+
+}
+
+namespace Kokkos {
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
+#ifdef __CUDA_ARCH__
+  if(threadIdx.x == 0) lambda();
+  KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
+#endif
+}
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
+#ifdef __CUDA_ARCH__
+  if(threadIdx.x == 0 && threadIdx.y == 0) lambda();
+  KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
+#endif
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda, ValueType& val) {
+#ifdef __CUDA_ARCH__
+  if(threadIdx.x == 0) lambda(val);
+  unsigned mask = blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x;
+  Impl::cuda_shfl(val,val,0,blockDim.x,mask);
+#endif
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+#ifdef __CUDA_ARCH__
+  if(threadIdx.x == 0 && threadIdx.y == 0) {
+    lambda(val);
+  }
+  single_struct.team_member.team_broadcast(val,0);
+#endif
+}
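+
+// Usage sketch (illustrative only): the single() overloads above restrict
+// execution to one vector lane (PerThread) or one team member (PerTeam),
+// optionally broadcasting a value to the rest.
+//
+//   Kokkos::single( Kokkos::PerTeam( member ), [&]() { /* once per team */ } );
+//
+//   int token = 0 ;
+//   Kokkos::single( Kokkos::PerThread( member ), [&]( int & v ) { v = 42 ; }, token );
+//   // all vector lanes of the calling thread now hold token == 42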
+
+} // namespace Kokkos
+
+#endif /* defined( __CUDACC__ ) */
+
+#endif /* #ifndef KOKKOS_CUDA_TEAM_HPP */
+
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..82fa4e6b89838e1648f225233842977297f15428
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp
@@ -0,0 +1,133 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_UNIQUE_TOKEN_HPP
+#define KOKKOS_CUDA_UNIQUE_TOKEN_HPP
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <Kokkos_CudaSpace.hpp>
+#include <Kokkos_UniqueToken.hpp>
+#include <impl/Kokkos_SharedAlloc.hpp>
+#include <impl/Kokkos_ConcurrentBitset.hpp>
+
+namespace Kokkos { namespace Experimental {
+
+// both global and instance Unique Tokens are implemented in the same way
+template<>
+class UniqueToken< Cuda, UniqueTokenScope::Global >
+{
+private:
+
+  uint32_t volatile * m_buffer ;
+  uint32_t            m_count ;
+
+public:
+
+  using execution_space = Cuda;
+
+  explicit
+  UniqueToken( execution_space const& );
+
+  KOKKOS_INLINE_FUNCTION
+  UniqueToken() : m_buffer(0), m_count(0) {}
+
+  KOKKOS_FUNCTION_DEFAULTED
+  UniqueToken( const UniqueToken & ) = default;
+
+  KOKKOS_FUNCTION_DEFAULTED
+  UniqueToken( UniqueToken && )      = default;
+
+  KOKKOS_FUNCTION_DEFAULTED
+  UniqueToken & operator=( const UniqueToken & ) = default ;
+
+  KOKKOS_FUNCTION_DEFAULTED
+  UniqueToken & operator=( UniqueToken && ) = default ;
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int32_t size() const noexcept { return m_count ; }
+
+  /// \brief acquire value such that 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int32_t acquire() const
+  {
+    const Kokkos::pair<int,int> result =
+      Kokkos::Impl::concurrent_bitset::
+        acquire_bounded( m_buffer
+                       , m_count
+                       , Kokkos::Impl::clock_tic() % m_count
+                       );
+
+    if ( result.first < 0 ) {
+      Kokkos::abort("UniqueToken<Cuda> failure to acquire tokens, no tokens available" );
+    }
+
+    return result.first;
+  }
+
+  /// \brief release an acquired value
+  KOKKOS_INLINE_FUNCTION
+  void release( int32_t i ) const noexcept
+  {
+    Kokkos::Impl::concurrent_bitset::release( m_buffer, i );
+  }
+};
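+
+// Usage sketch (illustrative only): acquiring and releasing a token from
+// device code; 'N' is a hypothetical iteration count.
+//
+//   Kokkos::Experimental::UniqueToken< Kokkos::Cuda > token_pool( Kokkos::Cuda() );
+//   Kokkos::parallel_for( N, KOKKOS_LAMBDA( const int i ) {
+//     const int32_t id = token_pool.acquire();   // 0 <= id < token_pool.size()
+//     /* ... use resource 'id' exclusively ... */
+//     token_pool.release( id );
+//   });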
+
+template<>
+class UniqueToken< Cuda, UniqueTokenScope::Instance >
+  : public UniqueToken< Cuda, UniqueTokenScope::Global >
+{
+public:
+
+  explicit
+  UniqueToken( execution_space const& arg )
+    : UniqueToken< Cuda, UniqueTokenScope::Global >( arg ) {}
+};
+
+}} // namespace Kokkos::Experimental
+
+#endif // KOKKOS_ENABLE_CUDA
+#endif // KOKKOS_CUDA_UNIQUE_TOKEN_HPP
+
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c41cfc38d8300ec342e305f54b3728075e4d9750
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
@@ -0,0 +1,297 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_CUDA_VECTORIZATION_HPP
+#define KOKKOS_CUDA_VECTORIZATION_HPP
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <Kokkos_Cuda.hpp>
+#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
+namespace Kokkos {
+
+
+// Shuffle only makes sense on >= Kepler GPUs; it doesn't work on CPUs
+// or other GPUs.  We provide a generic definition (which is trivial
+// and doesn't do what it claims to do) because we don't actually use
+// this function unless we are on a suitable GPU, with a suitable
+// Scalar type.  (For example, in the mat-vec, the "ThreadsPerRow"
+// internal parameter depends both on the ExecutionSpace and the Scalar type,
+// and it controls whether shfl_down() gets called.)
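+
+// Usage sketch (illustrative only): typical use of the warp shuffles defined
+// below, inside device code on a shuffle-capable GPU, where 'width' is the
+// vector length (a power of two, at most 32) and 'v' a per-lane value.
+//
+//   double from_root = Kokkos::shfl( v, 0, width );      // broadcast lane 0
+//   double from_next = Kokkos::shfl_down( v, 1, width ); // value of lane+1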
+namespace Impl {
+
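+  // shfl_union packs an arbitrary Scalar (whose size is assumed to be a
+  // multiple of 4 bytes) into an array of floats so that it can be moved
+  // lane-to-lane word-by-word with the 4-byte float shuffle and then
+  // reassembled via value().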
+  template< typename Scalar >
+  struct shfl_union {
+    enum {n = sizeof(Scalar)/4};
+    float fval[n];
+    KOKKOS_INLINE_FUNCTION
+    Scalar value() {
+      return *(Scalar*) fval;
+    }
+    KOKKOS_INLINE_FUNCTION
+    void operator= (Scalar& value_) {
+      float* const val_ptr = (float*) &value_;
+      for(int i=0; i<n ; i++) {
+        fval[i] = val_ptr[i];
+      }
+    }
+    KOKKOS_INLINE_FUNCTION
+    void operator= (const Scalar& value_) {
+      float* const val_ptr = (float*) &value_;
+      for(int i=0; i<n ; i++) {
+        fval[i] = val_ptr[i];
+      }
+    }
+
+  };
+}
+
+#ifdef __CUDA_ARCH__
+  #if (__CUDA_ARCH__ >= 300)
+
+    KOKKOS_INLINE_FUNCTION
+    int shfl(const int &val, const int& srcLane, const int& width ) {
+      return KOKKOS_IMPL_CUDA_SHFL(val,srcLane,width);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float shfl(const float &val, const int& srcLane, const int& width ) {
+      return KOKKOS_IMPL_CUDA_SHFL(val,srcLane,width);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type& width ) {
+      Scalar tmp1 = val;
+      float tmp = *reinterpret_cast<float*>(&tmp1);
+      tmp = KOKKOS_IMPL_CUDA_SHFL(tmp,srcLane,width);
+      return *reinterpret_cast<Scalar*>(&tmp);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double shfl(const double &val, const int& srcLane, const int& width) {
+      int lo = __double2loint(val);
+      int hi = __double2hiint(val);
+      lo = KOKKOS_IMPL_CUDA_SHFL(lo,srcLane,width);
+      hi = KOKKOS_IMPL_CUDA_SHFL(hi,srcLane,width);
+      return __hiloint2double(hi,lo);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 8) ,int>::type& width) {
+      int lo = __double2loint(*reinterpret_cast<const double*>(&val));
+      int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
+      lo = KOKKOS_IMPL_CUDA_SHFL(lo,srcLane,width);
+      hi = KOKKOS_IMPL_CUDA_SHFL(hi,srcLane,width);
+      const double tmp = __hiloint2double(hi,lo);
+      return *(reinterpret_cast<const Scalar*>(&tmp));
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) > 8) ,int>::type& width) {
+      Impl::shfl_union<Scalar> s_val;
+      Impl::shfl_union<Scalar> r_val;
+      s_val = val;
+
+      for(int i = 0; i<s_val.n; i++)
+        r_val.fval[i] = KOKKOS_IMPL_CUDA_SHFL(s_val.fval[i],srcLane,width);
+      return r_val.value();
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int shfl_down(const int &val, const int& delta, const int& width) {
+      return KOKKOS_IMPL_CUDA_SHFL_DOWN(val,delta,width);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float shfl_down(const float &val, const int& delta, const int& width) {
+      return KOKKOS_IMPL_CUDA_SHFL_DOWN(val,delta,width);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
+      Scalar tmp1 = val;
+      float tmp = *reinterpret_cast<float*>(&tmp1);
+      tmp = KOKKOS_IMPL_CUDA_SHFL_DOWN(tmp,delta,width);
+      return *reinterpret_cast<Scalar*>(&tmp);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double shfl_down(const double &val, const int& delta, const int& width) {
+      int lo = __double2loint(val);
+      int hi = __double2hiint(val);
+      lo = KOKKOS_IMPL_CUDA_SHFL_DOWN(lo,delta,width);
+      hi = KOKKOS_IMPL_CUDA_SHFL_DOWN(hi,delta,width);
+      return __hiloint2double(hi,lo);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
+      int lo = __double2loint(*reinterpret_cast<const double*>(&val));
+      int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
+      lo = KOKKOS_IMPL_CUDA_SHFL_DOWN(lo,delta,width);
+      hi = KOKKOS_IMPL_CUDA_SHFL_DOWN(hi,delta,width);
+      const double tmp = __hiloint2double(hi,lo);
+      return *(reinterpret_cast<const Scalar*>(&tmp));
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
+      Impl::shfl_union<Scalar> s_val;
+      Impl::shfl_union<Scalar> r_val;
+      s_val = val;
+
+      for(int i = 0; i<s_val.n; i++)
+        r_val.fval[i] = KOKKOS_IMPL_CUDA_SHFL_DOWN(s_val.fval[i],delta,width);
+      return r_val.value();
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int shfl_up(const int &val, const int& delta, const int& width ) {
+      return KOKKOS_IMPL_CUDA_SHFL_UP(val,delta,width);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float shfl_up(const float &val, const int& delta, const int& width ) {
+      return KOKKOS_IMPL_CUDA_SHFL_UP(val,delta,width);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
+      Scalar tmp1 = val;
+      float tmp = *reinterpret_cast<float*>(&tmp1);
+      tmp = KOKKOS_IMPL_CUDA_SHFL_UP(tmp,delta,width);
+      return *reinterpret_cast<Scalar*>(&tmp);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double shfl_up(const double &val, const int& delta, const int& width ) {
+      int lo = __double2loint(val);
+      int hi = __double2hiint(val);
+      lo = KOKKOS_IMPL_CUDA_SHFL_UP(lo,delta,width);
+      hi = KOKKOS_IMPL_CUDA_SHFL_UP(hi,delta,width);
+      return __hiloint2double(hi,lo);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
+      int lo = __double2loint(*reinterpret_cast<const double*>(&val));
+      int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
+      lo = KOKKOS_IMPL_CUDA_SHFL_UP(lo,delta,width);
+      hi = KOKKOS_IMPL_CUDA_SHFL_UP(hi,delta,width);
+      const double tmp = __hiloint2double(hi,lo);
+      return *(reinterpret_cast<const Scalar*>(&tmp));
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
+      Impl::shfl_union<Scalar> s_val;
+      Impl::shfl_union<Scalar> r_val;
+      s_val = val;
+
+      for(int i = 0; i<s_val.n; i++)
+        r_val.fval[i] = KOKKOS_IMPL_CUDA_SHFL_UP(s_val.fval[i],delta,width);
+      return r_val.value();
+    }
+
+  #else
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl from a device with CC<3.0.");
+      return val;
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0.");
+      return val;
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl_up from a device with CC<3.0.");
+      return val;
+    }
+  #endif
+#else
+    template<typename Scalar>
+    inline
+    Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl from a device with CC<3.0.");
+      return val;
+    }
+
+    template<typename Scalar>
+    inline
+    Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0.");
+      return val;
+    }
+
+    template<typename Scalar>
+    inline
+    Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl_up from a device with CC<3.0.");
+      return val;
+    }
+#endif
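+
+// A minimal usage sketch (not part of this header): a warp-level sum using
+// the shfl_down() wrapper defined above.  Assumes a device of compute
+// capability >= 3.0 and that all 32 lanes of the warp hold a valid
+// contribution; the helper name warp_sum is hypothetical.
+//
+//   __device__ double warp_sum( double my_val ) {
+//     for ( int delta = 16 ; delta > 0 ; delta >>= 1 ) {
+//       // each step folds the upper half of the remaining lanes onto the lower half
+//       my_val += Kokkos::shfl_down( my_val , delta , 32 );
+//     }
+//     return my_val ; // lane 0 now holds the sum over the warp
+//   }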
+
+
+
+}
+
+#endif // KOKKOS_ENABLE_CUDA
+#endif
+
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2954223f98ed3e7d0d978011f5a9e7b8ba2c3339
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp
@@ -0,0 +1,70 @@
+#include <Kokkos_Macros.hpp>
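+
+// Map Kokkos' warp-level intrinsics onto the CUDA toolkit in use.  CUDA 9
+// deprecated the legacy warp primitives (__ballot, __shfl, __shfl_up,
+// __shfl_down) in favor of *_sync variants that take an explicit lane mask,
+// so the two toolkit generations need different spellings.  When not
+// compiling for the device, the macros expand to no-ops / 0 so the headers
+// still parse.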
+#if defined( __CUDA_ARCH__ )
+#if ( CUDA_VERSION < 9000 )
+#define KOKKOS_IMPL_CUDA_SYNCWARP __threadfence_block()
+#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(x) __threadfence_block()
+#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot(x)
+#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) __shfl(x,y,z)
+#define KOKKOS_IMPL_CUDA_SHFL_MASK(m,x,y,z) __shfl(x,y,z)
+#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) __shfl_up(x,y,z)
+#define KOKKOS_IMPL_CUDA_SHFL_UP_MASK(m,x,y,z) __shfl_up(x,y,z)
+#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x,y,z) __shfl_down(x,y,z)
+#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m,x,y,z) __shfl_down(x,y,z)
+#else
+#define KOKKOS_IMPL_CUDA_SYNCWARP __syncwarp(0xffffffff)
+#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m)
+#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot_sync(0xffffffff,x)
+#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) __shfl_sync(0xffffffff,x,y,z)
+#define KOKKOS_IMPL_CUDA_SHFL_MASK(m,x,y,z) __shfl_sync(m,x,y,z)
+#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) __shfl_up_sync(0xffffffff,x,y,z)
+#define KOKKOS_IMPL_CUDA_SHFL_UP_MASK(m,x,y,z) __shfl_up_sync(m,x,y,z)
+#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x,y,z) __shfl_down_sync(0xffffffff,x,y,z)
+#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m,x,y,z) __shfl_down_sync(m,x,y,z)
+#endif 
+#else
+#define KOKKOS_IMPL_CUDA_SYNCWARP 
+#define KOKKOS_IMPL_CUDA_BALLOT(x) 0
+#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) 0
+#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) 0
+#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x,y,z) 0
+#endif 
+
+#if defined( __CUDA_ARCH__ )
+#if ( CUDA_VERSION < 9000 )
+#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( MSG ) { \
+  const unsigned b = __ballot(1); \
+  if ( b != 0xffffffff ) { \
+    printf(" SYNCWARP AT %s (%d,%d,%d) (%d,%d,%d) failed %x\n" \
+      , MSG \
+      , blockIdx.x \
+      , blockIdx.y \
+      , blockIdx.z \
+      , threadIdx.x \
+      , threadIdx.y \
+      , threadIdx.z \
+      , b ); \
+    return ; \
+  } \
+}
+#else
+#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( MSG ) { \
+  __syncwarp(); \
+  const unsigned b = __activemask(); \
+  if ( b != 0xffffffff ) { \
+    printf(" SYNCWARP AT %s (%d,%d,%d) (%d,%d,%d) failed %x\n" \
+      , MSG \
+      , blockIdx.x \
+      , blockIdx.y \
+      , blockIdx.z \
+      , threadIdx.x \
+      , threadIdx.y \
+      , threadIdx.z \
+      , b ); \
+    return ; \
+  } \
+}
+#endif 
+#else
+#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( MSG ) 
+#endif 
+
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..49b11f3ae0a4a0fcc9b462412ad2b0fa9baf16cf
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
@@ -0,0 +1,316 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
+#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_CUDA )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Cuda texture fetches can be performed for 4, 8 and 16 byte objects (int, int2, int4).
+// Via reinterpret_cast this can be used to support all scalar types of those sizes.
+// Any other scalar type falls back either to normal reads out of global memory,
+// or to the __ldg intrinsic on Kepler GPUs or newer (compute capability >= 3.0).
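+//
+// A hedged usage sketch (not part of this header): the fetch handles below are
+// selected automatically when a View has a const value type, lives in
+// CudaSpace/CudaUVMSpace, and is marked RandomAccess; the names x and x_ra are
+// hypothetical.
+//
+//   Kokkos::View<const double*, Kokkos::CudaSpace,
+//                Kokkos::MemoryTraits<Kokkos::RandomAccess> > x_ra = x;
+//   // device-side reads of x_ra(i) now go through tex1Dfetch or __ldg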
+
+template< typename ValueType , typename AliasType >
+struct CudaTextureFetch {
+
+  ::cudaTextureObject_t   m_obj ;
+  const ValueType       * m_ptr ;
+  int                     m_offset ;
+
+  // Dereference operator pulls the value through the texture object and returns it by value
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator[]( const iType & i ) const
+    {
+#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
+      AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
+      return  *(reinterpret_cast<ValueType*> (&v));
+#else
+      return m_ptr[ i ];
+#endif
+    }
+
+  // Pointer to referenced memory
+  KOKKOS_INLINE_FUNCTION
+  operator const ValueType * () const { return m_ptr ; }
+
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch() : m_obj() , m_ptr() , m_offset() {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~CudaTextureFetch() {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch( const CudaTextureFetch & rhs )
+    : m_obj(     rhs.m_obj )
+    , m_ptr(     rhs.m_ptr )
+    , m_offset(  rhs.m_offset )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch( CudaTextureFetch && rhs )
+    : m_obj(     rhs.m_obj )
+    , m_ptr(     rhs.m_ptr )
+    , m_offset(  rhs.m_offset )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
+    {
+      m_obj     = rhs.m_obj ;
+      m_ptr     = rhs.m_ptr ;
+      m_offset  = rhs.m_offset ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch & operator = ( CudaTextureFetch && rhs )
+    {
+      m_obj     = rhs.m_obj ;
+      m_ptr     = rhs.m_ptr ;
+      m_offset  = rhs.m_offset ;
+      return *this ;
+    }
+
+  // Texture object spans the entire allocation.
+  // This handle may view a subset of the allocation, so an offset is required.
+  template< class CudaMemorySpace >
+  inline explicit
+  CudaTextureFetch( const ValueType * const arg_ptr
+                  , Kokkos::Impl::SharedAllocationRecord< CudaMemorySpace , void > * record
+                  )
+    : m_obj( record->template attach_texture_object< AliasType >() )
+    , m_ptr( arg_ptr )
+    , m_offset( record->attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
+    {}
+
+  // Texture object spans the entire allocation.
+  // This handle may view a subset of the allocation, so an offset is required.
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch( const CudaTextureFetch & rhs , size_t offset )
+    : m_obj(     rhs.m_obj )
+    , m_ptr(     rhs.m_ptr + offset)
+    , m_offset( offset + rhs.m_offset )
+    {}
+};
+
+#if defined( KOKKOS_ENABLE_CUDA_LDG_INTRINSIC )
+
+template< typename ValueType , typename AliasType >
+struct CudaLDGFetch {
+
+  const ValueType * m_ptr ;
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator[]( const iType & i ) const
+    {
+      #ifdef __CUDA_ARCH__
+      AliasType v = __ldg(reinterpret_cast<const AliasType*>(&m_ptr[i]));
+      return  *(reinterpret_cast<ValueType*> (&v));
+      #else
+      return m_ptr[i];
+      #endif
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  operator const ValueType * () const { return m_ptr ; }
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch() : m_ptr() {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~CudaLDGFetch() {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch( const CudaLDGFetch & rhs )
+    : m_ptr( rhs.m_ptr )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch( CudaLDGFetch && rhs )
+    : m_ptr( rhs.m_ptr )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch & operator = ( const CudaLDGFetch & rhs )
+    {
+      m_ptr = rhs.m_ptr ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch & operator = ( CudaLDGFetch && rhs )
+    {
+      m_ptr = rhs.m_ptr ;
+      return *this ;
+    }
+
+  template< class CudaMemorySpace >
+  inline explicit
+  CudaLDGFetch( const ValueType * const arg_ptr
+              , Kokkos::Impl::SharedAllocationRecord<CudaMemorySpace,void>*
+              )
+    : m_ptr( arg_ptr )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch( CudaLDGFetch const rhs ,size_t offset)
+    : m_ptr( rhs.m_ptr + offset )
+    {}
+
+};
+
+#endif
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Replace Default ViewDataHandle with Cuda texture fetch specialization
+ *          if 'const' value type, CudaSpace and random access.
+ */
+template< class Traits >
+class ViewDataHandle< Traits ,
+  typename std::enable_if<(
+    // Is Cuda memory space
+    ( std::is_same< typename Traits::memory_space,Kokkos::CudaSpace>::value ||
+      std::is_same< typename Traits::memory_space,Kokkos::CudaUVMSpace>::value )
+    &&
+    // Is a trivial const value of 4, 8, or 16 bytes
+    std::is_trivial<typename Traits::const_value_type>::value
+    &&
+    std::is_same<typename Traits::const_value_type,typename Traits::value_type>::value
+    &&
+    ( sizeof(typename Traits::const_value_type) ==  4 ||
+      sizeof(typename Traits::const_value_type) ==  8 ||
+      sizeof(typename Traits::const_value_type) == 16 )
+    &&
+    // Random access trait
+    ( Traits::memory_traits::RandomAccess != 0 )
+  )>::type >
+{
+public:
+
+  using track_type  = Kokkos::Impl::SharedAllocationTracker ;
+
+  using value_type  = typename Traits::const_value_type ;
+  using return_type = typename Traits::const_value_type ; // NOT a reference
+
+  using alias_type = typename std::conditional< ( sizeof(value_type) ==  4 ) , int ,
+                     typename std::conditional< ( sizeof(value_type) ==  8 ) , ::int2 ,
+                     typename std::conditional< ( sizeof(value_type) == 16 ) , ::int4 , void
+                     >::type
+                     >::type
+                     >::type ;
+
+#if defined( KOKKOS_ENABLE_CUDA_LDG_INTRINSIC )
+  using handle_type = Kokkos::Impl::CudaLDGFetch< value_type , alias_type > ;
+#else
+  using handle_type = Kokkos::Impl::CudaTextureFetch< value_type , alias_type > ;
+#endif
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type const & assign( handle_type const & arg_handle , track_type const & /* arg_tracker */ )
+    {
+      return arg_handle ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type const assign( handle_type const & arg_handle , size_t offset )
+    {
+      return handle_type(arg_handle,offset) ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( value_type * arg_data_ptr, track_type const & arg_tracker )
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      // Assignment of texture = non-texture requires creation of a texture object
+      // which can only occur on the host.  In addition, 'get_record' is only valid
+      // if called in a host execution space
+
+
+      typedef typename Traits::memory_space memory_space ;
+      typedef typename Impl::SharedAllocationRecord<memory_space,void> record ;
+
+      record * const r = arg_tracker.template get_record< memory_space >();
+
+#if ! defined( KOKKOS_ENABLE_CUDA_LDG_INTRINSIC )
+      if ( 0 == r ) {
+        Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory");
+      }
+#endif
+
+      return handle_type( arg_data_ptr , r );
+
+#else
+      Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel");
+      return handle_type();
+#endif
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
+#endif /* #ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP */
+
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..94e293d7c7b1658bf13358edab19ec536243acf8
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -0,0 +1,121 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
+#define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::WorkGraphPolicy< Traits ... >
+                 , Kokkos::Cuda
+                 >
+{
+public:
+
+  typedef Kokkos::WorkGraphPolicy< Traits ... >   Policy ;
+  typedef ParallelFor<FunctorType, Policy, Kokkos::Cuda>        Self ;
+
+private:
+
+  Policy       m_policy ;
+  FunctorType  m_functor ;
+
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_one( const std::int32_t w ) const noexcept
+    { m_functor( w ); }
+
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_one( const std::int32_t w ) const noexcept
+    { const TagType t{} ; m_functor( t , w ); }
+
+public:
+
+  __device__ inline
+  void operator()() const noexcept
+    {
+      if ( 0 == ( threadIdx.y % 16 ) ) {
+
+        // Spin until COMPLETED_TOKEN.
+        // END_TOKEN indicates no work is currently available.
+
+        for ( std::int32_t w = Policy::END_TOKEN ;
+              Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
+          if ( Policy::END_TOKEN != w ) {
+            exec_one< typename Policy::work_tag >( w );
+            m_policy.completed_work(w);
+          }
+        }
+      }
+    }
+
+  inline
+  void execute()
+  {
+    const int warps_per_block = 4 ;
+    const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
+    const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
+    const int shared = 0 ;
+    const cudaStream_t stream = 0 ;
+
+    Kokkos::Impl::CudaParallelLaunch<Self>(*this, grid, block, shared, stream);
+  }
+
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy      & arg_policy )
+    : m_policy( arg_policy )
+    , m_functor( arg_functor )
+  {}
+};
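+
+// Execution model implemented above: only threads whose threadIdx.y is a
+// multiple of 16 participate; each of them spins on pop_work().  END_TOKEN
+// means no node of the work graph is currently ready, while any other value w
+// is a ready node, which is executed and then marked complete (potentially
+// releasing its successors).  The loop exits once pop_work() returns
+// COMPLETED_TOKEN.  In user code this backend is reached via
+// Kokkos::parallel_for with a Kokkos::WorkGraphPolicy constructed from a
+// dependency graph (a hedged note; the policy and graph construction live in
+// other headers).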
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_CUDA_WORKGRAPHPOLICY_HPP */
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9f5415b5119d3091e960d13572495243a20c38ea
--- /dev/null
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp
@@ -0,0 +1,89 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_ABORT_HPP
+#define KOKKOS_CUDA_ABORT_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+#include <Kokkos_Macros.hpp>
+#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
+
+#include <cuda.h>
+
+extern "C" {
+/*  Cuda runtime function, declared in <crt/device_runtime.h>
+ *  Requires capability 2.x or better.
+ */
+extern __device__ void __assertfail(
+  const void  *message,
+  const void  *file,
+  unsigned int line,
+  const void  *function,
+  size_t       charsize);
+}
+
+namespace Kokkos {
+namespace Impl {
+
+__device__ inline
+void cuda_abort( const char * const message )
+{
+#ifndef __APPLE__
+  const char empty[] = "" ;
+
+  __assertfail( (const void *) message ,
+                (const void *) empty ,
+                (unsigned int) 0 ,
+                (const void *) empty ,
+                sizeof(char) );
+#endif
+}
+
+} // namespace Impl
+} // namespace Kokkos
+#else
+void KOKKOS_CORE_SRC_CUDA_ABORT_PREVENT_LINK_ERROR() {}
+#endif /* #if defined(__CUDACC__) && defined( KOKKOS_ENABLE_CUDA ) */
+#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */
+
diff --git a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c7f7f7981dd605883347b5d406025820c9413725
--- /dev/null
+++ b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
@@ -0,0 +1,546 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
+#define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
+
+#include <initializer_list>
+
+#include <impl/KokkosExp_Host_IterateTile.hpp>
+#include <Kokkos_ExecPolicy.hpp>
+#include <Kokkos_Parallel.hpp>
+
+#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
+#include <Cuda/KokkosExp_Cuda_IterateTile.hpp>
+#include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp>
+#endif
+
+#if defined( __HCC__ ) && defined( KOKKOS_ENABLE_ROCM )
+//#include<ROCm/KokkosExp_ROCm_IterateTile.hpp>
+#include <ROCm/KokkosExp_ROCm_IterateTile_Refactor.hpp>
+#endif
+
+namespace Kokkos {
+
+// ------------------------------------------------------------------ //
+
+enum class Iterate
+{
+  Default, // Default for the device
+  Left,    // Left indices stride fastest
+  Right,   // Right indices stride fastest
+};
+
+template <typename ExecSpace>
+struct default_outer_direction
+{
+  using type = Iterate;
+  #if defined( KOKKOS_ENABLE_CUDA)
+  static constexpr Iterate value = Iterate::Left;
+  #else
+  static constexpr Iterate value = Iterate::Right;
+  #endif
+};
+
+template <typename ExecSpace>
+struct default_inner_direction
+{
+  using type = Iterate;
+  #if defined( KOKKOS_ENABLE_CUDA)
+  static constexpr Iterate value = Iterate::Left;
+  #else
+  static constexpr Iterate value = Iterate::Right;
+  #endif
+};
+
+
+// Iteration Pattern
+template < unsigned N
+         , Iterate OuterDir = Iterate::Default
+         , Iterate InnerDir = Iterate::Default
+         >
+struct Rank
+{
+  static_assert( N != 0u, "Kokkos Error: rank 0 undefined");
+  static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range");
+  static_assert( N < 7u, "Kokkos Error: Unsupported rank...");
+
+  using iteration_pattern = Rank<N, OuterDir, InnerDir>;
+
+  static constexpr int rank = N;
+  static constexpr Iterate outer_direction = OuterDir;
+  static constexpr Iterate inner_direction = InnerDir;
+};
+
+
+// multi-dimensional iteration pattern
+template <typename... Properties>
+struct MDRangePolicy
+  : public Kokkos::Impl::PolicyTraits<Properties ...>
+{
+  using traits = Kokkos::Impl::PolicyTraits<Properties ...>;
+  using range_policy = RangePolicy<Properties...>;
+
+  using impl_range_policy = RangePolicy< typename traits::execution_space
+                                       , typename traits::schedule_type
+                                       , typename traits::index_type
+                                       > ;
+
+  typedef MDRangePolicy execution_policy; // needed for is_execution_space interrogation
+
+  static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
+               , "Kokkos Error: MD iteration pattern not defined" );
+
+  using iteration_pattern   = typename traits::iteration_pattern;
+  using work_tag            = typename traits::work_tag;
+  using launch_bounds       = typename traits::launch_bounds;
+  using member_type = typename range_policy::member_type;
+
+  enum { rank = static_cast<int>(iteration_pattern::rank) };
+
+  using index_type  = typename traits::index_type;
+  using array_index_type = long;
+  using point_type  = Kokkos::Array<array_index_type,rank>; //was index_type
+  using tile_type   = Kokkos::Array<array_index_type,rank>;
+  // If point_type or tile_type is not templated on a signed integral type (i.e. it is
+  // unsigned), then passing an initializer_list of runtime-determined, non-const values
+  // of signed integral type produces a compiler error, because the implicit conversion
+  // is invalid:
+  // "conversion from integer or unscoped enumeration type to integer type that cannot
+  //  represent all values of the original, except where source is a constant expression
+  //  whose value can be stored exactly in the target type".
+  // The user must then either pass a matching index_type as a template parameter to the
+  // MDRangePolicy or static_cast the individual values (see the usage sketch after this
+  // class).
+
+  point_type m_lower;
+  point_type m_upper;
+  tile_type  m_tile;
+  point_type m_tile_end;
+  index_type m_num_tiles;
+  index_type m_prod_tile_dims;
+
+/*
+  // NDE enum impl definition alternative - replace static constexpr int ? 
+  enum { outer_direction = static_cast<int> (
+      (iteration_pattern::outer_direction != Iterate::Default)
+    ? iteration_pattern::outer_direction
+    : default_outer_direction< typename traits::execution_space>::value ) };
+
+  enum { inner_direction = static_cast<int> (
+      iteration_pattern::inner_direction != Iterate::Default
+    ? iteration_pattern::inner_direction
+    : default_inner_direction< typename traits::execution_space>::value ) };
+
+  enum { Right = static_cast<int>( Iterate::Right ) };
+  enum { Left  = static_cast<int>( Iterate::Left ) };
+*/
+  //static constexpr int rank = iteration_pattern::rank;
+
+  static constexpr int outer_direction = static_cast<int> (
+      (iteration_pattern::outer_direction != Iterate::Default)
+    ? iteration_pattern::outer_direction
+    : default_outer_direction< typename traits::execution_space>::value );
+
+  static constexpr int inner_direction = static_cast<int> (
+      iteration_pattern::inner_direction != Iterate::Default
+    ? iteration_pattern::inner_direction
+    : default_inner_direction< typename traits::execution_space>::value ) ;
+
+  // Ugly ugly workaround intel 14 not handling scoped enum correctly
+  static constexpr int Right = static_cast<int>( Iterate::Right );
+  static constexpr int Left  = static_cast<int>( Iterate::Left );
+
+  MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
+    : m_lower(lower)
+    , m_upper(upper)
+    , m_tile(tile)
+    , m_num_tiles(1)
+    , m_prod_tile_dims(1)
+  {
+    // Host
+    if ( true
+       #if defined(KOKKOS_ENABLE_CUDA)
+         && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
+       #endif
+       #if defined(KOKKOS_ENABLE_ROCM)
+         && !std::is_same< typename traits::execution_space, Kokkos::Experimental::ROCm >::value
+       #endif
+       )
+    {
+      index_type span;
+      for (int i=0; i<rank; ++i) {
+        span = upper[i] - lower[i];
+        if ( m_tile[i] <= 0 ) {
+          if (  ((int)inner_direction == (int)Right && (i < rank-1))
+              || ((int)inner_direction == (int)Left && (i > 0)) )
+          {
+            m_tile[i] = 2;
+          }
+          else {
+            m_tile[i] = (span == 0 ? 1 : span);
+          }
+        }
+        m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
+        m_num_tiles *= m_tile_end[i];
+        m_prod_tile_dims *= m_tile[i];
+      }
+    }
+    #if defined(KOKKOS_ENABLE_CUDA)
+    else // Cuda
+    {
+      index_type span;
+      int increment = 1;
+      int rank_start = 0;
+      int rank_end = rank;
+      if((int)inner_direction == (int)Right) {
+        increment = -1;
+        rank_start = rank-1;
+        rank_end = -1;
+      }
+      for (int i=rank_start; i!=rank_end; i+=increment) {
+        span = m_upper[i] - m_lower[i];
+        if ( m_tile[i] <= 0 ) {
+          // TODO: determine what is a good default tile size for cuda
+          // may be rank dependent
+          if (  ((int)inner_direction == (int)Right && (i < rank-1))
+              || ((int)inner_direction == (int)Left && (i > 0)) )
+          {
+            if ( m_prod_tile_dims < 256 ) {
+              m_tile[i] = 2;
+            } else {
+              m_tile[i] = 1;
+            }
+          }
+          else {
+            m_tile[i] = 16;
+          }
+        }
+        m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
+        m_num_tiles *= m_tile_end[i];
+        m_prod_tile_dims *= m_tile[i];
+      }
+      if ( m_prod_tile_dims > 1024 ) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 max per dim (Kepler), but product num_threads < 1024
+        printf(" Tile dimensions exceed Cuda limits\n");
+        Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
+        //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
+      }
+    }
+    #endif
+  }
+
+
+  template < typename LT , typename UT , typename TT = array_index_type >
+  MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
+  {
+
+    if(static_cast<int>(lower.size()) != rank || static_cast<int>(upper.size()) != rank)
+      Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
+
+    for ( auto i = 0; i < rank; ++i ) {
+      m_lower[i] = static_cast<array_index_type>(lower.begin()[i]);
+      m_upper[i] = static_cast<array_index_type>(upper.begin()[i]);
+      if(static_cast<int>(tile.size())==rank)
+        m_tile[i] = static_cast<array_index_type>(tile.begin()[i]);
+      else
+        m_tile[i] = 0;
+    }
+
+    m_num_tiles = 1;
+    m_prod_tile_dims = 1;
+
+    // Host
+    if ( true
+       #if defined(KOKKOS_ENABLE_CUDA)
+         && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
+       #endif
+       #if defined(KOKKOS_ENABLE_ROCM)
+         && !std::is_same< typename traits::execution_space, Kokkos::Experimental::ROCm >::value
+       #endif
+       )
+    {
+      index_type span;
+      for (int i=0; i<rank; ++i) {
+        span = m_upper[i] - m_lower[i];
+        if ( m_tile[i] <= 0 ) {
+          if (  ((int)inner_direction == (int)Right && (i < rank-1))
+              || ((int)inner_direction == (int)Left && (i > 0)) )
+          {
+            m_tile[i] = 2;
+          }
+          else {
+            m_tile[i] = (span == 0 ? 1 : span);
+          }
+        }
+        m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
+        m_num_tiles *= m_tile_end[i];
+        m_prod_tile_dims *= m_tile[i];
+      }
+    }
+    #if defined(KOKKOS_ENABLE_CUDA)
+    else // Cuda
+    {
+      index_type span;
+      int increment = 1;
+      int rank_start = 0;
+      int rank_end = rank;
+      if((int)inner_direction == (int)Right) {
+        increment = -1;
+        rank_start = rank-1;
+        rank_end = -1;
+      }
+      for (int i=rank_start; i!=rank_end; i+=increment) {
+        span = m_upper[i] - m_lower[i];
+        if ( m_tile[i] <= 0 ) {
+          // TODO: determine what is a good default tile size for cuda
+          // may be rank dependent
+          if (  ((int)inner_direction == (int)Right && (i < rank-1))
+              || ((int)inner_direction == (int)Left && (i > 0)) )
+          {
+            if ( m_prod_tile_dims < 256 ) {
+              m_tile[i] = 2;
+            } else {
+              m_tile[i] = 1;
+            }
+          }
+          else {
+            m_tile[i] = 16;
+          }
+        }
+        m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
+        m_num_tiles *= m_tile_end[i];
+        m_prod_tile_dims *= m_tile[i];
+      }
+      if ( m_prod_tile_dims > 1024 ) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 max per dim (Kepler), but product num_threads < 1024
+        printf(" Tile dimensions exceed Cuda limits\n");
+        Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
+        //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
+      }
+    }
+    #endif
+    #if defined(KOKKOS_ENABLE_ROCM)
+    else // ROCm
+    {
+      index_type span;
+      int increment = 1;
+      int rank_start = 0;
+      int rank_end = rank;
+      if((int)inner_direction == (int)Right) {
+        increment = -1;
+        rank_start = rank-1;
+        rank_end = -1;
+      }
+      for (int i=rank_start; i!=rank_end; i+=increment) {
+        span = m_upper[i] - m_lower[i];
+        if ( m_tile[i] <= 0 ) {
+          // TODO: determine what is a good default tile size for ROCm;
+          // may be rank dependent
+          if (  ((int)inner_direction == (int)Right && (i < rank-1))
+              || ((int)inner_direction == (int)Left && (i > 0)) )
+          {
+            if ( m_prod_tile_dims < 256 ) {
+              m_tile[i] = 2;
+            } else {
+              m_tile[i] = 1;
+            }
+          }
+          else {
+            m_tile[i] = 16;
+          }
+        }
+        m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
+        m_num_tiles *= m_tile_end[i];
+        m_prod_tile_dims *= m_tile[i];
+      }
+      if ( m_prod_tile_dims > 1024 ) { // Match ROCm restriction for ParallelReduce; 1024,1024,1024 max per dim , but product num_threads < 1024
+        printf(" Tile dimensions exceed ROCm limits\n");
+        Kokkos::abort(" ROCm ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
+        //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
+      }
+    }
+    #endif
+  }
+
+};
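+
+// A minimal usage sketch (not part of this header): a rank-2 policy iterating
+// [0,N0) x [0,N1) with 16x16 tiles, consumed by parallel_for; N0, N1, and the
+// lambda body are placeholders.
+//
+//   Kokkos::MDRangePolicy< Kokkos::Rank<2> > policy( {0,0}, {N0,N1}, {16,16} );
+//   Kokkos::parallel_for( policy, KOKKOS_LAMBDA( const int i, const int j ) {
+//     /* ... body over (i,j) ... */
+//   });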
+
+} // namespace Kokkos
+
+// For backward compatibility
+namespace Kokkos { namespace Experimental {
+  using Kokkos::MDRangePolicy;
+  using Kokkos::Rank;
+  using Kokkos::Iterate;
+} } // end Kokkos::Experimental
+// ------------------------------------------------------------------ //
+
+// ------------------------------------------------------------------ //
+// md_parallel_for - deprecated; use parallel_for instead
+// ------------------------------------------------------------------ //
+
+namespace Kokkos { namespace Experimental {
+
+template <typename MDRange, typename Functor, typename Enable = void>
+void md_parallel_for( MDRange const& range
+                    , Functor const& f
+                    , const std::string& str = ""
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      #if defined( KOKKOS_ENABLE_ROCM)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
+
+  using range_policy = typename MDRange::impl_range_policy;
+
+  Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
+}
+
+template <typename MDRange, typename Functor>
+void md_parallel_for( const std::string& str
+                    , MDRange const& range
+                    , Functor const& f
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      #if defined( KOKKOS_ENABLE_ROCM)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
+
+  using range_policy = typename MDRange::impl_range_policy;
+
+  Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
+}
+
+// Cuda specialization
+#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
+template <typename MDRange, typename Functor>
+void md_parallel_for( const std::string& str
+                    , MDRange const& range
+                    , Functor const& f
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
+  closure.execute();
+}
+
+template <typename MDRange, typename Functor>
+void md_parallel_for( MDRange const& range
+                    , Functor const& f
+                    , const std::string& str = ""
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
+  closure.execute();
+}
+#endif
+// ------------------------------------------------------------------ //
+
+// ------------------------------------------------------------------ //
+// md_parallel_reduce - deprecated; use parallel_reduce instead
+// ------------------------------------------------------------------ //
+template <typename MDRange, typename Functor, typename ValueType>
+void md_parallel_reduce( MDRange const& range
+                    , Functor const& f
+                    , ValueType & v
+                    , const std::string& str = ""
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      #if defined( KOKKOS_ENABLE_ROCM)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
+
+  using range_policy = typename MDRange::impl_range_policy;
+  Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
+}
+
+template <typename MDRange, typename Functor, typename ValueType>
+void md_parallel_reduce( const std::string& str
+                    , MDRange const& range
+                    , Functor const& f
+                    , ValueType & v
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      #if defined( KOKKOS_ENABLE_ROCM)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
+
+  using range_policy = typename MDRange::impl_range_policy;
+
+  Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
+}
+
+// Cuda - md_parallel_reduce not implemented - use parallel_reduce
+
+} } // namespace Kokkos::Experimental
+
+#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
+
diff --git a/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c34515899663a3ff009b3ca23f4ece0ed4e03441
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_AnonymousSpace.hpp
@@ -0,0 +1,127 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_ANONYMOUSSPACE_HPP
+#define KOKKOS_ANONYMOUSSPACE_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Concepts.hpp>
+#include <cstddef>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
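+//  AnonymousSpace is a placeholder memory space: views templated on it can be
+//  assigned from, and alias, views in any concrete memory space (see the
+//  MemorySpaceAccess specializations below, which declare it assignable and
+//  accessible in both directions).  The caller is responsible for ensuring the
+//  aliased memory is actually reachable from the executing space.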
+class AnonymousSpace {
+public:
+  //! Tag this class as a kokkos memory space
+  typedef AnonymousSpace  memory_space;
+  typedef Kokkos::DefaultExecutionSpace execution_space;
+  typedef size_t     size_type;
+
+  //! This memory space preferred device_type
+  typedef Kokkos::Device< execution_space, memory_space > device_type;
+
+  /**\brief  Default memory space instance */
+  AnonymousSpace() = default;
+  AnonymousSpace( AnonymousSpace && rhs ) = default;
+  AnonymousSpace( const AnonymousSpace & rhs ) = default;
+  AnonymousSpace & operator = ( AnonymousSpace && ) = default;
+  AnonymousSpace & operator = ( const AnonymousSpace & ) = default;
+  ~AnonymousSpace() = default;
+
+  /**\brief Return Name of the MemorySpace */
+  static constexpr const char* name() { return "Anonymous"; }
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+template<typename OtherSpace>
+struct MemorySpaceAccess< Kokkos::AnonymousSpace , OtherSpace > {
+  enum { assignable = true };
+  enum { accessible = true };
+  enum { deepcopy   = true };
+};
+
+template<typename OtherSpace>
+struct MemorySpaceAccess<  OtherSpace, Kokkos::AnonymousSpace > {
+  enum { assignable = true };
+  enum { accessible = true };
+  enum { deepcopy   = true };
+};
+
+template<>
+struct MemorySpaceAccess<  Kokkos::AnonymousSpace, Kokkos::AnonymousSpace > {
+  enum { assignable = true };
+  enum { accessible = true };
+  enum { deepcopy   = true };
+};
+
+template<typename OtherSpace>
+struct VerifyExecutionCanAccessMemorySpace<OtherSpace, Kokkos::AnonymousSpace>
+{
+  enum {value = 1};
+  KOKKOS_INLINE_FUNCTION static void verify(void) {}
+  KOKKOS_INLINE_FUNCTION static void verify(const void *) {}
+};
+
+template<typename OtherSpace>
+struct VerifyExecutionCanAccessMemorySpace<Kokkos::AnonymousSpace, OtherSpace>
+{
+  enum {value = 1};
+  KOKKOS_INLINE_FUNCTION static void verify(void) {}
+  KOKKOS_INLINE_FUNCTION static void verify(const void *) {}
+};
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+#endif // #ifndef KOKKOS_ANONYMOUSSPACE_HPP
+
diff --git a/packages/kokkos/core/src/Kokkos_Array.hpp b/packages/kokkos/core/src/Kokkos_Array.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2708c4ffdb765982757d3c83eefc3528cc09f0ac
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Array.hpp
@@ -0,0 +1,315 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_ARRAY_HPP
+#define KOKKOS_ARRAY_HPP
+
+#include <type_traits>
+#include <algorithm>
+#include <limits>
+#include <cstddef>
+
+namespace Kokkos {
+
+/**\brief  Derived from the C++17 'std::array'.
+ *         Dropping the iterator interface.
+ */
+template< class T      = void
+        , size_t N     = ~size_t(0)
+        , class Proxy  = void
+        >
+struct Array {
+public:
+  /**
+   * The elements of this C array shall not be accessed directly. The data
+   * member has to be declared public to enable aggregate initialization as for
+   * std::array. We mark it as private in the documentation.
+   * @private
+   */
+  T m_internal_implementation_private_member_data[N];
+public:
+
+  typedef T &                                 reference ;
+  typedef typename std::add_const<T>::type &  const_reference ;
+  typedef size_t                              size_type ;
+  typedef ptrdiff_t                           difference_type ;
+  typedef T                                   value_type ;
+  typedef T *                                 pointer ;
+  typedef typename std::add_const<T>::type *  const_pointer ;
+
+  KOKKOS_INLINE_FUNCTION static constexpr size_type size() { return N ; }
+  KOKKOS_INLINE_FUNCTION static constexpr bool      empty(){ return false ; }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  reference operator[]( const iType & i )
+    {
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
+      return m_internal_implementation_private_member_data[i];
+    }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  const_reference operator[]( const iType & i ) const
+    {
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
+      return m_internal_implementation_private_member_data[i];
+    }
+
+  KOKKOS_INLINE_FUNCTION pointer       data()
+    {
+      return & m_internal_implementation_private_member_data[0];
+    }
+  KOKKOS_INLINE_FUNCTION const_pointer data() const
+    {
+      return & m_internal_implementation_private_member_data[0];
+    }
+
+  // Do not default unless move and move-assignment are also defined
+  // ~Array() = default ;
+  // Array() = default ;
+  // Array( const Array & ) = default ;
+  // Array & operator = ( const Array & ) = default ;
+
+  // Some supported compilers are not sufficiently C++11 compliant
+  // for default move constructor and move assignment operator.
+  // Array( Array && ) = default ;
+  // Array & operator = ( Array && ) = default ;
+};
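+
+// Illustrative usage sketch (not part of the interface): as with std::array,
+// aggregate initialization and indexed access are supported, and Array is
+// usable inside device code.
+//
+//   Kokkos::Array<double, 3> a = {{ 1.0, 2.0, 3.0 }};
+//   double sum = 0.0;
+//   for (size_t i = 0; i < a.size(); ++i) sum += a[i];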
+
+
+template< class T , class Proxy >
+struct Array<T,0,Proxy> {
+public:
+
+  typedef typename std::add_const<T>::type &  reference ;
+  typedef typename std::add_const<T>::type &  const_reference ;
+  typedef size_t                              size_type ;
+  typedef ptrdiff_t                           difference_type ;
+  typedef typename std::add_const<T>::type    value_type ;
+  typedef typename std::add_const<T>::type *  pointer ;
+  typedef typename std::add_const<T>::type *  const_pointer ;
+
+  KOKKOS_INLINE_FUNCTION static constexpr size_type size()  { return 0 ; }
+  KOKKOS_INLINE_FUNCTION static constexpr bool      empty() { return true ; }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  value_type operator[]( const iType & )
+    {
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integer argument" );
+      return value_type();
+    }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  value_type operator[]( const iType & ) const
+    {
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integer argument" );
+      return value_type();
+    }
+
+  KOKKOS_INLINE_FUNCTION pointer       data()       { return pointer(0) ; }
+  KOKKOS_INLINE_FUNCTION const_pointer data() const { return const_pointer(0); }
+
+  KOKKOS_FUNCTION_DEFAULTED ~Array() = default ;
+  KOKKOS_FUNCTION_DEFAULTED Array() = default ;
+  KOKKOS_FUNCTION_DEFAULTED Array( const Array & ) = default ;
+  KOKKOS_FUNCTION_DEFAULTED Array & operator = ( const Array & ) = default ;
+
+  // Some supported compilers are not sufficiently C++11 compliant
+  // for default move constructor and move assignment operator.
+  // Array( Array && ) = default ;
+  // Array & operator = ( Array && ) = default ;
+};
+
+
+template<>
+struct Array<void,~size_t(0),void>
+{
+  struct contiguous {};
+  struct strided {};
+};
+
+template< class T >
+struct Array< T , ~size_t(0) , Array<>::contiguous >
+{
+private:
+  T *    m_elem ;
+  size_t m_size ;
+public:
+
+  typedef T &                                 reference ;
+  typedef typename std::add_const<T>::type &  const_reference ;
+  typedef size_t                              size_type ;
+  typedef ptrdiff_t                           difference_type ;
+  typedef T                                   value_type ;
+  typedef T *                                 pointer ;
+  typedef typename std::add_const<T>::type *  const_pointer ;
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type size()  const { return m_size ; }
+  KOKKOS_INLINE_FUNCTION constexpr bool      empty() const { return 0 == m_size ; }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  reference operator[]( const iType & i )
+    {
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
+      return m_elem[i];
+    }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  const_reference operator[]( const iType & i ) const
+    {
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
+      return m_elem[i];
+    }
+
+  KOKKOS_INLINE_FUNCTION pointer       data()       { return m_elem ; }
+  KOKKOS_INLINE_FUNCTION const_pointer data() const { return m_elem ; }
+
+  KOKKOS_FUNCTION_DEFAULTED ~Array() = default ;
+  Array() = delete ;
+  Array( const Array & rhs ) = delete ;
+
+  // Some supported compilers are not sufficiently C++11 compliant
+  // for default move constructor and move assignment operator.
+  // Array( Array && rhs ) = default ;
+  // Array & operator = ( Array && rhs ) = delete ;
+
+  KOKKOS_INLINE_FUNCTION
+  Array & operator = ( const Array & rhs )
+    {
+      const size_t n = std::min( m_size , rhs.size() );
+      for ( size_t i = 0 ; i < n ; ++i ) m_elem[i] = rhs[i] ;
+      return *this ;
+    }
+
+  template< size_t N , class P >
+  KOKKOS_INLINE_FUNCTION
+  Array & operator = ( const Array<T,N,P> & rhs )
+    {
+      const size_t n = std::min( m_size , rhs.size() );
+      for ( size_t i = 0 ; i < n ; ++i ) m_elem[i] = rhs[i] ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr Array( pointer arg_ptr , size_type arg_size , size_type = 0 )
+    : m_elem(arg_ptr), m_size(arg_size) {}
+};
+
+template< class T >
+struct Array< T , ~size_t(0) , Array<>::strided >
+{
+private:
+  T *    m_elem ;
+  size_t m_size ;
+  size_t m_stride ;
+public:
+
+  typedef T &                                 reference ;
+  typedef typename std::add_const<T>::type &  const_reference ;
+  typedef size_t                              size_type ;
+  typedef ptrdiff_t                           difference_type ;
+  typedef T                                   value_type ;
+  typedef T *                                 pointer ;
+  typedef typename std::add_const<T>::type *  const_pointer ;
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type size()  const { return m_size ; }
+  KOKKOS_INLINE_FUNCTION constexpr bool      empty() const { return 0 == m_size ; }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  reference operator[]( const iType & i )
+    {
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
+      return m_elem[i*m_stride];
+    }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  const_reference operator[]( const iType & i ) const
+    {
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
+      return m_elem[i*m_stride];
+    }
+
+  KOKKOS_INLINE_FUNCTION pointer       data()       { return m_elem ; }
+  KOKKOS_INLINE_FUNCTION const_pointer data() const { return m_elem ; }
+
+  KOKKOS_FUNCTION_DEFAULTED ~Array() = default ;
+  Array()  = delete ;
+  Array( const Array & ) = delete ;
+
+
+  // Some supported compilers are not sufficiently C++11 compliant
+  // for default move constructor and move assignment operator.
+  // Array( Array && rhs ) = default ;
+  // Array & operator = ( Array && rhs ) = delete ;
+
+  KOKKOS_INLINE_FUNCTION
+  Array & operator = ( const Array & rhs )
+    {
+      const size_t n = std::min( m_size , rhs.size() );
+      for ( size_t i = 0 ; i < n ; ++i ) m_elem[i] = rhs[i] ;
+      return *this ;
+    }
+
+  template< size_t N , class P >
+  KOKKOS_INLINE_FUNCTION
+  Array & operator = ( const Array<T,N,P> & rhs )
+    {
+      const size_t n = std::min( m_size , rhs.size() );
+      for ( size_t i = 0 ; i < n ; ++i ) m_elem[i] = rhs[i] ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr Array( pointer arg_ptr , size_type arg_size , size_type arg_stride )
+    : m_elem(arg_ptr), m_size(arg_size), m_stride(arg_stride) {}
+};
+
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_ARRAY_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_Atomic.hpp b/packages/kokkos/core/src/Kokkos_Atomic.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b9f26ce228f01a3361f19be3698ac81440a13d39
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Atomic.hpp
@@ -0,0 +1,308 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Atomic.hpp
+/// \brief Atomic functions
+///
+/// This header file defines prototypes for the following atomic functions:
+///   - exchange
+///   - compare and exchange
+///   - add
+///
+/// Supported types include:
+///   - signed and unsigned 4 and 8 byte integers
+///   - float
+///   - double
+///
+/// They are implemented through GCC compatible intrinsics, OpenMP
+/// directives and native CUDA intrinsics.
+///
+/// Including this header file requires one of the following
+/// compilers:
+///   - NVCC (for CUDA device code only)
+///   - GCC (for host code only)
+///   - Intel (for host code only)
+///   - A compiler that supports OpenMP 3.1 (for host code only)
+
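+// Illustrative usage sketch only (assumes Kokkos::View objects named
+// "histogram" and "key", and an initialized execution space); the atomic
+// update is safe under concurrent execution:
+//
+//   Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {
+//     Kokkos::atomic_add(&histogram(key(i)), 1);
+//   });
+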
+#ifndef KOKKOS_ATOMIC_HPP
+#define KOKKOS_ATOMIC_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <impl/Kokkos_Traits.hpp>
+
+//----------------------------------------------------------------------------
+#if defined(_WIN32)
+#define KOKKOS_ENABLE_WINDOWS_ATOMICS
+#else
+#if defined( KOKKOS_ENABLE_CUDA )
+
+// Compiling NVIDIA device code, must use Cuda atomics:
+
+#define KOKKOS_ENABLE_CUDA_ATOMICS
+
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU)
+
+#define KOKKOS_ENABLE_ROCM_ATOMICS
+
+#endif
+
+#if ! defined( KOKKOS_ENABLE_GNU_ATOMICS ) && \
+    ! defined( KOKKOS_ENABLE_INTEL_ATOMICS ) && \
+    ! defined( KOKKOS_ENABLE_OPENMP_ATOMICS ) && \
+    ! defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+
+// A non-Cuda atomic implementation has not been pre-selected.
+// Choose the best implementation for the detected compiler.
+// Preference: GCC, INTEL, OMP31
+
+#if defined( KOKKOS_INTERNAL_NOT_PARALLEL )
+
+#define KOKKOS_ENABLE_SERIAL_ATOMICS
+
+#elif defined( KOKKOS_COMPILER_GNU ) || \
+    defined( KOKKOS_COMPILER_CLANG ) || \
+    ( defined ( KOKKOS_COMPILER_NVCC ) )
+
+#define KOKKOS_ENABLE_GNU_ATOMICS
+
+#elif defined( KOKKOS_COMPILER_INTEL ) || \
+      defined( KOKKOS_COMPILER_CRAYC )
+
+#define KOKKOS_ENABLE_INTEL_ATOMICS
+
+#elif defined( _OPENMP ) && ( 201107 <= _OPENMP )
+
+#define KOKKOS_ENABLE_OPENMP_ATOMICS
+
+#else
+
+#error "KOKKOS_ATOMICS_USE : Unsupported compiler"
+
+#endif
+
+#endif /* Not pre-selected atomic implementation */
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
+#endif
+
+namespace Kokkos {
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_add(volatile T * const dest, const T src);
+
+// Atomic increment
+template<typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment(volatile T* a);
+
+template<typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement(volatile T* a);
+}
+
+namespace Kokkos {
+
+
+inline
+const char * atomic_query_version()
+{
+#if defined( KOKKOS_ENABLE_CUDA_ATOMICS )
+  return "KOKKOS_ENABLE_CUDA_ATOMICS" ;
+#elif defined( KOKKOS_ENABLE_GNU_ATOMICS )
+  return "KOKKOS_ENABLE_GNU_ATOMICS" ;
+#elif defined( KOKKOS_ENABLE_INTEL_ATOMICS )
+  return "KOKKOS_ENABLE_INTEL_ATOMICS" ;
+#elif defined( KOKKOS_ENABLE_OPENMP_ATOMICS )
+  return "KOKKOS_ENABLE_OPENMP_ATOMICS" ;
+#elif defined( KOKKOS_ENABLE_WINDOWS_ATOMICS )
+  return "KOKKOS_ENABLE_WINDOWS_ATOMICS";
+#elif defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+  return "KOKKOS_ENABLE_SERIAL_ATOMICS";
+#else
+#error "No valid response for atomic_query_version!"
+#endif
+}
+
+} // namespace Kokkos
+
+#if defined( KOKKOS_ENABLE_ROCM )
+#include <ROCm/Kokkos_ROCm_Atomic.hpp>
+namespace Kokkos {
+namespace Impl {
+extern KOKKOS_INLINE_FUNCTION
+bool lock_address_rocm_space(void* ptr);
+
+extern KOKKOS_INLINE_FUNCTION
+void unlock_address_rocm_space(void* ptr);
+}
+}
+#endif
+
+#ifdef _WIN32
+#include "impl/Kokkos_Atomic_Windows.hpp"
+#else
+
+//----------------------------------------------------------------------------
+// Atomic Assembly
+//
+// Implements CAS128-bit in assembly
+
+#include "impl/Kokkos_Atomic_Assembly.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic exchange
+//
+// template< typename T >
+// T atomic_exchange( volatile T* const dest , const T val )
+// { T tmp = *dest ; *dest = val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Exchange.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic compare-and-exchange
+//
+// template<class T>
+// bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
+// { bool equal = compare == *dest ; if ( equal ) { *dest = val ; } return equal ; }
+
+#include "impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and add
+//
+// template<class T>
+// T atomic_fetch_add(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest += val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_Add.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic increment
+//
+// template<class T>
+// void atomic_increment(volatile T* const dest)
+// { ++(*dest); }
+
+#include "impl/Kokkos_Atomic_Increment.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic Decrement
+//
+// template<class T>
+// void atomic_decrement(volatile T* const dest)
+// { --(*dest); }
+
+#include "impl/Kokkos_Atomic_Decrement.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and sub
+//
+// template<class T>
+// T atomic_fetch_sub(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest -= val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_Sub.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and or
+//
+// template<class T>
+// T atomic_fetch_or(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest = tmp | val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_Or.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and and
+//
+// template<class T>
+// T atomic_fetch_and(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest = tmp & val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_And.hpp"
+#endif /*Not _WIN32*/
+
+//----------------------------------------------------------------------------
+// Memory fence
+//
+// All loads and stores from this thread will be globally consistent before continuing
+//
+// void memory_fence() {...};
+#include "impl/Kokkos_Memory_Fence.hpp"
+
+//----------------------------------------------------------------------------
+// Provide volatile_load and safe_load
+//
+// T volatile_load(T const volatile * const ptr);
+//
+// T const& safe_load(T const * const ptr);
+// XEON PHI
+// T safe_load(T const * const ptr);
+
+#include "impl/Kokkos_Volatile_Load.hpp"
+
+#ifndef _WIN32
+#include "impl/Kokkos_Atomic_Generic.hpp"
+#endif
+//----------------------------------------------------------------------------
+// This atomic-style macro should be an inlined function, not a macro
+
+#if defined( KOKKOS_COMPILER_GNU ) && !defined(__PGIC__) && !defined(__CUDA_ARCH__)
+
+  #define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr,0,0)
+  #define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr,1,0)
+
+#else
+
+  #define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) ((void)0)
+  #define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0)
+
+#endif
+
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_ATOMIC_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_Complex.hpp b/packages/kokkos/core/src/Kokkos_Complex.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7ca85a304170174ce950b58894b36e749202f4e6
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Complex.hpp
@@ -0,0 +1,828 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_COMPLEX_HPP
+#define KOKKOS_COMPLEX_HPP
+
+#include <Kokkos_Atomic.hpp>
+#include <Kokkos_NumericTraits.hpp>
+#include <complex>
+#include <iostream>
+
+namespace Kokkos {
+
+/// \class complex
+/// \brief Partial reimplementation of std::complex that works as the
+///   result of a Kokkos::parallel_reduce.
+/// \tparam RealType The type of the real and imaginary parts of the
+///   complex number.  As with std::complex, this is only defined for
+///   \c float, \c double, and <tt>long double</tt>.  The latter is
+///   currently forbidden in CUDA device kernels.
+template<class RealType>
+class complex {
+private:
+  RealType re_, im_;
+
+public:
+  //! The type of the real or imaginary parts of this complex number.
+  typedef RealType value_type;
+
+  //! Default constructor (initializes both real and imaginary parts to zero).
+  KOKKOS_INLINE_FUNCTION complex () :
+    re_ (0.0), im_ (0.0)
+  {}
+
+  //! Copy constructor.
+  KOKKOS_INLINE_FUNCTION complex (const complex<RealType>& src) :
+    re_ (src.re_), im_ (src.im_)
+  {}
+
+  //! Copy constructor from volatile.
+  KOKKOS_INLINE_FUNCTION complex (const volatile complex<RealType>& src) :
+    re_ (src.re_), im_ (src.im_)
+  {}
+
+  /// \brief Conversion constructor from std::complex.
+  ///
+  /// This constructor cannot be called in a CUDA device function,
+  /// because std::complex's methods and nonmember functions are not
+  /// marked as CUDA device functions.
+  template<class InputRealType>
+  complex (const std::complex<InputRealType>& src) :
+    re_ (std::real (src)), im_ (std::imag (src))
+  {}
+
+  /// \brief Conversion operator to std::complex.
+  ///
+  /// This operator cannot be called in a CUDA device function,
+  /// because std::complex's methods and nonmember functions are not
+  /// marked as CUDA device functions.
+  operator std::complex<RealType> () const {
+    return std::complex<RealType> (re_, im_);
+  }
+
+  /// \brief Constructor that takes just the real part, and sets the
+  ///   imaginary part to zero.
+  template<class InputRealType>
+  KOKKOS_INLINE_FUNCTION complex (const InputRealType& val) :
+    re_ (val), im_ (static_cast<InputRealType>(0.0))
+  {}
+
+  // BUG HCC WORKAROUND
+  KOKKOS_INLINE_FUNCTION complex( const RealType& re, const RealType& im):
+    re_ (re), im_ (im)
+  {}
+ 
+  //! Constructor that takes the real and imaginary parts.
+  template<class RealType1, class RealType2>
+  KOKKOS_INLINE_FUNCTION complex (const RealType1& re, const RealType2& im) :
+    re_ (re), im_ (im)
+  {}
+
+  //! Assignment operator.
+  template<class InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>& operator= (const complex<InputRealType>& src) {
+    re_ = src.re_;
+    im_ = src.im_;
+    return *this;
+  }
+
+  /// \brief Assignment operator, for volatile <tt>*this</tt> and
+  ///   nonvolatile input.
+  ///
+  /// \param src [in] Input; right-hand side of the assignment.
+  ///
+  /// This operator returns \c void instead of <tt>volatile
+  /// complex<RealType>& </tt>.  See Kokkos Issue #177 for the
+  /// explanation.  In practice, this means that you should not chain
+  /// assignments with volatile lvalues.
+  template<class InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  void operator= (const complex<InputRealType>& src) volatile {
+    re_ = src.re_;
+    im_ = src.im_;
+    // We deliberately do not return anything here.  See explanation
+    // in public documentation above.
+  }
+
+  //! Assignment operator.
+  template<class InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  volatile complex<RealType>& operator= (const volatile complex<InputRealType>& src) volatile {
+    re_ = src.re_;
+    im_ = src.im_;
+    return *this;
+  }
+
+  //! Assignment operator.
+  template<class InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>& operator= (const volatile complex<InputRealType>& src) {
+    re_ = src.re_;
+    im_ = src.im_;
+    return *this;
+  }
+
+  //! Assignment operator (from a real number).
+  template<class InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>& operator= (const InputRealType& val) {
+    re_ = val;
+    im_ = static_cast<RealType> (0.0);
+    return *this;
+  }
+
+  //! Assignment operator (from a real number).
+  template<class InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  void operator= (const InputRealType& val) volatile {
+    re_ = val;
+    im_ = static_cast<RealType> (0.0);
+  }
+
+  /// \brief Assignment operator from std::complex.
+  ///
+  /// This constructor cannot be called in a CUDA device function,
+  /// because std::complex's methods and nonmember functions are not
+  /// marked as CUDA device functions.
+  template<class InputRealType>
+  complex<RealType>& operator= (const std::complex<InputRealType>& src) {
+    re_ = std::real (src);
+    im_ = std::imag (src);
+    return *this;
+  }
+
+  //! The imaginary part of this complex number.
+  KOKKOS_INLINE_FUNCTION RealType& imag () {
+    return im_;
+  }
+
+  //! The real part of this complex number.
+  KOKKOS_INLINE_FUNCTION RealType& real () {
+    return re_;
+  }
+
+  //! The imaginary part of this complex number.
+  KOKKOS_INLINE_FUNCTION const RealType imag () const {
+    return im_;
+  }
+
+  //! The real part of this complex number.
+  KOKKOS_INLINE_FUNCTION const RealType real () const {
+    return re_;
+  }
+
+  //! The imaginary part of this complex number (volatile overload).
+  KOKKOS_INLINE_FUNCTION volatile RealType& imag () volatile {
+    return im_;
+  }
+
+  //! The real part of this complex number (volatile overload).
+  KOKKOS_INLINE_FUNCTION volatile RealType& real () volatile {
+    return re_;
+  }
+
+  //! The imaginary part of this complex number (volatile overload).
+  KOKKOS_INLINE_FUNCTION const RealType imag () const volatile {
+    return im_;
+  }
+
+  //! The real part of this complex number (volatile overload).
+  KOKKOS_INLINE_FUNCTION const RealType real () const volatile {
+    return re_;
+  }
+
+  //! Set the imaginary part of this complex number.
+  KOKKOS_INLINE_FUNCTION void imag (RealType v) {
+    im_ = v;
+  }
+
+  //! Set the real part of this complex number.
+  KOKKOS_INLINE_FUNCTION void real (RealType v) {
+    re_ = v;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator += (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+    re_ += src.re_;
+    im_ += src.im_;
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  void
+  operator += (const volatile complex<InputRealType>& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+    re_ += src.re_;
+    im_ += src.im_;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator += (const std::complex<RealType>& src) {
+    re_ += src.real();
+    im_ += src.imag();
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator += (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+    re_ += src;
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  void
+  operator += (const volatile InputRealType& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+    re_ += src;
+  }
+  
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator -= (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+    re_ -= src.re_;
+    im_ -= src.im_;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator -= (const std::complex<RealType>& src) {
+    re_ -= src.real();
+    im_ -= src.imag();
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator -= (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+    re_ -= src;
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator *= (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+    const RealType realPart = re_ * src.re_ - im_ * src.im_;
+    const RealType imagPart = re_ * src.im_ + im_ * src.re_;
+    re_ = realPart;
+    im_ = imagPart;
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  void
+  operator *= (const volatile complex<InputRealType>& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+    const RealType realPart = re_ * src.re_ - im_ * src.im_;
+    const RealType imagPart = re_ * src.im_ + im_ * src.re_;
+    re_ = realPart;
+    im_ = imagPart;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator *= (const std::complex<RealType>& src) {
+    const RealType realPart = re_ * src.real() - im_ * src.imag();
+    const RealType imagPart = re_ * src.imag() + im_ * src.real();
+    re_ = realPart;
+    im_ = imagPart;
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator *= (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+    re_ *= src;
+    im_ *= src;
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  void
+  operator *= (const volatile InputRealType& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+    re_ *= src;
+    im_ *= src;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator /= (const complex<InputRealType>& y) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
+    // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
+    // If the real part is +/-Inf and the imaginary part is -/+Inf,
+    // this won't change the result.
+    const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ());
+
+    // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
+    // In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
+    // because y/s is NaN.
+    if (s == 0.0) {
+      this->re_ /= s;
+      this->im_ /= s;
+    }
+    else {
+      const complex<RealType> x_scaled (this->re_ / s, this->im_ / s);
+      const complex<RealType> y_conj_scaled (y.real () / s, -(y.imag ()) / s);
+      const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ +
+        y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y))
+      *this = x_scaled * y_conj_scaled;
+      *this /= y_scaled_abs;
+    }
+    return *this;
+  }
+  
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator /= (const std::complex<RealType>& y) {
+
+    // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
+    // If the real part is +/-Inf and the imaginary part is -/+Inf,
+    // this won't change the result.
+    const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ());
+
+    // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
+    // In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
+    // because y/s is NaN.
+    if (s == 0.0) {
+      this->re_ /= s;
+      this->im_ /= s;
+    }
+    else {
+      const complex<RealType> x_scaled (this->re_ / s, this->im_ / s);
+      const complex<RealType> y_conj_scaled (y.real () / s, -(y.imag ()) / s);
+      const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ +
+        y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y))
+      *this = x_scaled * y_conj_scaled;
+      *this /= y_scaled_abs;
+    }
+    return *this;
+  }
+
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator /= (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
+    re_ /= src;
+    im_ /= src;
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  bool
+  operator == (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
+    return (re_ == static_cast<RealType>(src.re_)) && (im_ == static_cast<RealType>(src.im_));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool
+  operator == (const std::complex<RealType>& src) {
+    return (re_ == src.real()) && (im_ == src.imag());
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  bool
+  operator == (const InputRealType src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
+    return (re_ == static_cast<RealType>(src)) && (im_ == RealType(0));
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  bool
+  operator != (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
+    return (re_ != static_cast<RealType>(src.re_)) || (im_ != static_cast<RealType>(src.im_));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool
+  operator != (const std::complex<RealType>& src) {
+    return (re_ != src.real()) || (im_ != src.imag());
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  bool
+  operator != (const InputRealType src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
+    return (re_ != static_cast<RealType>(src)) || (im_ != RealType(0));
+  }
+  
+};
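+
+// Illustrative usage sketch (assumes an initialized execution space and a
+// View<Kokkos::complex<double>*> named "z"); unlike std::complex, this type
+// is usable as a parallel_reduce result:
+//
+//   Kokkos::complex<double> total;
+//   Kokkos::parallel_reduce(n,
+//     KOKKOS_LAMBDA(const int i, Kokkos::complex<double>& acc) { acc += z(i); },
+//     total);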
+
+//! Binary + operator for two complex numbers.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator + (const complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type > (x.real () + y.real (), x.imag () + y.imag ());
+}
+
+//! Binary + operator for complex scalar.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator + (const complex<RealType1>& x, const RealType2& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () + y , x.imag ());
+}
+
+//! Binary + operator for scalar complex.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator + (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x + y.real (), y.imag ());
+}
+
+//! Unary + operator for complex.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType>
+operator + (const complex<RealType>& x) {
+  return x;
+}
+
+//! Binary - operator for complex.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator - (const complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y.real (), x.imag () - y.imag ());
+}
+
+//! Binary - operator for complex scalar.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator - (const complex<RealType1>& x, const RealType2& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y , x.imag ());
+}
+
+//! Binary - operator for scalar complex.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator - (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x - y.real (), - y.imag ());
+}
+
+//! Unary - operator for complex.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType>
+operator - (const complex<RealType>& x) {
+  return complex<RealType> (-x.real (), -x.imag ());
+}
+
+//! Binary * operator for complex.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
+                                                                        x.real () * y.imag () + x.imag () * y.real ());
+}
+
+/// \brief Binary * operator for std::complex and complex.
+///
+/// This function exists because GCC 4.7.2 (and perhaps other
+/// compilers) are not able to deduce that they can multiply
+/// std::complex by Kokkos::complex, by first converting std::complex
+/// to Kokkos::complex.
+///
+/// This function cannot be called in a CUDA device function, because
+/// std::complex's methods and nonmember functions are not marked as
+/// CUDA device functions.
+template<class RealType1, class RealType2>
+inline
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const std::complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
+                                                                        x.real () * y.imag () + x.imag () * y.real ());
+}
+
+/// \brief Binary * operator for RealType times complex.
+///
+/// This function exists because the compiler doesn't know that
+/// RealType and complex<RealType> commute with respect to operator*.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
+}
+
+/// \brief Binary * operator for RealType times complex.
+///
+/// This function exists because the compiler doesn't know that
+/// RealType and complex<RealType> commute with respect to operator*.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const complex<RealType1>& y, const RealType2& x) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
+}
+
+//! Imaginary part of a complex number.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+RealType imag (const complex<RealType>& x) {
+  return x.imag ();
+}
+
+//! Real part of a complex number.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+RealType real (const complex<RealType>& x) {
+  return x.real ();
+}
+
+//! Absolute value (magnitude) of a complex number.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+RealType abs (const complex<RealType>& x) {
+  // FIXME (mfh 31 Oct 2014) Scale to avoid unwarranted overflow.
+  return std::sqrt (real (x) * real (x) + imag (x) * imag (x));
+}
+
+//! Power of a complex number
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+Kokkos::complex<RealType> pow (const complex<RealType>& x, const RealType& e) {
+  RealType r = abs(x);
+  RealType phi = std::atan(x.imag()/x.real());
+  return std::pow(r,e) * Kokkos::complex<RealType>(std::cos(phi*e),std::sin(phi*e)); 
+}
+
+//! Square root of a complex number.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+Kokkos::complex<RealType> sqrt (const complex<RealType>& x) {
+  RealType r = abs(x);
+  RealType phi = std::atan(x.imag()/x.real());
+  return std::sqrt(r) * Kokkos::complex<RealType>(std::cos(phi*0.5),std::sin(phi*0.5));
+}
+
+//! Conjugate of a complex number.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType> conj (const complex<RealType>& x) {
+  return complex<RealType> (real (x), -imag (x));
+}
+
+//! Exponential of a complex number.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType> exp (const complex<RealType>& x) {
+  return std::exp(x.real()) * complex<RealType> (std::cos (x.imag()),  std::sin(x.imag()));
+}
+
+//! Exponential of a complex number.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType> pow (const complex<RealType>& x) {
+  return std::exp(x.real()) * complex<RealType> (std::cos (x.imag()),  std::sin(x.imag()));
+}
+
+//! Binary operator / for complex and real numbers
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator / (const complex<RealType1>& x, const RealType2& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (real (x) / y, imag (x) / y);
+}
+
+//! Binary operator / for complex.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator / (const complex<RealType1>& x, const complex<RealType2>& y) {
+  // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
+  // If the real part is +/-Inf and the imaginary part is -/+Inf,
+  // this won't change the result.
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  const common_real_type s = std::fabs (real (y)) + std::fabs (imag (y));
+
+  // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
+  // In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
+  // because y/s is NaN.
+  if (s == 0.0) {
+    return complex<common_real_type> (real (x) / s, imag (x) / s);
+  }
+  else {
+    const complex<common_real_type> x_scaled (real (x) / s, imag (x) / s);
+    const complex<common_real_type> y_conj_scaled (real (y) / s, -imag (y) / s);
+    const RealType1 y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
+      imag (y_conj_scaled) * imag (y_conj_scaled); // abs(y) == abs(conj(y))
+    complex<common_real_type> result = x_scaled * y_conj_scaled;
+    result /= y_scaled_abs;
+    return result;
+  }
+}
+
+//! Binary operator / for complex and real numbers
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator / (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x)/y;
+}
+
+//! Equality operator for two complex numbers.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+bool
+operator == (const complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(real (y)) && 
+           static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(imag (y)) );
+}
+
+/// \brief Equality operator for std::complex and Kokkos::complex.
+///
+/// This cannot be a device function, since std::real is not.
+/// Otherwise, CUDA builds will give compiler warnings ("warning:
+/// calling a constexpr __host__ function("real") from a __host__
+/// __device__ function("operator==") is not allowed").
+template<class RealType1, class RealType2>
+inline
+bool
+operator == (const std::complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(std::real (x)) == static_cast<common_real_type>(real (y)) && 
+           static_cast<common_real_type>(std::imag (x)) == static_cast<common_real_type>(imag (y)) );
+}
+  
+//! Equality operator for complex and real number.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+bool
+operator == (const complex<RealType1>& x, const RealType2& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(y) && 
+           static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(0.0) );
+}
+
+//! Equality operator for real and complex number.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+bool
+operator == (const RealType1& x, const complex<RealType2>& y) {
+  return y == x;
+}
+
+//! Inequality operator for two complex numbers.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+bool
+operator != (const complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(real (y)) || 
+           static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(imag (y)) );
+}
+
+//! Inequality operator for std::complex and Kokkos::complex.
+template<class RealType1, class RealType2>
+inline
+bool
+operator != (const std::complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(std::real (x)) != static_cast<common_real_type>(real (y)) || 
+           static_cast<common_real_type>(std::imag (x)) != static_cast<common_real_type>(imag (y)) );
+}
+
+//! Inequality operator for complex and real number.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+bool
+operator != (const complex<RealType1>& x, const RealType2& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(y) || 
+           static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(0.0) );
+}
+
+//! Inequality operator for real and complex number.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+bool
+operator != (const RealType1& x, const complex<RealType2>& y) {
+  return y != x;
+}
+
+template<class RealType>
+std::ostream& operator << (std::ostream& os, const complex<RealType>& x) {
+  const std::complex<RealType> x_std (Kokkos::real (x), Kokkos::imag (x));
+  os << x_std;
+  return os;
+}
+
+template<class RealType>
+std::istream& operator >> (std::istream& is, complex<RealType>& x) {
+  std::complex<RealType> x_std;
+  is >> x_std;
+  x = x_std; // only assigns on success of above
+  return is;
+}
+
+
+template<class T>
+struct reduction_identity<Kokkos::complex<T> > {
+  typedef reduction_identity<T> t_red_ident;
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static Kokkos::complex<T> sum()
+      {return Kokkos::complex<T>(t_red_ident::sum(),t_red_ident::sum());}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static Kokkos::complex<T> prod()
+      {return Kokkos::complex<T>(t_red_ident::prod(),t_red_ident::sum());}
+};
+
+} // namespace Kokkos
+
+#endif // KOKKOS_COMPLEX_HPP
diff --git a/packages/kokkos/core/src/Kokkos_Concepts.hpp b/packages/kokkos/core/src/Kokkos_Concepts.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2e2507b27d557ea758c5df7b16716aa518f2bf09
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Concepts.hpp
@@ -0,0 +1,376 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_CONCEPTS_HPP
+#define KOKKOS_CORE_CONCEPTS_HPP
+
+#include <type_traits>
+
+// Needed for 'is_space<S>::host_mirror_space'
+#include <Kokkos_Core_fwd.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//Schedules for Execution Policies
+struct Static {};
+struct Dynamic {};
+
+//Schedule Wrapper Type
+template<class T>
+struct Schedule
+{
+  static_assert(  std::is_same<T,Static>::value
+               || std::is_same<T,Dynamic>::value
+               , "Kokkos: Invalid Schedule<> type."
+               );
+  using schedule_type = Schedule ;
+  using type = T;
+};
+
+//Specify Iteration Index Type
+template<typename T>
+struct IndexType
+{
+  static_assert(std::is_integral<T>::value,"Kokkos: Invalid IndexType<>.");
+  using index_type = IndexType ;
+  using type = T;
+};
+
+/**\brief Specify Launch Bounds for CUDA execution.
+ *
+ *  If no launch bounds are specified, then none are set.
+ */
+template< unsigned int maxT = 0 /* Max threads per block */
+        , unsigned int minB = 0 /* Min blocks per SM */
+        >
+struct LaunchBounds
+{
+  using launch_bounds = LaunchBounds;
+  using type = LaunchBounds<maxT,minB>;
+  static unsigned int constexpr maxTperB {maxT};
+  static unsigned int constexpr minBperSM {minB};
+};
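+
+// Illustrative usage sketch (values are arbitrary): LaunchBounds is passed as
+// an execution-policy property to bound CUDA occupancy, e.g.
+//
+//   Kokkos::RangePolicy<Kokkos::LaunchBounds<256, 2>> policy(0, n);
+//   Kokkos::parallel_for(policy, functor);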
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+#define KOKKOS_IMPL_IS_CONCEPT( CONCEPT ) \
+  template< typename T > struct is_ ## CONCEPT { \
+  private: \
+    template< typename , typename = std::true_type > struct have : std::false_type {}; \
+    template< typename U > struct have<U,typename std::is_same<U,typename U:: CONCEPT >::type> : std::true_type {}; \
+  public: \
+    enum { value = is_ ## CONCEPT::template have<T>::value }; \
+  };
+
+// Public concept:
+
+KOKKOS_IMPL_IS_CONCEPT( memory_space )
+KOKKOS_IMPL_IS_CONCEPT( memory_traits )
+KOKKOS_IMPL_IS_CONCEPT( execution_space )
+KOKKOS_IMPL_IS_CONCEPT( execution_policy )
+KOKKOS_IMPL_IS_CONCEPT( array_layout )
+KOKKOS_IMPL_IS_CONCEPT( reducer )
+
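+// Illustrative compile-time checks of the generated traits (hypothetical
+// static_asserts, assuming the corresponding headers are included):
+//
+//   static_assert(  Kokkos::is_memory_space<Kokkos::HostSpace>::value, "" );
+//   static_assert( !Kokkos::is_memory_space<int>::value, "" );
+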
+namespace Impl {
+
+// For backward compatibility:
+
+using Kokkos::is_memory_space ;
+using Kokkos::is_memory_traits ;
+using Kokkos::is_execution_space ;
+using Kokkos::is_execution_policy ;
+using Kokkos::is_array_layout ;
+
+// Implementation concept:
+
+KOKKOS_IMPL_IS_CONCEPT( iteration_pattern )
+KOKKOS_IMPL_IS_CONCEPT( schedule_type )
+KOKKOS_IMPL_IS_CONCEPT( index_type )
+KOKKOS_IMPL_IS_CONCEPT( launch_bounds )
+
+}
+
+#undef KOKKOS_IMPL_IS_CONCEPT
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< class ExecutionSpace , class MemorySpace >
+struct Device {
+  static_assert( Kokkos::is_execution_space<ExecutionSpace>::value
+               , "Execution space is not valid" );
+  static_assert( Kokkos::is_memory_space<MemorySpace>::value
+               , "Memory space is not valid" );
+  typedef ExecutionSpace                        execution_space;
+  typedef MemorySpace                           memory_space;
+  typedef Device<execution_space,memory_space>  device_type;
+};
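+
+// Illustrative sketch (not part of this header): Device pairs an execution
+// space with a compatible memory space, for example as a View device type:
+//
+//   using device_t =
+//     Kokkos::Device< Kokkos::DefaultExecutionSpace
+//                   , Kokkos::DefaultExecutionSpace::memory_space >;
+//   Kokkos::View< double* , device_t >  x( "x" , n );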
+
+
+template< typename T >
+struct is_space {
+private:
+
+  template< typename , typename = void >
+  struct exe : std::false_type { typedef void space ; };
+
+  template< typename , typename = void >
+  struct mem : std::false_type { typedef void space ; };
+
+  template< typename , typename = void >
+  struct dev : std::false_type { typedef void space ; };
+
+  template< typename U >
+  struct exe<U,typename std::conditional<true,void,typename U::execution_space>::type>
+    : std::is_same<U,typename U::execution_space>::type
+    { typedef typename U::execution_space space ; };
+
+  template< typename U >
+  struct mem<U,typename std::conditional<true,void,typename U::memory_space>::type>
+    : std::is_same<U,typename U::memory_space>::type
+    { typedef typename U::memory_space space ; };
+
+  template< typename U >
+  struct dev<U,typename std::conditional<true,void,typename U::device_type>::type>
+    : std::is_same<U,typename U::device_type>::type
+    { typedef typename U::device_type space ; };
+
+  typedef typename is_space::template exe<T> is_exe ;
+  typedef typename is_space::template mem<T> is_mem ;
+  typedef typename is_space::template dev<T> is_dev ;
+
+public:
+
+  enum { value = is_exe::value || is_mem::value || is_dev::value };
+
+  typedef typename is_exe::space execution_space ;
+  typedef typename is_mem::space memory_space ;
+
+  // For backward compatibility, deprecated in favor of
+  // Kokkos::Impl::HostMirror<S>::host_mirror_space
+
+  typedef typename std::conditional
+    < std::is_same< memory_space , Kokkos::HostSpace >::value
+#if defined( KOKKOS_ENABLE_CUDA )
+      || std::is_same< memory_space , Kokkos::CudaUVMSpace >::value
+      || std::is_same< memory_space , Kokkos::CudaHostPinnedSpace >::value
+#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
+    , memory_space
+    , Kokkos::HostSpace
+    >::type  host_memory_space ;
+
+#if defined( KOKKOS_ENABLE_CUDA )
+  typedef typename std::conditional
+    < std::is_same< execution_space , Kokkos::Cuda >::value
+    , Kokkos::DefaultHostExecutionSpace , execution_space
+    >::type  host_execution_space ;
+#else
+  #if defined( KOKKOS_ENABLE_OPENMPTARGET )
+    typedef typename std::conditional
+      < std::is_same< execution_space , Kokkos::Experimental::OpenMPTarget >::value
+      , Kokkos::DefaultHostExecutionSpace , execution_space
+      >::type  host_execution_space ;
+  #else
+    typedef execution_space  host_execution_space ;
+  #endif
+#endif
+
+  typedef typename std::conditional
+    < std::is_same< execution_space , host_execution_space >::value &&
+      std::is_same< memory_space ,    host_memory_space    >::value
+    , T , Kokkos::Device< host_execution_space , host_memory_space >
+    >::type  host_mirror_space ;
+};
+
+// For backward compatibility
+
+namespace Impl {
+
+using Kokkos::is_space ;
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/**\brief  Access relationship between DstMemorySpace and SrcMemorySpace
+ *
+ *  The default case can assume accessibility for the same space.
+ *  Specializations must be defined for different memory spaces.
+ */
+template< typename DstMemorySpace , typename SrcMemorySpace >
+struct MemorySpaceAccess {
+
+  static_assert( Kokkos::is_memory_space< DstMemorySpace >::value &&
+                 Kokkos::is_memory_space< SrcMemorySpace >::value
+               , "template arguments must be memory spaces" );
+
+  /**\brief  Can a View (or pointer) to memory in SrcMemorySpace
+   *         be assigned to a View (or pointer) to memory marked DstMemorySpace?
+   *
+   *  1. DstMemorySpace::execution_space == SrcMemorySpace::execution_space
+   *  2. All execution spaces that can access DstMemorySpace can also access
+   *     SrcMemorySpace.
+   */
+  enum { assignable = std::is_same<DstMemorySpace,SrcMemorySpace>::value };
+
+  /**\brief  For every execution space DstExecSpace whose memory_space is
+   *         DstMemorySpace, DstExecSpace can access SrcMemorySpace.
+   */
+  enum { accessible = assignable };
+
+  /**\brief  Does a DeepCopy capability exist
+   *         to DstMemorySpace from SrcMemorySpace
+   */
+  enum { deepcopy = assignable };
+};
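+
+// Illustrative sketch (not part of this header): with the unspecialized
+// template above only the same-space relation holds, e.g.
+//
+//   static_assert( Kokkos::Impl::MemorySpaceAccess
+//                    < Kokkos::HostSpace , Kokkos::HostSpace >::accessible
+//                , "a memory space can always access itself" );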
+
+}} // namespace Kokkos::Impl
+
+namespace Kokkos {
+
+/**\brief  Can AccessSpace access MemorySpace ?
+ *
+ *   Requires:
+ *     Kokkos::is_space< AccessSpace >::value
+ *     Kokkos::is_memory_space< MemorySpace >::value
+ *
+ *   Can AccessSpace::execution_space access MemorySpace ?
+ *     enum : bool { accessible };
+ *
+ *   Is View<AccessSpace::memory_space> assignable from View<MemorySpace> ?
+ *     enum : bool { assignable };
+ *
+ *   If ! accessible then 'space' is the intercessory memory space that
+ *   should be used to deep copy memory so that
+ *     AccessSpace::execution_space
+ *   can get access.
+ *   When AccessSpace::memory_space == Kokkos::HostSpace
+ *   then space is the View host mirror space.
+ */
+template< typename AccessSpace , typename MemorySpace >
+struct SpaceAccessibility {
+private:
+
+  static_assert( Kokkos::is_space< AccessSpace >::value
+               , "template argument #1 must be a Kokkos space" );
+
+  static_assert( Kokkos::is_memory_space< MemorySpace >::value
+               , "template argument #2 must be a Kokkos memory space" );
+
+  // The input AccessSpace may be a Device<ExecSpace,MemSpace>
+  // verify that it is a valid combination of spaces.
+  static_assert( Kokkos::Impl::MemorySpaceAccess
+                   < typename AccessSpace::execution_space::memory_space
+                   , typename AccessSpace::memory_space
+                   >::accessible
+               , "template argument #1 is an invalid space" );
+
+  typedef Kokkos::Impl::MemorySpaceAccess
+    < typename AccessSpace::execution_space::memory_space , MemorySpace >
+      exe_access ;
+
+  typedef Kokkos::Impl::MemorySpaceAccess
+    < typename AccessSpace::memory_space , MemorySpace >
+      mem_access ;
+
+public:
+
+  /**\brief  Can AccessSpace::execution_space access MemorySpace ?
+   *
+   *  Default based upon memory space accessibility.
+   *  Specialization required for other relationships.
+   */
+  enum { accessible = exe_access::accessible };
+
+  /**\brief  Can assign to AccessSpace from MemorySpace ?
+   *
+   *  Default based upon memory space accessibility.
+   *  Specialization required for other relationships.
+   */
+  enum { assignable =
+    is_memory_space< AccessSpace >::value && mem_access::assignable };
+
+  /**\brief  Can deep copy to AccessSpace::memory_space from MemorySpace ?  */
+  enum { deepcopy = mem_access::deepcopy };
+
+  // What intercessory space for AccessSpace::execution_space
+  // to be able to access MemorySpace?
+  // If same memory space or not accessible use the AccessSpace
+  // else construct a device with execution space and memory space.
+  typedef typename std::conditional
+    < std::is_same<typename AccessSpace::memory_space,MemorySpace>::value ||
+      ! exe_access::accessible
+    , AccessSpace
+    , Kokkos::Device< typename AccessSpace::execution_space , MemorySpace >
+    >::type  space ;
+};
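+
+// Illustrative sketch (not part of this header): a typical query guarding a
+// kernel that dereferences memory from another space, e.g.
+//
+//   static_assert( Kokkos::SpaceAccessibility
+//                    < Kokkos::DefaultHostExecutionSpace , Kokkos::HostSpace >::accessible
+//                , "host execution spaces can access HostSpace" );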
+
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Impl {
+
+using Kokkos::SpaceAccessibility ; // For backward compatibility
+
+}} // namespace Kokkos::Impl
+
+//----------------------------------------------------------------------------
+
+#endif // KOKKOS_CORE_CONCEPTS_HPP
+
diff --git a/packages/kokkos/core/src/Kokkos_CopyViews.hpp b/packages/kokkos/core/src/Kokkos_CopyViews.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..23789eb85c5d34347761869b90f6df72d9269f9d
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_CopyViews.hpp
@@ -0,0 +1,1720 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_COPYVIEWS_HPP_
+#define KOKKOS_COPYVIEWS_HPP_
+#include <string>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+template<class Layout>
+struct ViewFillLayoutSelector {
+};
+
+template<>
+struct ViewFillLayoutSelector<Kokkos::LayoutLeft> {
+  static const Kokkos::Iterate iterate = Kokkos::Iterate::Left;
+};
+
+template<>
+struct ViewFillLayoutSelector<Kokkos::LayoutRight> {
+  static const Kokkos::Iterate iterate = Kokkos::Iterate::Right;
+};
+
+template<class ViewType,class Layout, class ExecSpace,typename iType>
+struct ViewFill<ViewType,Layout,ExecSpace,0,iType> {
+
+  typedef typename ViewType::non_const_value_type ST;
+
+  ViewFill(const ViewType& a, const ST& val) {
+    Kokkos::Impl::DeepCopy< typename ViewType::memory_space, Kokkos::HostSpace >( a.data() , &val, sizeof(ST) );
+  }
+};
+
+
+template<class ViewType,class Layout, class ExecSpace,typename iType>
+struct ViewFill<ViewType,Layout,ExecSpace,1,iType> {
+  ViewType a;
+  typename ViewType::const_value_type val;
+  typedef Kokkos::RangePolicy<ExecSpace,Kokkos::IndexType<iType>> policy_type;
+
+  ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_):a(a_),val(val_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewFill-1D",policy_type(0,a.extent(0)),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i) const {
+    a(i) = val;
+  };
+};
+
+template<class ViewType,class Layout, class ExecSpace,typename iType>
+struct ViewFill<ViewType,Layout,ExecSpace,2,iType> {
+  ViewType a;
+  typename ViewType::const_value_type val;
+
+  typedef Kokkos::Rank<2,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
+  typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
+
+  ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_):a(a_),val(val_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewFill-2D",
+       policy_type({0,0},{a.extent(0),a.extent(1)}),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0, const iType& i1) const {
+    a(i0,i1) = val;
+  };
+};
+
+template<class ViewType,class Layout, class ExecSpace,typename iType>
+struct ViewFill<ViewType,Layout,ExecSpace,3,iType> {
+  ViewType a;
+  typename ViewType::const_value_type val;
+
+  typedef Kokkos::Rank<3,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
+  typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
+
+  ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_):a(a_),val(val_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewFill-3D",
+       policy_type({0,0,0},{a.extent(0),a.extent(1),a.extent(2)}),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0, const iType& i1, const iType& i2) const {
+    a(i0,i1,i2) = val;
+  };
+};
+
+template<class ViewType,class Layout, class ExecSpace,typename iType>
+struct ViewFill<ViewType,Layout,ExecSpace,4,iType> {
+  ViewType a;
+  typename ViewType::const_value_type val;
+
+  typedef Kokkos::Rank<4,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
+  typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
+
+  ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_):a(a_),val(val_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewFill-4D",
+       policy_type({0,0,0,0},{a.extent(0),a.extent(1),a.extent(2),a.extent(3)}),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0, const iType& i1, const iType& i2, const iType& i3) const {
+    a(i0,i1,i2,i3) = val;
+  };
+};
+
+template<class ViewType,class Layout, class ExecSpace,typename iType>
+struct ViewFill<ViewType,Layout,ExecSpace,5,iType> {
+  ViewType a;
+  typename ViewType::const_value_type val;
+
+  typedef Kokkos::Rank<5,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
+  typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
+
+  ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_):a(a_),val(val_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewFill-5D",
+       policy_type({0,0,0,0,0},{a.extent(0),a.extent(1),a.extent(2),a.extent(3),a.extent(4)}),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0, const iType& i1, const iType& i2, const iType& i3, const iType& i4) const {
+    a(i0,i1,i2,i3,i4) = val;
+  };
+};
+
+template<class ViewType,class Layout, class ExecSpace,typename iType>
+struct ViewFill<ViewType,Layout,ExecSpace,6,iType> {
+  ViewType a;
+  typename ViewType::const_value_type val;
+
+  typedef Kokkos::Rank<6,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
+  typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
+
+  ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_):a(a_),val(val_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewFill-6D",
+       policy_type({0,0,0,0,0,0},{a.extent(0),a.extent(1),a.extent(2),a.extent(3),a.extent(4),a.extent(5)}),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0, const iType& i1, const iType& i2, const iType& i3, const iType& i4, const iType& i5) const {
+    a(i0,i1,i2,i3,i4,i5) = val;
+  };
+};
+
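+// The rank-7 and rank-8 fills below use a rank-6 MDRangePolicy (the
+// multidimensional policy is limited to six dimensions) and cover the
+// remaining dimension(s) with a serial loop inside the functor body.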
+template<class ViewType,class Layout, class ExecSpace,typename iType>
+struct ViewFill<ViewType,Layout,ExecSpace,7,iType> {
+  ViewType a;
+  typename ViewType::const_value_type val;
+
+  typedef Kokkos::Rank<6,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
+  typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
+
+  ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_):a(a_),val(val_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewFill-7D",
+       policy_type({0,0,0,0,0,0},{a.extent(0),a.extent(1),a.extent(3),a.extent(4),
+                                  a.extent(5),a.extent(6)}),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0, const iType& i1, const iType& i3,
+                   const iType& i4, const iType& i5, const iType& i6) const {
+    for(iType i2=0; i2<iType(a.extent(2));i2++)
+      a(i0,i1,i2,i3,i4,i5,i6) = val;
+  };
+};
+
+template<class ViewType,class Layout, class ExecSpace,typename iType>
+struct ViewFill<ViewType,Layout,ExecSpace,8,iType> {
+  ViewType a;
+  typename ViewType::const_value_type val;
+
+  typedef Kokkos::Rank<6,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
+  typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
+
+  ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_):a(a_),val(val_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewFill-8D",
+       policy_type({0,0,0,0,0,0},{a.extent(0),a.extent(1),a.extent(3),
+                                  a.extent(5),a.extent(6),a.extent(7)}),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0, const iType& i1, const iType& i3,
+                   const iType& i5, const iType& i6, const iType& i7) const {
+    for(iType i2=0; i2<iType(a.extent(2));i2++)
+    for(iType i4=0; i4<iType(a.extent(4));i4++)
+      a(i0,i1,i2,i3,i4,i5,i6,i7) = val;
+  };
+};
+
+template<class ViewTypeA,class ViewTypeB, class Layout, class ExecSpace,typename iType>
+struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,1,iType> {
+  ViewTypeA a;
+  ViewTypeB b;
+
+  typedef Kokkos::RangePolicy<ExecSpace,Kokkos::IndexType<iType>> policy_type;
+
+  ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewCopy-2D",
+       policy_type(0,a.extent(0)),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0) const {
+      a(i0) = b(i0);
+  };
+};
+
+template<class ViewTypeA,class ViewTypeB, class Layout, class ExecSpace,typename iType>
+struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,2,iType> {
+  ViewTypeA a;
+  ViewTypeB b;
+
+  typedef Kokkos::Rank<2,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
+  typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
+
+  ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewCopy-2D",
+       policy_type({0,0},{a.extent(0),a.extent(1)}),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0, const iType& i1) const {
+      a(i0,i1) = b(i0,i1);
+  };
+};
+
+template<class ViewTypeA,class ViewTypeB, class Layout, class ExecSpace,typename iType>
+struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,3,iType> {
+  ViewTypeA a;
+  ViewTypeB b;
+
+  typedef Kokkos::Rank<3,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
+  typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
+
+  ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewCopy-3D",
+       policy_type({0,0,0},{a.extent(0),a.extent(1),a.extent(2)}),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0, const iType& i1, const iType& i2) const {
+      a(i0,i1,i2) = b(i0,i1,i2);
+  };
+};
+
+template<class ViewTypeA,class ViewTypeB, class Layout, class ExecSpace,typename iType>
+struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,4,iType> {
+  ViewTypeA a;
+  ViewTypeB b;
+
+  typedef Kokkos::Rank<4,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
+  typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
+
+  ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewCopy-4D",
+       policy_type({0,0,0,0},{a.extent(0),a.extent(1),a.extent(2),
+                              a.extent(3)}),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0, const iType& i1, const iType& i2,
+                   const iType& i3) const {
+      a(i0,i1,i2,i3) = b(i0,i1,i2,i3);
+  };
+};
+
+template<class ViewTypeA,class ViewTypeB, class Layout, class ExecSpace,typename iType>
+struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,5,iType> {
+  ViewTypeA a;
+  ViewTypeB b;
+
+  typedef Kokkos::Rank<5,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
+  typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
+
+  ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewCopy-5D",
+       policy_type({0,0,0,0,0},{a.extent(0),a.extent(1),a.extent(2),
+                                a.extent(3),a.extent(4)}),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0, const iType& i1, const iType& i2,
+                   const iType& i3, const iType& i4) const {
+      a(i0,i1,i2,i3,i4) = b(i0,i1,i2,i3,i4);
+  };
+};
+
+template<class ViewTypeA,class ViewTypeB, class Layout, class ExecSpace,typename iType>
+struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,6,iType> {
+  ViewTypeA a;
+  ViewTypeB b;
+
+  typedef Kokkos::Rank<6,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
+  typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
+
+  ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewCopy-6D",
+       policy_type({0,0,0,0,0,0},{a.extent(0),a.extent(1),a.extent(2),
+                                  a.extent(3),a.extent(4),a.extent(5)}),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0, const iType& i1, const iType& i2,
+                   const iType& i3, const iType& i4, const iType& i5) const {
+      a(i0,i1,i2,i3,i4,i5) = b(i0,i1,i2,i3,i4,i5);
+  };
+};
+
+
+template<class ViewTypeA, class ViewTypeB, class Layout, class ExecSpace,typename iType>
+struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,7,iType> {
+  ViewTypeA a;
+  ViewTypeB b;
+
+  typedef Kokkos::Rank<6,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
+  typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
+
+  ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewCopy-7D",
+       policy_type({0,0,0,0,0,0},{a.extent(0),a.extent(1),a.extent(3),
+                                  a.extent(4),a.extent(5),a.extent(6)}),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0, const iType& i1, const iType& i3,
+                   const iType& i4, const iType& i5, const iType& i6) const {
+    for(iType i2=0; i2<iType(a.extent(2));i2++)
+      a(i0,i1,i2,i3,i4,i5,i6) = b(i0,i1,i2,i3,i4,i5,i6);
+  };
+};
+
+template<class ViewTypeA,class ViewTypeB, class Layout, class ExecSpace,typename iType>
+struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,8,iType> {
+  ViewTypeA a;
+  ViewTypeB b;
+
+  typedef Kokkos::Rank<6,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
+  typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
+
+  ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
+    ExecSpace::fence();
+    Kokkos::parallel_for("Kokkos::ViewCopy-8D",
+       policy_type({0,0,0,0,0,0},{a.extent(0),a.extent(1),a.extent(3),
+                                  a.extent(5),a.extent(6),a.extent(7)}),*this);
+    ExecSpace::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const iType& i0, const iType& i1, const iType& i3,
+                   const iType& i5, const iType& i6, const iType& i7) const {
+    for(iType i2=0; i2<iType(a.extent(2));i2++)
+    for(iType i4=0; i4<iType(a.extent(4));i4++)
+      a(i0,i1,i2,i3,i4,i5,i6,i7) = b(i0,i1,i2,i3,i4,i5,i6,i7);
+  };
+};
+
+
+template<class DstType, class SrcType>
+void view_copy(const DstType& dst, const SrcType& src) {
+  typedef typename DstType::execution_space dst_execution_space;
+  typedef typename SrcType::execution_space src_execution_space;
+  typedef typename DstType::memory_space dst_memory_space;
+  typedef typename SrcType::memory_space src_memory_space;
+
+  enum { DstExecCanAccessSrc =
+   Kokkos::Impl::SpaceAccessibility< dst_execution_space , src_memory_space >::accessible };
+
+  enum { SrcExecCanAccessDst =
+   Kokkos::Impl::SpaceAccessibility< src_execution_space , dst_memory_space >::accessible };
+
+  if( ! DstExecCanAccessSrc && ! SrcExecCanAccessDst) {
+    std::string message("Error: Kokkos::deep_copy with no available copy mechanism: ");
+    message += src.label(); message += " to ";
+    message += dst.label();
+    Kokkos::Impl::throw_runtime_exception(message);
+  }
+
+  // Figure out iteration order in case we need it
+  int64_t strides[DstType::Rank+1];
+  dst.stride(strides);
+  Kokkos::Iterate iterate;
+  if        ( std::is_same<typename DstType::array_layout,Kokkos::LayoutRight>::value ) {
+    iterate = Kokkos::Iterate::Right;
+  } else if ( std::is_same<typename DstType::array_layout,Kokkos::LayoutLeft>::value ) {
+    iterate = Kokkos::Iterate::Left;
+  } else if ( std::is_same<typename DstType::array_layout,Kokkos::LayoutStride>::value ) {
+    if( strides[0] > strides[DstType::Rank-1] )
+      iterate = Kokkos::Iterate::Right;
+    else
+      iterate = Kokkos::Iterate::Left;
+  } else {
+    if( std::is_same<typename DstType::execution_space::array_layout, Kokkos::LayoutRight>::value )
+      iterate = Kokkos::Iterate::Right;
+    else
+      iterate = Kokkos::Iterate::Left;
+  }
+
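+  // Dispatch on index type and execution space: prefer the destination's
+  // execution space when it can access the source memory, and use 64-bit
+  // indices only when a span exceeds what a 32-bit int can address, since
+  // 32-bit indexing is generally cheaper on the device.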
+  if( (dst.span() >= size_t(std::numeric_limits<int>::max())) ||
+      (src.span() >= size_t(std::numeric_limits<int>::max())) ){
+    if(DstExecCanAccessSrc) {
+      if(iterate == Kokkos::Iterate::Right)
+        Kokkos::Impl::ViewCopy< DstType, SrcType, Kokkos::LayoutRight, dst_execution_space,
+                                DstType::Rank, int64_t >( dst , src );
+      else
+        Kokkos::Impl::ViewCopy< DstType, SrcType, Kokkos::LayoutLeft, dst_execution_space,
+                                DstType::Rank, int64_t >( dst , src );
+    } else {
+      if(iterate == Kokkos::Iterate::Right)
+        Kokkos::Impl::ViewCopy< DstType, SrcType, Kokkos::LayoutRight, src_execution_space,
+                                DstType::Rank, int64_t >( dst , src );
+      else
+        Kokkos::Impl::ViewCopy< DstType, SrcType, Kokkos::LayoutLeft, src_execution_space,
+                                DstType::Rank, int64_t >( dst , src );
+    }
+  } else {
+    if(DstExecCanAccessSrc) {
+      if(iterate == Kokkos::Iterate::Right)
+        Kokkos::Impl::ViewCopy< DstType, SrcType, Kokkos::LayoutRight, dst_execution_space,
+                                DstType::Rank, int >( dst , src );
+      else
+        Kokkos::Impl::ViewCopy< DstType, SrcType, Kokkos::LayoutLeft, dst_execution_space,
+                                DstType::Rank, int >( dst , src );
+    } else {
+      if(iterate == Kokkos::Iterate::Right)
+        Kokkos::Impl::ViewCopy< DstType, SrcType, Kokkos::LayoutRight, src_execution_space,
+                                DstType::Rank, int >( dst , src );
+      else
+        Kokkos::Impl::ViewCopy< DstType, SrcType, Kokkos::LayoutLeft, src_execution_space,
+                                DstType::Rank, int >( dst , src );
+    }
+
+  }
+}
+
+template<class DstType, class SrcType, int Rank, class ... Args>
+struct CommonSubview;
+
+template<class DstType, class SrcType, class Arg0, class ... Args>
+struct CommonSubview<DstType,SrcType,1,Arg0,Args...> {
+  typedef typename Kokkos::Subview<DstType,Arg0> dst_subview_type;
+  typedef typename Kokkos::Subview<SrcType,Arg0> src_subview_type;
+  dst_subview_type dst_sub;
+  src_subview_type src_sub;
+  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, Args... ):
+    dst_sub(dst,arg0),src_sub(src,arg0) {}
+};
+
+template<class DstType, class SrcType, class Arg0, class Arg1, class ... Args>
+struct CommonSubview<DstType,SrcType,2,Arg0,Arg1,Args...> {
+  typedef typename Kokkos::Subview<DstType,Arg0,Arg1> dst_subview_type;
+  typedef typename Kokkos::Subview<SrcType,Arg0,Arg1> src_subview_type;
+  dst_subview_type dst_sub;
+  src_subview_type src_sub;
+  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, const Arg1& arg1, Args... ):
+    dst_sub(dst,arg0,arg1),src_sub(src,arg0,arg1) {}
+};
+
+template<class DstType, class SrcType, class Arg0, class Arg1, class Arg2, class ... Args>
+struct CommonSubview<DstType,SrcType,3,Arg0,Arg1,Arg2,Args...> {
+  typedef typename Kokkos::Subview<DstType,Arg0,Arg1,Arg2> dst_subview_type;
+  typedef typename Kokkos::Subview<SrcType,Arg0,Arg1,Arg2> src_subview_type;
+  dst_subview_type dst_sub;
+  src_subview_type src_sub;
+  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, const Arg1& arg1,
+                const Arg2& arg2, Args... ):
+    dst_sub(dst,arg0,arg1,arg2),src_sub(src,arg0,arg1,arg2) {}
+};
+
+template<class DstType, class SrcType, class Arg0, class Arg1, class Arg2, class Arg3,
+         class ... Args>
+struct CommonSubview<DstType,SrcType,4,Arg0,Arg1,Arg2,Arg3,Args...> {
+  typedef typename Kokkos::Subview<DstType,Arg0,Arg1,Arg2,Arg3> dst_subview_type;
+  typedef typename Kokkos::Subview<SrcType,Arg0,Arg1,Arg2,Arg3> src_subview_type;
+  dst_subview_type dst_sub;
+  src_subview_type src_sub;
+  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, const Arg1& arg1,
+                const Arg2& arg2, const Arg3& arg3,
+                const Args ...):
+    dst_sub(dst,arg0,arg1,arg2,arg3),src_sub(src,arg0,arg1,arg2,arg3) {}
+};
+
+template<class DstType, class SrcType, class Arg0, class Arg1, class Arg2, class Arg3,
+         class Arg4, class ... Args>
+struct CommonSubview<DstType,SrcType,5,Arg0,Arg1,Arg2,Arg3,Arg4,Args...> {
+  typedef typename Kokkos::Subview<DstType,Arg0,Arg1,Arg2,Arg3,Arg4> dst_subview_type;
+  typedef typename Kokkos::Subview<SrcType,Arg0,Arg1,Arg2,Arg3,Arg4> src_subview_type;
+  dst_subview_type dst_sub;
+  src_subview_type src_sub;
+  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, const Arg1& arg1,
+                const Arg2& arg2, const Arg3& arg3, const Arg4& arg4,
+                const Args ...):
+    dst_sub(dst,arg0,arg1,arg2,arg3,arg4),src_sub(src,arg0,arg1,arg2,arg3,arg4) {}
+};
+
+template<class DstType, class SrcType, class Arg0, class Arg1, class Arg2, class Arg3,
+         class Arg4, class Arg5, class ... Args>
+struct CommonSubview<DstType,SrcType,6,Arg0,Arg1,Arg2,Arg3,Arg4,Arg5,Args...> {
+  typedef typename Kokkos::Subview<DstType,Arg0,Arg1,Arg2,Arg3,Arg4,Arg5> dst_subview_type;
+  typedef typename Kokkos::Subview<SrcType,Arg0,Arg1,Arg2,Arg3,Arg4,Arg5> src_subview_type;
+  dst_subview_type dst_sub;
+  src_subview_type src_sub;
+  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, const Arg1& arg1,
+                const Arg2& arg2, const Arg3& arg3, const Arg4& arg4, const Arg5& arg5,
+                const Args ...):
+    dst_sub(dst,arg0,arg1,arg2,arg3,arg4,arg5),src_sub(src,arg0,arg1,arg2,arg3,arg4,arg5) {}
+};
+
+template<class DstType, class SrcType, class Arg0, class Arg1, class Arg2, class Arg3,
+         class Arg4, class Arg5, class Arg6, class ...Args>
+struct CommonSubview<DstType,SrcType,7,Arg0,Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Args...> {
+  typedef typename Kokkos::Subview<DstType,Arg0,Arg1,Arg2,Arg3,Arg4,Arg5,Arg6> dst_subview_type;
+  typedef typename Kokkos::Subview<SrcType,Arg0,Arg1,Arg2,Arg3,Arg4,Arg5,Arg6> src_subview_type;
+  dst_subview_type dst_sub;
+  src_subview_type src_sub;
+  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, const Arg1& arg1,
+                const Arg2& arg2, const Arg3& arg3, const Arg4& arg4, const Arg5& arg5,
+                const Arg6& arg6, Args...):
+    dst_sub(dst,arg0,arg1,arg2,arg3,arg4,arg5,arg6),src_sub(src,arg0,arg1,arg2,arg3,arg4,arg5,arg6) {}
+};
+
+template<class DstType, class SrcType, class Arg0, class Arg1, class Arg2, class Arg3,
+         class Arg4, class Arg5, class Arg6, class Arg7>
+struct CommonSubview<DstType,SrcType,8,Arg0,Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Arg7> {
+  typedef typename Kokkos::Subview<DstType,Arg0,Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Arg7> dst_subview_type;
+  typedef typename Kokkos::Subview<SrcType,Arg0,Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Arg7> src_subview_type;
+  dst_subview_type dst_sub;
+  src_subview_type src_sub;
+  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0, const Arg1& arg1,
+                const Arg2& arg2, const Arg3& arg3, const Arg4& arg4, const Arg5& arg5,
+                const Arg6& arg6, const Arg7& arg7):
+    dst_sub(dst,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7),src_sub(src,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7) {}
+};
+
+
+template<class DstType, class SrcType, class ExecSpace = typename DstType::execution_space, int Rank = DstType::Rank>
+struct ViewRemap;
+
+template<class DstType, class SrcType, class ExecSpace>
+struct ViewRemap<DstType,SrcType,ExecSpace,1> {
+  typedef Kokkos::pair<int64_t,int64_t> p_type;
+
+  ViewRemap(const DstType& dst, const SrcType& src) {
+    if(dst.extent(0) == src.extent(0)) {
+      view_copy(dst,src);
+    } else {
+      p_type ext0(0,std::min(dst.extent(0),src.extent(0)));
+      typedef CommonSubview<DstType,SrcType,1,p_type> sv_adapter_type;
+      sv_adapter_type common_subview(dst,src,ext0);
+      view_copy(common_subview.dst_sub,common_subview.src_sub);
+    }
+  }
+};
+
+template<class DstType, class SrcType, class ExecSpace>
+struct ViewRemap<DstType,SrcType,ExecSpace,2> {
+  typedef Kokkos::pair<int64_t,int64_t> p_type;
+
+  ViewRemap(const DstType& dst, const SrcType& src) {
+    if(dst.extent(0) == src.extent(0)) {
+      if(dst.extent(1) == src.extent(1)) {
+        view_copy(dst,src);
+      } else {
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        typedef CommonSubview<DstType,SrcType,2,Kokkos::Impl::ALL_t,p_type> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,Kokkos::ALL,ext1);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      }
+    } else {
+      if(dst.extent(1) == src.extent(1)) {
+        p_type ext0(0,std::min(dst.extent(0),src.extent(0)));
+        typedef CommonSubview<DstType,SrcType,2,p_type,Kokkos::Impl::ALL_t> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,ext0,Kokkos::ALL);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      } else {
+        p_type ext0(0,std::min(dst.extent(0),src.extent(0)));
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        typedef CommonSubview<DstType,SrcType,2,p_type,p_type> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,ext0,ext1);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      }
+    }
+  }
+};
+
+template<class DstType, class SrcType, class ExecSpace>
+struct ViewRemap<DstType,SrcType,ExecSpace,3> {
+  typedef Kokkos::pair<int64_t,int64_t> p_type;
+
+  ViewRemap(const DstType& dst, const SrcType& src) {
+    if(dst.extent(0) == src.extent(0)) {
+      if(dst.extent(2) == src.extent(2)) {
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        typedef CommonSubview<DstType,SrcType,3,Kokkos::Impl::ALL_t,p_type,Kokkos::Impl::ALL_t> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,Kokkos::ALL,ext1,Kokkos::ALL);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      } else {
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        typedef CommonSubview<DstType,SrcType,3,Kokkos::Impl::ALL_t,p_type,p_type> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,Kokkos::ALL,ext1,ext2);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      }
+    } else {
+      if(dst.extent(2) == src.extent(2)) {
+        p_type ext0(0,std::min(dst.extent(0),src.extent(0)));
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        typedef CommonSubview<DstType,SrcType,3,p_type,p_type,Kokkos::Impl::ALL_t> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,ext0,ext1,Kokkos::ALL);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      } else {
+        p_type ext0(0,std::min(dst.extent(0),src.extent(0)));
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        typedef CommonSubview<DstType,SrcType,3,p_type,p_type,p_type> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,ext0,ext1,ext2);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      }
+    }
+  }
+};
+
+template<class DstType, class SrcType, class ExecSpace>
+struct ViewRemap<DstType,SrcType,ExecSpace,4> {
+  typedef Kokkos::pair<int64_t,int64_t> p_type;
+
+  ViewRemap(const DstType& dst, const SrcType& src) {
+    if(dst.extent(0) == src.extent(0)) {
+      if(dst.extent(3) == src.extent(3)) {
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        typedef CommonSubview<DstType,SrcType,4,Kokkos::Impl::ALL_t,
+                              p_type,p_type,
+                              Kokkos::Impl::ALL_t> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,Kokkos::ALL,
+                                       ext1,ext2,
+                                       Kokkos::ALL);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      } else {
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        typedef CommonSubview<DstType,SrcType,4,Kokkos::Impl::ALL_t,
+                              p_type,p_type,
+                              p_type> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,Kokkos::ALL,
+                                       ext1,ext2,
+                                       ext3);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      }
+    } else {
+      if(dst.extent(3) == src.extent(3)) {
+        p_type ext0(0,std::min(dst.extent(0),src.extent(0)));
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        typedef CommonSubview<DstType,SrcType,4,p_type,
+                              p_type,p_type,
+                              Kokkos::Impl::ALL_t> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,ext0,
+                                       ext1,ext2,
+                                       Kokkos::ALL);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      } else {
+        p_type ext0(0,std::min(dst.extent(0),src.extent(0)));
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        typedef CommonSubview<DstType,SrcType,4,p_type,
+                              p_type,p_type,
+                              p_type> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,ext0,
+                                       ext1,ext2,
+                                       ext3);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      }
+    }
+  }
+};
+
+template<class DstType, class SrcType, class ExecSpace>
+struct ViewRemap<DstType,SrcType,ExecSpace,5> {
+  typedef Kokkos::pair<int64_t,int64_t> p_type;
+
+  ViewRemap(const DstType& dst, const SrcType& src) {
+    if(dst.extent(0) == src.extent(0)) {
+      if(dst.extent(4) == src.extent(4)) {
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        typedef CommonSubview<DstType,SrcType,5,Kokkos::Impl::ALL_t,
+                              p_type,p_type,p_type,
+                              Kokkos::Impl::ALL_t> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,Kokkos::ALL,
+                                       ext1,ext2,ext3,
+                                       Kokkos::ALL);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      } else {
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        p_type ext4(0,std::min(dst.extent(4),src.extent(4)));
+        typedef CommonSubview<DstType,SrcType,5,Kokkos::Impl::ALL_t,
+                              p_type,p_type,p_type,
+                              p_type> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,Kokkos::ALL,
+                                       ext1,ext2,ext3,
+                                       ext4);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      }
+    } else {
+      if(dst.extent(4) == src.extent(4)) {
+        p_type ext0(0,std::min(dst.extent(0),src.extent(0)));
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        typedef CommonSubview<DstType,SrcType,5,p_type,
+                              p_type,p_type,p_type,
+                              Kokkos::Impl::ALL_t> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,ext0,
+                                       ext1,ext2,ext3,
+                                       Kokkos::ALL);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      } else {
+        p_type ext0(0,std::min(dst.extent(0),src.extent(0)));
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        p_type ext4(0,std::min(dst.extent(4),src.extent(4)));
+        typedef CommonSubview<DstType,SrcType,5,p_type,
+                              p_type,p_type,p_type,
+                              p_type> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,ext0,
+                                       ext1,ext2,ext3,
+                                       ext4);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      }
+    }
+  }
+};
+template<class DstType, class SrcType, class ExecSpace>
+struct ViewRemap<DstType,SrcType,ExecSpace,6> {
+  typedef Kokkos::pair<int64_t,int64_t> p_type;
+
+  ViewRemap(const DstType& dst, const SrcType& src) {
+    if(dst.extent(0) == src.extent(0)) {
+      if(dst.extent(5) == src.extent(5)) {
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        p_type ext4(0,std::min(dst.extent(4),src.extent(4)));
+        typedef CommonSubview<DstType,SrcType,6,Kokkos::Impl::ALL_t,
+                              p_type,p_type,p_type,p_type,
+                              Kokkos::Impl::ALL_t> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,Kokkos::ALL,
+                                       ext1,ext2,ext3,ext4,
+                                       Kokkos::ALL);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      } else {
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        p_type ext4(0,std::min(dst.extent(4),src.extent(4)));
+        p_type ext5(0,std::min(dst.extent(5),src.extent(5)));
+        typedef CommonSubview<DstType,SrcType,6,Kokkos::Impl::ALL_t,
+                              p_type,p_type,p_type,p_type,
+                              p_type> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,Kokkos::ALL,
+                                       ext1,ext2,ext3,ext4,
+                                       ext5);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      }
+    } else {
+      if(dst.extent(5) == src.extent(5)) {
+        p_type ext0(0,std::min(dst.extent(0),src.extent(0)));
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        p_type ext4(0,std::min(dst.extent(4),src.extent(4)));
+
+        typedef CommonSubview<DstType,SrcType,6,p_type,
+                              p_type,p_type,p_type,p_type,
+                              Kokkos::Impl::ALL_t> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,ext0,
+                                       ext1,ext2,ext3,ext4,
+                                       Kokkos::ALL);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      } else {
+        p_type ext0(0,std::min(dst.extent(0),src.extent(0)));
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        p_type ext4(0,std::min(dst.extent(4),src.extent(4)));
+        p_type ext5(0,std::min(dst.extent(5),src.extent(5)));
+
+        typedef CommonSubview<DstType,SrcType,6,p_type,
+                              p_type,p_type,p_type,p_type,
+                              p_type> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,ext0,
+                                       ext1,ext2,ext3,ext4,
+                                       ext5);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      }
+    }
+  }
+};
+
+template<class DstType, class SrcType, class ExecSpace>
+struct ViewRemap<DstType,SrcType,ExecSpace,7> {
+  typedef Kokkos::pair<int64_t,int64_t> p_type;
+
+  ViewRemap(const DstType& dst, const SrcType& src) {
+    if(dst.extent(0) == src.extent(0)) {
+      if(dst.extent(6) == src.extent(6)) {
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        p_type ext4(0,std::min(dst.extent(4),src.extent(4)));
+        p_type ext5(0,std::min(dst.extent(5),src.extent(5)));
+        typedef CommonSubview<DstType,SrcType,7,Kokkos::Impl::ALL_t,
+                              p_type,p_type,p_type,p_type,p_type,
+                              Kokkos::Impl::ALL_t> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,Kokkos::ALL,
+                                       ext1,ext2,ext3,ext4,ext5,
+                                       Kokkos::ALL);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      } else {
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        p_type ext4(0,std::min(dst.extent(4),src.extent(4)));
+        p_type ext5(0,std::min(dst.extent(5),src.extent(5)));
+        p_type ext6(0,std::min(dst.extent(6),src.extent(6)));
+        typedef CommonSubview<DstType,SrcType,7,Kokkos::Impl::ALL_t,
+                              p_type,p_type,p_type,p_type,p_type,
+                              p_type> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,Kokkos::ALL,
+                                       ext1,ext2,ext3,ext4,ext5,
+                                       ext6);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      }
+    } else {
+      if(dst.extent(6) == src.extent(6)) {
+        p_type ext0(0,std::min(dst.extent(0),src.extent(0)));
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        p_type ext4(0,std::min(dst.extent(4),src.extent(4)));
+        p_type ext5(0,std::min(dst.extent(5),src.extent(5)));
+        typedef CommonSubview<DstType,SrcType,7,p_type,
+                              p_type,p_type,p_type,p_type,p_type,
+                              Kokkos::Impl::ALL_t> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,ext0,
+                                       ext1,ext2,ext3,ext4,ext5,
+                                       Kokkos::ALL);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      } else {
+        p_type ext0(0,std::min(dst.extent(0),src.extent(0)));
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        p_type ext4(0,std::min(dst.extent(4),src.extent(4)));
+        p_type ext5(0,std::min(dst.extent(5),src.extent(5)));
+        p_type ext6(0,std::min(dst.extent(6),src.extent(6)));
+        typedef CommonSubview<DstType,SrcType,7,p_type,
+                              p_type,p_type,p_type,p_type,p_type,
+                              p_type> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,ext0,
+                                       ext1,ext2,ext3,ext4,ext5,
+                                       ext6);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      }
+    }
+  }
+};
+
+template<class DstType, class SrcType, class ExecSpace>
+struct ViewRemap<DstType,SrcType,ExecSpace,8> {
+  typedef Kokkos::pair<int64_t,int64_t> p_type;
+
+  ViewRemap(const DstType& dst, const SrcType& src) {
+    if(dst.extent(0) == src.extent(0)) {
+      if(dst.extent(7) == src.extent(7)) {
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        p_type ext4(0,std::min(dst.extent(4),src.extent(4)));
+        p_type ext5(0,std::min(dst.extent(5),src.extent(5)));
+        p_type ext6(0,std::min(dst.extent(6),src.extent(6)));
+        typedef CommonSubview<DstType,SrcType,8,Kokkos::Impl::ALL_t,
+                              p_type,p_type,p_type,p_type,p_type,p_type,
+                              Kokkos::Impl::ALL_t> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,Kokkos::ALL,
+                                       ext1,ext2,ext3,ext4,ext5,ext6,
+                                       Kokkos::ALL);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      } else {
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        p_type ext4(0,std::min(dst.extent(4),src.extent(4)));
+        p_type ext5(0,std::min(dst.extent(5),src.extent(5)));
+        p_type ext6(0,std::min(dst.extent(6),src.extent(6)));
+        p_type ext7(0,std::min(dst.extent(7),src.extent(7)));
+        typedef CommonSubview<DstType,SrcType,8,Kokkos::Impl::ALL_t,
+                              p_type,p_type,p_type,p_type,p_type,p_type,
+                              p_type> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,Kokkos::ALL,
+                                       ext1,ext2,ext3,ext4,ext5,ext6,
+                                       ext7);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      }
+    } else {
+      if(dst.extent(7) == src.extent(7)) {
+        p_type ext0(0,std::min(dst.extent(0),src.extent(0)));
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        p_type ext4(0,std::min(dst.extent(4),src.extent(4)));
+        p_type ext5(0,std::min(dst.extent(5),src.extent(5)));
+        p_type ext6(0,std::min(dst.extent(6),src.extent(6)));
+        typedef CommonSubview<DstType,SrcType,8,p_type,
+                              p_type,p_type,p_type,p_type,p_type,p_type,
+                              Kokkos::Impl::ALL_t> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,ext0,
+                                       ext1,ext2,ext3,ext4,ext5,ext6,
+                                       Kokkos::ALL);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      } else {
+        p_type ext0(0,std::min(dst.extent(0),src.extent(0)));
+        p_type ext1(0,std::min(dst.extent(1),src.extent(1)));
+        p_type ext2(0,std::min(dst.extent(2),src.extent(2)));
+        p_type ext3(0,std::min(dst.extent(3),src.extent(3)));
+        p_type ext4(0,std::min(dst.extent(4),src.extent(4)));
+        p_type ext5(0,std::min(dst.extent(5),src.extent(5)));
+        p_type ext6(0,std::min(dst.extent(6),src.extent(6)));
+        p_type ext7(0,std::min(dst.extent(7),src.extent(7)));
+        typedef CommonSubview<DstType,SrcType,8,p_type,
+                              p_type,p_type,p_type,p_type,p_type,p_type,
+                              p_type> sv_adapter_type;
+        sv_adapter_type common_subview(dst,src,ext0,
+                                       ext1,ext2,ext3,ext4,ext5,ext6,
+                                       ext7);
+        view_copy(common_subview.dst_sub,common_subview.src_sub);
+      }
+    }
+  }
+};
+
+} // namespace Impl
+
+/** \brief  Deep copy a value from Host memory into a view.  */
+template< class DT , class ... DP >
+inline
+void deep_copy
+  ( const View<DT,DP...> & dst
+  , typename ViewTraits<DT,DP...>::const_value_type & value
+  , typename std::enable_if<
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value
+    >::type * = 0 )
+{
+  typedef View<DT,DP...> ViewType;
+  if(dst.data() == NULL ) {
+    Kokkos::fence();
+    return;
+  }
+
+  Kokkos::fence();
+  static_assert(
+    std::is_same< typename ViewType::non_const_value_type ,
+                  typename ViewType::value_type >::value
+    , "deep_copy requires non-const type" );
+
+  // If contiguous we can simply do a 1D flat loop
+  if(dst.span_is_contiguous()) {
+    typedef Kokkos::View<typename ViewType::value_type*,Kokkos::LayoutRight,
+        typename ViewType::device_type,Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+     ViewTypeFlat;
+
+    ViewTypeFlat dst_flat(dst.data(),dst.size());
+    Kokkos::Impl::ViewFill< ViewTypeFlat , Kokkos::LayoutLeft, typename ViewType::execution_space, ViewTypeFlat::Rank, int >( dst_flat , value );
+    Kokkos::fence();
+    return;
+  }
+
+  // Figure out iteration order to do the ViewFill
+  int64_t strides[ViewType::Rank+1];
+  dst.stride(strides);
+  Kokkos::Iterate iterate;
+  if        ( std::is_same<typename ViewType::array_layout,Kokkos::LayoutRight>::value ) {
+    iterate = Kokkos::Iterate::Right;
+  } else if ( std::is_same<typename ViewType::array_layout,Kokkos::LayoutLeft>::value ) {
+    iterate = Kokkos::Iterate::Left;
+  } else if ( std::is_same<typename ViewType::array_layout,Kokkos::LayoutStride>::value ) {
+    if( strides[0] > strides[ViewType::Rank>0?ViewType::Rank-1:0] )
+      iterate = Kokkos::Iterate::Right;
+    else
+      iterate = Kokkos::Iterate::Left;
+  } else {
+    if( std::is_same<typename ViewType::execution_space::array_layout, Kokkos::LayoutRight>::value )
+      iterate = Kokkos::Iterate::Right;
+    else
+      iterate = Kokkos::Iterate::Left;
+  }
+
+  // Call the right ViewFill functor based on the index type needed and the iteration order
+  if(dst.span() > std::numeric_limits<int>::max()) {
+    if(iterate == Kokkos::Iterate::Right)
+      Kokkos::Impl::ViewFill< ViewType, Kokkos::LayoutRight, typename ViewType::execution_space, ViewType::Rank, int64_t >( dst , value );
+    else
+      Kokkos::Impl::ViewFill< ViewType, Kokkos::LayoutLeft, typename ViewType::execution_space, ViewType::Rank, int64_t >( dst , value );
+  } else {
+    if(iterate == Kokkos::Iterate::Right)
+      Kokkos::Impl::ViewFill< ViewType, Kokkos::LayoutRight, typename ViewType::execution_space, ViewType::Rank, int >( dst , value );
+    else
+      Kokkos::Impl::ViewFill< ViewType, Kokkos::LayoutLeft, typename ViewType::execution_space, ViewType::Rank, int >( dst , value );
+  }
+  Kokkos::fence();
+}
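+
+// Usage sketch (illustrative only; the view name, element type, and extent
+// are arbitrary):
+//
+//   Kokkos::View<double*> a("a", 100);
+//   Kokkos::deep_copy(a, 3.14);   // every entry a(i) is set to 3.14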
+
+/** \brief  Deep copy into a value in Host memory from a view.  */
+template< class ST , class ... SP >
+inline
+void deep_copy
+  ( typename ViewTraits<ST,SP...>::non_const_value_type & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value
+    >::type * = 0 )
+{
+  static_assert( ViewTraits<ST,SP...>::rank == 0
+               , "ERROR: Non-rank-zero view in deep_copy( value , View )" );
+
+  if(src.data() == NULL) {
+    Kokkos::fence();
+    return;
+  }
+
+  typedef ViewTraits<ST,SP...>               src_traits ;
+  typedef typename src_traits::memory_space  src_memory_space ;
+  Kokkos::Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.data() , sizeof(ST) );
+}
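+
+// Usage sketch (illustrative only; assumes a rank-0 view, e.g. the result of
+// a reduction, whose single entry is copied back into a host scalar):
+//
+//   Kokkos::View<double> r("result");   // rank-0 view
+//   double host_value = 0.0;
+//   Kokkos::deep_copy(host_value, r);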
+
+//----------------------------------------------------------------------------
+/** \brief  A deep copy between views of compatible type, and rank zero.  */
+template< class DT , class ... DP , class ST , class ... SP >
+inline
+void deep_copy
+  ( const View<DT,DP...> & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<(
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
+    ( unsigned(ViewTraits<DT,DP...>::rank) == unsigned(0) &&
+      unsigned(ViewTraits<ST,SP...>::rank) == unsigned(0) )
+  )>::type * = 0 )
+{
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                  typename ViewTraits<ST,SP...>::non_const_value_type >::value
+    , "deep_copy requires matching non-const destination type" );
+
+  if(dst.data() == NULL && src.data() == NULL) {
+    Kokkos::fence();
+    return;
+  }
+
+  typedef View<DT,DP...>  dst_type ;
+  typedef View<ST,SP...>  src_type ;
+
+  typedef typename dst_type::value_type    value_type ;
+  typedef typename dst_type::memory_space  dst_memory_space ;
+  typedef typename src_type::memory_space  src_memory_space ;
+
+  Kokkos::fence();
+  if ( dst.data() != src.data() ) {
+    Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , sizeof(value_type) );
+    Kokkos::fence();
+  }
+}
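+
+// Usage sketch (illustrative only; both views are rank 0 and the value types
+// match):
+//
+//   Kokkos::View<double> a("a"), b("b");
+//   Kokkos::deep_copy(b, 1.0);
+//   Kokkos::deep_copy(a, b);   // a now holds the same single value as b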
+
+//----------------------------------------------------------------------------
+/** \brief  A deep copy between views of the default specialization, compatible type,
+ *          same non-zero rank, same contiguous layout.
+ */
+template< class DT , class ... DP , class ST , class ... SP >
+inline
+void deep_copy
+  ( const View<DT,DP...> & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<(
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
+    ( unsigned(ViewTraits<DT,DP...>::rank) != 0 ||
+      unsigned(ViewTraits<ST,SP...>::rank) != 0 )
+  )>::type * = 0 )
+{
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                  typename ViewTraits<DT,DP...>::non_const_value_type >::value
+    , "deep_copy requires non-const destination type" );
+
+  static_assert(
+    ( unsigned(ViewTraits<DT,DP...>::rank) ==
+      unsigned(ViewTraits<ST,SP...>::rank) )
+    , "deep_copy requires Views of equal rank" );
+
+  typedef View<DT,DP...>  dst_type ;
+  typedef View<ST,SP...>  src_type ;
+
+  typedef typename dst_type::execution_space  dst_execution_space ;
+  typedef typename src_type::execution_space  src_execution_space ;
+  typedef typename dst_type::memory_space     dst_memory_space ;
+  typedef typename src_type::memory_space     src_memory_space ;
+  typedef typename dst_type::value_type       dst_value_type ;
+  typedef typename src_type::value_type       src_value_type ;
+  if(dst.data() == NULL && src.data() == NULL) {
+    Kokkos::fence();
+    return;
+  }
+
+  enum { DstExecCanAccessSrc =
+   Kokkos::Impl::SpaceAccessibility< dst_execution_space , src_memory_space >::accessible };
+
+  enum { SrcExecCanAccessDst =
+   Kokkos::Impl::SpaceAccessibility< src_execution_space , dst_memory_space >::accessible };
+
+
+  // Checking for Overlapping Views.
+  dst_value_type* dst_start = dst.data();
+  dst_value_type* dst_end   = dst.data() + dst.span();
+  src_value_type* src_start = src.data();
+  src_value_type* src_end   = src.data() + src.span();
+  if( ((std::ptrdiff_t)dst_start == (std::ptrdiff_t)src_start) &&
+      ((std::ptrdiff_t)dst_end   == (std::ptrdiff_t)src_end)   &&
+       (dst.span_is_contiguous() && src.span_is_contiguous()) ) {
+    Kokkos::fence();
+    return;
+  }
+
+  if( ( ( (std::ptrdiff_t)dst_start < (std::ptrdiff_t)src_end ) && ( (std::ptrdiff_t)dst_end > (std::ptrdiff_t)src_start ) ) &&
+      ( ( dst.span_is_contiguous() && src.span_is_contiguous() ))) {
+    std::string message("Error: Kokkos::deep_copy of overlapping views: ");
+    message += dst.label(); message += "(";
+    message += std::to_string((std::ptrdiff_t)dst_start); message += ",";
+    message += std::to_string((std::ptrdiff_t)dst_end); message += ") ";
+    message += src.label(); message += "(";
+    message += std::to_string((std::ptrdiff_t)src_start); message += ",";
+    message += std::to_string((std::ptrdiff_t)src_end); message += ") ";
+    Kokkos::Impl::throw_runtime_exception(message);
+  }
+
+  // Check for same extents
+  if ( (src.extent(0) != dst.extent(0)) ||
+       (src.extent(1) != dst.extent(1)) ||
+       (src.extent(2) != dst.extent(2)) ||
+       (src.extent(3) != dst.extent(3)) ||
+       (src.extent(4) != dst.extent(4)) ||
+       (src.extent(5) != dst.extent(5)) ||
+       (src.extent(6) != dst.extent(6)) ||
+       (src.extent(7) != dst.extent(7))
+     ) {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+    Kokkos::fence();
+    if ( DstExecCanAccessSrc ) {
+      // Copy data between views in accessible memory spaces whose spans are non-contiguous or whose shapes do not match.
+      Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
+    }
+    else if ( SrcExecCanAccessDst ) {
+      // Copy data between views in accessible memory spaces whose spans are non-contiguous or whose shapes do not match.
+      Kokkos::Impl::ViewRemap< dst_type , src_type , src_execution_space >( dst , src );
+    }
+    else {
+      Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
+    }
+    Kokkos::fence();
+    return;
+#else
+    std::string message("Deprecation Error: Kokkos::deep_copy extents of views don't match: ");
+    message += dst.label(); message += "(";
+    for(int r = 0; r<dst_type::Rank-1; r++)
+      { message+= std::to_string(dst.extent(r)); message += ","; }
+    message+= std::to_string(dst.extent(dst_type::Rank-1)); message += ") ";
+    message += src.label(); message += "(";
+    for(int r = 0; r<src_type::Rank-1; r++)
+      { message+= std::to_string(src.extent(r)); message += ","; }
+    message+= std::to_string(src.extent(src_type::Rank-1)); message += ") ";
+
+    Kokkos::Impl::throw_runtime_exception(message);
+#endif
+  }
+
+  // If same type, equal layout, equal dimensions, equal span, and contiguous memory, then we can do a byte-wise copy
+
+  if ( std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                     typename ViewTraits<ST,SP...>::non_const_value_type >::value &&
+       (
+         std::is_same< typename ViewTraits<DT,DP...>::array_layout ,
+                       typename ViewTraits<ST,SP...>::array_layout >::value
+         ||
+         ( ViewTraits<DT,DP...>::rank == 1 &&
+           ViewTraits<ST,SP...>::rank == 1 )
+       ) &&
+       dst.span_is_contiguous() &&
+       src.span_is_contiguous() &&
+       ((ViewTraits<DT,DP...>::rank < 1) || (dst.stride_0() == src.stride_0()))  &&
+       ((ViewTraits<DT,DP...>::rank < 2) || (dst.stride_1() == src.stride_1())) &&
+       ((ViewTraits<DT,DP...>::rank < 3) || (dst.stride_2() == src.stride_2())) &&
+       ((ViewTraits<DT,DP...>::rank < 4) || (dst.stride_3() == src.stride_3())) &&
+       ((ViewTraits<DT,DP...>::rank < 5) || (dst.stride_4() == src.stride_4())) &&
+       ((ViewTraits<DT,DP...>::rank < 6) || (dst.stride_5() == src.stride_5())) &&
+       ((ViewTraits<DT,DP...>::rank < 7) || (dst.stride_6() == src.stride_6())) &&
+       ((ViewTraits<DT,DP...>::rank < 8) || (dst.stride_7() == src.stride_7()))
+    ) {
+    const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
+    Kokkos::fence();
+    if((void*)dst.data()!=(void*)src.data()) {
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >
+        ( dst.data() , src.data() , nbytes );
+    }
+    Kokkos::fence();
+  } else {
+    Kokkos::fence();
+    Impl::view_copy(dst,src);
+    Kokkos::fence();
+  }
+}
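+
+// Usage sketch (illustrative only; the extents of source and destination must
+// match, and a mirror view is the usual way to obtain a host-side copy):
+//
+//   Kokkos::View<double**> d("d", 100, 50);
+//   auto h = Kokkos::create_mirror_view(d);
+//   Kokkos::deep_copy(d, 2.0);   // fill on the device
+//   Kokkos::deep_copy(h, d);     // same layout and contiguous: byte-wise copy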
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Deep copy a value from Host memory into a view.  */
+template< class ExecSpace ,class DT , class ... DP >
+inline
+void deep_copy
+  ( const ExecSpace &
+  , const View<DT,DP...> & dst
+  , typename ViewTraits<DT,DP...>::const_value_type & value
+  , typename std::enable_if<
+    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value
+    >::type * = 0 )
+{
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::non_const_value_type ,
+                  typename ViewTraits<DT,DP...>::value_type >::value
+    , "deep_copy requires non-const type" );
+
+  ExecSpace::fence();
+  Kokkos::Impl::ViewFill< View<DT,DP...> >( dst , value );
+  ExecSpace::fence();
+}
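+
+// Usage sketch (illustrative only; a default-constructed instance of the
+// default execution space is passed here, but any enabled execution space
+// instance would do):
+//
+//   Kokkos::View<int*> a("a", 1000);
+//   Kokkos::deep_copy(Kokkos::DefaultExecutionSpace(), a, 7);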
+
+/** \brief  Deep copy into a value in Host memory from a view.  */
+template< class ExecSpace , class ST , class ... SP >
+inline
+void deep_copy
+  ( const ExecSpace & exec_space
+  , typename ViewTraits<ST,SP...>::non_const_value_type & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<
+    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value
+    >::type * = 0 )
+{
+  static_assert( ViewTraits<ST,SP...>::rank == 0
+               , "ERROR: Non-rank-zero view in deep_copy( value , View )" );
+
+  if(src.data() == NULL) {
+    exec_space.fence();
+    return;
+  }
+
+  typedef ViewTraits<ST,SP...>               src_traits ;
+  typedef typename src_traits::memory_space  src_memory_space ;
+  Kokkos::Impl::DeepCopy< HostSpace , src_memory_space , ExecSpace >
+    ( exec_space , & dst , src.data() , sizeof(ST) );
+}
+
+//----------------------------------------------------------------------------
+/** \brief  A deep copy between views of compatible type, and rank zero.  */
+template< class ExecSpace , class DT , class ... DP , class ST , class ... SP >
+inline
+void deep_copy
+  ( const ExecSpace & exec_space
+  , const View<DT,DP...> & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<(
+    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
+    ( unsigned(ViewTraits<DT,DP...>::rank) == unsigned(0) &&
+      unsigned(ViewTraits<ST,SP...>::rank) == unsigned(0) )
+  )>::type * = 0 )
+{
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                  typename ViewTraits<ST,SP...>::non_const_value_type >::value
+    , "deep_copy requires matching non-const destination type" );
+
+  typedef View<DT,DP...>  dst_type ;
+  typedef View<ST,SP...>  src_type ;
+
+  typedef typename dst_type::value_type    value_type ;
+  typedef typename dst_type::memory_space  dst_memory_space ;
+  typedef typename src_type::memory_space  src_memory_space ;
+  if(dst.data() == NULL && src.data() == NULL) {
+    exec_space.fence();
+    return;
+  }
+
+  exec_space.fence();
+  if ( dst.data() != src.data() ) {
+    Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace >
+      ( exec_space , dst.data() , src.data() , sizeof(value_type) );
+  }
+  exec_space.fence();
+}
+
+//----------------------------------------------------------------------------
+/** \brief  A deep copy between views of the default specialization, compatible type,
+ *          same non-zero rank
+ */
+template< class ExecSpace , class DT, class ... DP, class ST, class ... SP >
+inline
+void deep_copy
+  ( const ExecSpace & exec_space
+  , const View<DT,DP...> & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<(
+    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
+    ( unsigned(ViewTraits<DT,DP...>::rank) != 0 ||
+      unsigned(ViewTraits<ST,SP...>::rank) != 0 )
+  )>::type * = 0 )
+{
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                  typename ViewTraits<DT,DP...>::non_const_value_type >::value
+    , "deep_copy requires non-const destination type" );
+
+  static_assert(
+    ( unsigned(ViewTraits<DT,DP...>::rank) ==
+      unsigned(ViewTraits<ST,SP...>::rank) )
+    , "deep_copy requires Views of equal rank" );
+
+  typedef View<DT,DP...>  dst_type ;
+  typedef View<ST,SP...>  src_type ;
+
+  typedef typename dst_type::execution_space  dst_execution_space ;
+  typedef typename src_type::execution_space  src_execution_space ;
+  typedef typename dst_type::memory_space     dst_memory_space ;
+  typedef typename src_type::memory_space     src_memory_space ;
+  typedef typename dst_type::value_type       dst_value_type ;
+  typedef typename src_type::value_type       src_value_type ;
+
+  if(dst.data() == NULL && src.data() == NULL) {
+    exec_space.fence();
+    return;
+  }
+
+  enum { ExecCanAccessSrcDst =
+      Kokkos::Impl::SpaceAccessibility< ExecSpace , dst_memory_space >::accessible &&
+      Kokkos::Impl::SpaceAccessibility< ExecSpace , src_memory_space >::accessible
+  };
+  enum { DstExecCanAccessSrc =
+   Kokkos::Impl::SpaceAccessibility< dst_execution_space , src_memory_space >::accessible };
+
+  enum { SrcExecCanAccessDst =
+   Kokkos::Impl::SpaceAccessibility< src_execution_space , dst_memory_space >::accessible };
+
+  // Checking for Overlapping Views.
+  dst_value_type* dst_start = dst.data();
+  dst_value_type* dst_end   = dst.data() + dst.span();
+  src_value_type* src_start = src.data();
+  src_value_type* src_end   = src.data() + src.span();
+  if( ( ( (std::ptrdiff_t)dst_start < (std::ptrdiff_t)src_end ) && ( (std::ptrdiff_t)dst_end > (std::ptrdiff_t)src_start ) ) &&
+      ( ( dst.span_is_contiguous() && src.span_is_contiguous() ))) {
+    std::string message("Error: Kokkos::deep_copy of overlapping views: ");
+    message += dst.label(); message += "(";
+    message += std::to_string((std::ptrdiff_t)dst_start); message += ",";
+    message += std::to_string((std::ptrdiff_t)dst_end); message += ") ";
+    message += src.label(); message += "(";
+    message += std::to_string((std::ptrdiff_t)src_start); message += ",";
+    message += std::to_string((std::ptrdiff_t)src_end); message += ") ";
+    Kokkos::Impl::throw_runtime_exception(message);
+  }
+
+  // Check for same extents
+  if ( (src.extent(0) != dst.extent(0)) ||
+       (src.extent(1) != dst.extent(1)) ||
+       (src.extent(2) != dst.extent(2)) ||
+       (src.extent(3) != dst.extent(3)) ||
+       (src.extent(4) != dst.extent(4)) ||
+       (src.extent(5) != dst.extent(5)) ||
+       (src.extent(6) != dst.extent(6)) ||
+       (src.extent(7) != dst.extent(7))
+     ) {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+    exec_space.fence();
+    if ( ExecCanAccessSrcDst ) {
+      Kokkos::Impl::ViewRemap< dst_type , src_type , ExecSpace >( dst , src );
+    }
+    else if ( DstExecCanAccessSrc ) {
+      // Copy data between views in accessible memory spaces whose spans are non-contiguous or whose shapes do not match.
+      Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
+    }
+    else if ( SrcExecCanAccessDst ) {
+      // Copy data between views in accessible memory spaces whose spans are non-contiguous or whose shapes do not match.
+      Kokkos::Impl::ViewRemap< dst_type , src_type , src_execution_space >( dst , src );
+    }
+    else {
+      Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
+    }
+    exec_space.fence();
+    return;
+#else
+    std::string message("Deprecation Error: Kokkos::deep_copy extents of views don't match: ");
+    message += dst.label(); message += "(";
+    for(int r = 0; r<dst_type::Rank-1; r++)
+      { message+= std::to_string(dst.extent(r)); message += ","; }
+    message+= std::to_string(dst.extent(dst_type::Rank-1)); message += ") ";
+    message += src.label(); message += "(";
+    for(int r = 0; r<src_type::Rank-1; r++)
+      { message+= std::to_string(src.extent(r)); message += ","; }
+    message+= std::to_string(src.extent(src_type::Rank-1)); message += ") ";
+
+    Kokkos::Impl::throw_runtime_exception(message);
+#endif
+  }
+
+  // If same type, equal layout, equal dimensions, equal span, and contiguous memory, then we can do a byte-wise copy
+
+  if ( std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                     typename ViewTraits<ST,SP...>::non_const_value_type >::value &&
+       (
+         std::is_same< typename ViewTraits<DT,DP...>::array_layout ,
+                       typename ViewTraits<ST,SP...>::array_layout >::value
+         ||
+         ( ViewTraits<DT,DP...>::rank == 1 &&
+           ViewTraits<ST,SP...>::rank == 1 )
+       ) &&
+       dst.span_is_contiguous() &&
+       src.span_is_contiguous() &&
+       ((ViewTraits<DT,DP...>::rank < 1) || (dst.stride_0() == src.stride_0()))  &&
+       ((ViewTraits<DT,DP...>::rank < 2) || (dst.stride_1() == src.stride_1())) &&
+       ((ViewTraits<DT,DP...>::rank < 3) || (dst.stride_2() == src.stride_2())) &&
+       ((ViewTraits<DT,DP...>::rank < 4) || (dst.stride_3() == src.stride_3())) &&
+       ((ViewTraits<DT,DP...>::rank < 5) || (dst.stride_4() == src.stride_4())) &&
+       ((ViewTraits<DT,DP...>::rank < 6) || (dst.stride_5() == src.stride_5())) &&
+       ((ViewTraits<DT,DP...>::rank < 7) || (dst.stride_6() == src.stride_6())) &&
+       ((ViewTraits<DT,DP...>::rank < 8) || (dst.stride_7() == src.stride_7()))
+    ) {
+
+    const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
+    exec_space.fence();
+    if((void*)dst.data() != (void*)src.data()) {
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace >
+        ( exec_space , dst.data() , src.data() , nbytes );
+    }
+    exec_space.fence();
+  } else {
+    exec_space.fence();
+    Impl::view_copy(dst,src);
+    exec_space.fence();
+  }
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Resize a view, copying old data to the new allocation at the corresponding indices. */
+template< class T , class ... P >
+inline
+typename std::enable_if<
+  std::is_same<typename Kokkos::View<T,P...>::array_layout,Kokkos::LayoutLeft>::value ||
+  std::is_same<typename Kokkos::View<T,P...>::array_layout,Kokkos::LayoutRight>::value
+>::type
+resize( Kokkos::View<T,P...> & v ,
+             const size_t n0 = 0 ,
+             const size_t n1 = 0 ,
+             const size_t n2 = 0 ,
+             const size_t n3 = 0 ,
+             const size_t n4 = 0 ,
+             const size_t n5 = 0 ,
+             const size_t n6 = 0 ,
+             const size_t n7 = 0 )
+{
+  typedef Kokkos::View<T,P...>  view_type ;
+
+  static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only resize managed views" );
+
+  // Fix #904 by checking dimensions before actually resizing.
+  //
+  // Rank is known at compile time, so hopefully the compiler will
+  // remove branches that are compile-time false.  The upcoming "if
+  // constexpr" language feature would make this certain.
+  if (view_type::Rank == 1 &&
+      n0 == static_cast<size_t> (v.extent(0))) {
+    return;
+  }
+  if (view_type::Rank == 2 &&
+      "transpose_row_map");
+      n1 == static_cast<size_t> (v.extent(1))) {
+    return;
+  }
+  if (view_type::Rank == 3 &&
+      n0 == static_cast<size_t> (v.extent(0)) &&
+      n1 == static_cast<size_t> (v.extent(1)) &&
+      n2 == static_cast<size_t> (v.extent(2))) {
+    return;
+  }
+  if (view_type::Rank == 4 &&
+      n0 == static_cast<size_t> (v.extent(0)) &&
+      n1 == static_cast<size_t> (v.extent(1)) &&
+      n2 == static_cast<size_t> (v.extent(2)) &&
+      n3 == static_cast<size_t> (v.extent(3))) {
+    return;
+  }
+  if (view_type::Rank == 5 &&
+      n0 == static_cast<size_t> (v.extent(0)) &&
+      n1 == static_cast<size_t> (v.extent(1)) &&
+      n2 == static_cast<size_t> (v.extent(2)) &&
+      n3 == static_cast<size_t> (v.extent(3)) &&
+      n4 == static_cast<size_t> (v.extent(4))) {
+    return;
+  }
+  if (view_type::Rank == 6 &&
+      n0 == static_cast<size_t> (v.extent(0)) &&
+      n1 == static_cast<size_t> (v.extent(1)) &&
+      n2 == static_cast<size_t> (v.extent(2)) &&
+      n3 == static_cast<size_t> (v.extent(3)) &&
+      n4 == static_cast<size_t> (v.extent(4)) &&
+      n5 == static_cast<size_t> (v.extent(5))) {
+    return;
+  }
+  if (view_type::Rank == 7 &&
+      n0 == static_cast<size_t> (v.extent(0)) &&
+      n1 == static_cast<size_t> (v.extent(1)) &&
+      n2 == static_cast<size_t> (v.extent(2)) &&
+      n3 == static_cast<size_t> (v.extent(3)) &&
+      n4 == static_cast<size_t> (v.extent(4)) &&
+      n5 == static_cast<size_t> (v.extent(5)) &&
+      n6 == static_cast<size_t> (v.extent(6))) {
+    return;
+  }
+  if (view_type::Rank == 8 &&
+      n0 == static_cast<size_t> (v.extent(0)) &&
+      n1 == static_cast<size_t> (v.extent(1)) &&
+      n2 == static_cast<size_t> (v.extent(2)) &&
+      n3 == static_cast<size_t> (v.extent(3)) &&
+      n4 == static_cast<size_t> (v.extent(4)) &&
+      n5 == static_cast<size_t> (v.extent(5)) &&
+      n6 == static_cast<size_t> (v.extent(6)) &&
+      n7 == static_cast<size_t> (v.extent(7))) {
+    return;
+  }
+  // If Kokkos ever supports Views of rank > 8, the above code will
+  // still be correct, because avoiding reallocation in resize() is
+  // just an optimization.
+
+  // TODO (mfh 27 Jun 2017) If the old View has enough space but just
+  // different dimensions (e.g., if the product of the dimensions,
+  // including extra space for alignment, will not change), then
+  // consider just reusing storage.  For now, Kokkos always
+  // reallocates if any of the dimensions change, even if the old View
+  // has enough space.
+
+  view_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 );
+
+  Kokkos::Impl::ViewRemap< view_type , view_type >( v_resized , v );
+
+  v = v_resized ;
+}
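+
+// Usage sketch (illustrative only; resize preserves existing entries at their
+// old indices and returns immediately when the requested extents already
+// match the current ones):
+//
+//   Kokkos::View<double*> a("a", 100);
+//   Kokkos::resize(a, 200);   // a(0..99) keep their values, a(100..199) are new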
+
+/** \brief  Resize a view, copying old data to the new allocation at the corresponding indices. */
+template< class T , class ... P >
+inline
+void resize(       Kokkos::View<T,P...> & v ,
+    const typename Kokkos::View<T,P...>::array_layout & layout)
+{
+  typedef Kokkos::View<T,P...>  view_type ;
+
+  static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only resize managed views" );
+
+  view_type v_resized( v.label(), layout );
+
+  Kokkos::Impl::ViewRemap< view_type , view_type >( v_resized , v );
+
+  v = v_resized ;
+}
+
+/** \brief  Resize a view, discarding the old data. */
+template< class T , class ... P >
+inline
+typename std::enable_if<
+  std::is_same<typename Kokkos::View<T,P...>::array_layout,Kokkos::LayoutLeft>::value ||
+  std::is_same<typename Kokkos::View<T,P...>::array_layout,Kokkos::LayoutRight>::value
+>::type
+realloc( Kokkos::View<T,P...> & v ,
+              const size_t n0 = 0 ,
+              const size_t n1 = 0 ,
+              const size_t n2 = 0 ,
+              const size_t n3 = 0 ,
+              const size_t n4 = 0 ,
+              const size_t n5 = 0 ,
+              const size_t n6 = 0 ,
+              const size_t n7 = 0 )
+{
+  typedef Kokkos::View<T,P...>  view_type ;
+
+  static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only realloc managed views" );
+
+  const std::string label = v.label();
+
+  v = view_type(); // Deallocate first, if this is the only view of the allocation
+  v = view_type( label, n0, n1, n2, n3, n4, n5, n6, n7 );
+}
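+
+// Usage sketch (illustrative only; unlike resize, realloc discards the old
+// contents and simply allocates a fresh view with the same label):
+//
+//   Kokkos::View<double*> a("a", 100);
+//   Kokkos::realloc(a, 200);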
+
+/** \brief  Resize a view, discarding the old data. */
+template< class T , class ... P >
+inline
+void realloc(      Kokkos::View<T,P...> & v ,
+    const typename Kokkos::View<T,P...>::array_layout & layout)
+{
+  typedef Kokkos::View<T,P...>  view_type ;
+
+  static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only realloc managed views" );
+
+  const std::string label = v.label();
+
+  v = view_type(); // Deallocate first, if this is the only view of the allocation
+  v = view_type( label, layout );
+}
+} /* namespace Kokkos */
+
+#endif
diff --git a/packages/kokkos/core/src/Kokkos_Core.hpp b/packages/kokkos/core/src/Kokkos_Core.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b4d664c8eaf1e41e8b792ed3e4e351c01c6e3009
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Core.hpp
@@ -0,0 +1,217 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_HPP
+#define KOKKOS_CORE_HPP
+
+//----------------------------------------------------------------------------
+// Include the execution space header files for the enabled execution spaces.
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_ENABLE_SERIAL )
+#include <Kokkos_Serial.hpp>
+#endif
+
+#if defined( KOKKOS_ENABLE_OPENMP )
+#include <Kokkos_OpenMP.hpp>
+#endif
+
+//#if defined( KOKKOS_ENABLE_OPENMPTARGET )
+#include <Kokkos_OpenMPTarget.hpp>
+#include <Kokkos_OpenMPTargetSpace.hpp>
+//#endif
+
+#if defined( KOKKOS_ENABLE_QTHREADS )
+#include <Kokkos_Qthreads.hpp>
+#endif
+
+#if defined( KOKKOS_ENABLE_THREADS )
+#include <Kokkos_Threads.hpp>
+#endif
+
+#if defined( KOKKOS_ENABLE_CUDA )
+#include <Kokkos_Cuda.hpp>
+#endif
+
+#if defined( KOKKOS_ENABLE_ROCM )
+#include <Kokkos_ROCm.hpp>
+#endif
+
+#include <Kokkos_AnonymousSpace.hpp>
+#include <Kokkos_Pair.hpp>
+#include <Kokkos_MemoryPool.hpp>
+#include <Kokkos_Array.hpp>
+#include <Kokkos_View.hpp>
+#include <Kokkos_Vectorization.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <Kokkos_hwloc.hpp>
+#include <Kokkos_Timer.hpp>
+
+#include <Kokkos_Complex.hpp>
+
+#include <Kokkos_CopyViews.hpp>
+#include <functional>
+#include <iosfwd>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+struct InitArguments {
+  int num_threads;
+  int num_numa;
+  int device_id;
+  bool disable_warnings;
+
+  InitArguments( int nt = -1
+               , int nn = -1
+               , int dv = -1
+               , bool dw = false
+               )
+    : num_threads{ nt }
+    , num_numa{ nn }
+    , device_id{ dv }
+    , disable_warnings{ dw }
+  {}
+};
+
+void initialize(int& narg, char* arg[]);
+
+void initialize(const InitArguments& args = InitArguments());
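+
+// Usage sketch (illustrative only; the thread count is arbitrary and only the
+// fields a program cares about need to be set):
+//
+//   Kokkos::InitArguments args;
+//   args.num_threads = 4;
+//   Kokkos::initialize(args);
+//   // ... use Kokkos ...
+//   Kokkos::finalize();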
+
+bool is_initialized() noexcept;
+
+bool show_warnings() noexcept;
+
+/** \brief  Finalize the spaces that were initialized via Kokkos::initialize */
+void finalize();
+
+/**
+ * \brief Push a user-defined function to be called in
+ *   Kokkos::finalize, before any Kokkos state is finalized.
+ *
+ * \warning Only call this after Kokkos::initialize, but before
+ *   Kokkos::finalize.
+ *
+ * This function is the Kokkos analog to std::atexit.  If you call
+ * this with a function f, then your function will get called when
+ * Kokkos::finalize is called.  Specifically, it will be called BEFORE
+ * Kokkos does any finalization.  This means that all execution
+ * spaces, memory spaces, etc. that were initialized will still be
+ * initialized when your function is called.
+ *
+ * Just like std::atexit, if you call push_finalize_hook in sequence
+ * with multiple functions (f, g, h), Kokkos::finalize will call them
+ * in reverse order (h, g, f), as if popping a stack.  Furthermore,
+ * just like std::atexit, if any of your functions throws but does not
+ * catch an exception, Kokkos::finalize will call std::terminate.
+ */
+void push_finalize_hook(std::function<void()> f);
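+
+// Usage sketch (illustrative only; hooks run in reverse order of registration,
+// before any Kokkos state is torn down):
+//
+//   Kokkos::push_finalize_hook([] { std::printf("ran second\n"); });
+//   Kokkos::push_finalize_hook([] { std::printf("ran first\n"); });
+//   Kokkos::finalize();   // calls the hooks in reverse order: "ran first", then "ran second"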
+
+/** \brief  Finalize all known execution spaces */
+void finalize_all();
+
+void fence();
+
+/** \brief Print "Bill of Materials" */
+void print_configuration( std::ostream & , const bool detail = false );
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/* Allocate memory from a memory space.
+ * The allocation is tracked in the Kokkos memory tracking system, so
+ * leaked memory can be identified.
+ */
+template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
+inline
+void * kokkos_malloc( const std::string & arg_alloc_label
+                    , const size_t arg_alloc_size )
+{
+  typedef typename Space::memory_space MemorySpace ;
+  return Impl::SharedAllocationRecord< MemorySpace >::
+    allocate_tracked( MemorySpace() , arg_alloc_label , arg_alloc_size );
+}
+
+template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
+inline
+void * kokkos_malloc( const size_t arg_alloc_size )
+{
+  typedef typename Space::memory_space MemorySpace ;
+  return Impl::SharedAllocationRecord< MemorySpace >::
+    allocate_tracked( MemorySpace() , "no-label" , arg_alloc_size );
+}
+
+template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
+inline
+void kokkos_free( void * arg_alloc )
+{
+  typedef typename Space::memory_space MemorySpace ;
+  return Impl::SharedAllocationRecord< MemorySpace >::
+    deallocate_tracked( arg_alloc );
+}
+
+template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
+inline
+void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
+{
+  typedef typename Space::memory_space MemorySpace ;
+  return Impl::SharedAllocationRecord< MemorySpace >::
+    reallocate_tracked( arg_alloc , arg_alloc_size );
+}
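+
+// Usage sketch (illustrative only; HostSpace is chosen so the buffer can be
+// touched directly from host code, and the label is used only for the memory
+// tracking system):
+//
+//   void* p = Kokkos::kokkos_malloc<Kokkos::HostSpace>("scratch", 1024);
+//   p = Kokkos::kokkos_realloc<Kokkos::HostSpace>(p, 2048);
+//   Kokkos::kokkos_free<Kokkos::HostSpace>(p);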
+
+} // namespace Kokkos
+
+#include <Kokkos_Crs.hpp>
+#include <Kokkos_WorkGraphPolicy.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
+
diff --git a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b0d4d8a49a8b8adecdb6468a855a2da611f8c66a
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -0,0 +1,305 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_FWD_HPP
+#define KOKKOS_CORE_FWD_HPP
+
+//----------------------------------------------------------------------------
+// Kokkos_Macros.hpp does introspection on configuration options and the
+// compiler environment, then sets a collection of #define macros.
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_Utilities.hpp>
+
+#include <Kokkos_UniqueToken.hpp>
+#include <Kokkos_MasterLock.hpp>
+
+//----------------------------------------------------------------------------
+// A 64-bit build (8-byte pointers) is assumed throughout the code base.
+
+static_assert( sizeof(void*) == 8
+             , "Kokkos assumes 64-bit build; i.e., 8-byte pointers" );
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+struct AUTO_t {
+  KOKKOS_INLINE_FUNCTION
+  constexpr const AUTO_t & operator()() const { return *this; }
+};
+
+namespace {
+/**\brief Token to indicate that a parameter's value is to be automatically selected */
+constexpr AUTO_t AUTO = Kokkos::AUTO_t();
+}
+
+struct InvalidType {};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+// Forward declarations for class inter-relationships
+
+namespace Kokkos {
+
+class HostSpace; ///< Memory space for main process and CPU execution spaces
+
+#ifdef KOKKOS_ENABLE_HBWSPACE
+namespace Experimental {
+class HBWSpace; ///< Memory space for hbw_malloc from memkind (e.g., for the KNL processor)
+}
+#endif
+
+#if defined( KOKKOS_ENABLE_SERIAL )
+class Serial;    ///< Execution space for the main process on the CPU.
+#endif
+
+#if defined( KOKKOS_ENABLE_QTHREADS )
+class Qthreads;  ///< Execution space with Qthreads back-end.
+#endif
+
+#if defined( KOKKOS_ENABLE_THREADS )
+class Threads;   ///< Execution space with pthreads back-end.
+#endif
+
+#if defined( KOKKOS_ENABLE_OPENMP )
+class OpenMP;    ///< OpenMP execution space.
+#endif
+
+#if defined( KOKKOS_ENABLE_OPENMPTARGET )
+namespace Experimental {
+class OpenMPTarget;    ///< OpenMPTarget execution space.
+class OpenMPTargetSpace;
+}
+#endif
+
+
+#if defined( KOKKOS_ENABLE_CUDA )
+class CudaSpace;            ///< Memory space on Cuda GPU
+class CudaUVMSpace;         ///< Memory space on Cuda GPU with UVM
+class CudaHostPinnedSpace;  ///< Memory space on Host accessible to Cuda GPU
+class Cuda;                 ///< Execution space for Cuda GPU
+#endif
+
+#if defined( KOKKOS_ENABLE_ROCM )
+namespace Experimental {
+class ROCmSpace ;            ///< Memory space on ROCm GPU
+class ROCm ;                 ///< Execution space for ROCm GPU
+}
+#endif
+
+template<class ExecutionSpace, class MemorySpace>
+struct Device;
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+// Set the default execution space.
+
+/// Define Kokkos::DefaultExecutionSpace as per configuration option
+/// or chosen from the enabled execution spaces in the following order:
+/// Kokkos::Cuda, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial
+
+namespace Kokkos {
+
+#if   defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
+  typedef Cuda DefaultExecutionSpace;
+#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
+  typedef Experimental::OpenMPTarget DefaultExecutionSpace ;
+#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM )
+  typedef Experimental::ROCm DefaultExecutionSpace ;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef OpenMP DefaultExecutionSpace;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Threads DefaultExecutionSpace;
+//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+//  typedef Qthreads DefaultExecutionSpace;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
+  typedef Serial DefaultExecutionSpace;
+#else
+#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
+#endif
+
+#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef OpenMP DefaultHostExecutionSpace;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Threads DefaultHostExecutionSpace;
+//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+//  typedef Qthreads DefaultHostExecutionSpace;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
+  typedef Serial DefaultHostExecutionSpace;
+#elif defined( KOKKOS_ENABLE_OPENMP )
+  typedef OpenMP DefaultHostExecutionSpace;
+#elif defined( KOKKOS_ENABLE_THREADS )
+  typedef Threads DefaultHostExecutionSpace;
+//#elif defined( KOKKOS_ENABLE_QTHREADS )
+//  typedef Qthreads DefaultHostExecutionSpace;
+#elif defined( KOKKOS_ENABLE_SERIAL )
+  typedef Serial DefaultHostExecutionSpace;
+#else
+#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
+#endif
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+// Detect the active execution space and define its memory space.
+// This is used to verify whether a running kernel can access
+// a given memory space.
+
+namespace Kokkos {
+
+namespace Impl {
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined( KOKKOS_ENABLE_CUDA )
+typedef Kokkos::CudaSpace  ActiveExecutionMemorySpace;
+#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU )
+typedef Kokkos::HostSpace  ActiveExecutionMemorySpace ;
+#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+typedef Kokkos::HostSpace  ActiveExecutionMemorySpace;
+#else
+typedef void ActiveExecutionMemorySpace;
+#endif
+
+template< class ActiveSpace, class MemorySpace >
+struct VerifyExecutionCanAccessMemorySpace {
+  enum {value = 0};
+};
+
+template< class Space >
+struct VerifyExecutionCanAccessMemorySpace< Space, Space >
+{
+  enum {value = 1};
+  KOKKOS_INLINE_FUNCTION static void verify(void) {}
+  KOKKOS_INLINE_FUNCTION static void verify(const void *) {}
+};
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE, DATA_PTR ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
+    Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE >::verify( DATA_PTR )
+
+#define KOKKOS_RESTRICT_EXECUTION_TO_( DATA_SPACE ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
+    Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE >::verify()
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+  void fence();
+}
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+template<class ViewType, class Layout = typename ViewType::array_layout,
+         class ExecSpace = typename ViewType::execution_space, int Rank = ViewType::Rank, typename iType = int64_t>
+struct ViewFill;
+
+template<class ViewTypeA,class ViewTypeB, class Layout, class ExecSpace, int Rank, typename iType>
+struct ViewCopy;
+
+template< class Functor
+        , class Policy
+        , class EnableFunctor = void
+        , class EnablePolicy = void
+        >
+struct FunctorPolicyExecutionSpace;
+
+//----------------------------------------------------------------------------
+/// \class ParallelFor
+/// \brief Implementation of the ParallelFor operator that has a
+///   partial specialization for the device.
+///
+/// This is an implementation detail of parallel_for.  Users should
+/// skip this and go directly to the nonmember function parallel_for.
+template< class FunctorType, class ExecPolicy, class ExecutionSpace =
+          typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
+        > class ParallelFor;
+
+/// \class ParallelReduce
+/// \brief Implementation detail of parallel_reduce.
+///
+/// This is an implementation detail of parallel_reduce.  Users should
+/// skip this and go directly to the nonmember function parallel_reduce.
+template< class FunctorType, class ExecPolicy, class ReducerType = InvalidType, class ExecutionSpace =
+          typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
+        > class ParallelReduce;
+
+/// \class ParallelScan
+/// \brief Implementation detail of parallel_scan.
+///
+/// This is an implementation detail of parallel_scan.  Users should
+/// skip this and go directly to the documentation of the nonmember
+/// template function Kokkos::parallel_scan.
+template< class FunctorType, class ExecPolicy, class ExecutionSpace =
+          typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
+        > class ParallelScan;
+
+} // namespace Impl
+
+namespace Experimental {
+template<class ScalarType , class Space = HostSpace> struct Sum;
+template<class ScalarType , class Space = HostSpace> struct Prod;
+template<class ScalarType , class Space = HostSpace> struct Min;
+template<class ScalarType , class Space = HostSpace> struct Max;
+template<class ScalarType , class Space = HostSpace> struct MinMax;
+template<class ScalarType , class Index, class Space = HostSpace> struct MinLoc;
+template<class ScalarType , class Index, class Space = HostSpace> struct MaxLoc;
+template<class ScalarType , class Index, class Space = HostSpace> struct MinMaxLoc;
+template<class ScalarType , class Space = HostSpace> struct BAnd;
+template<class ScalarType , class Space = HostSpace> struct BOr;
+template<class ScalarType , class Space = HostSpace> struct LAnd;
+template<class ScalarType , class Space = HostSpace> struct LOr;
+}
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_CORE_FWD_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_Crs.hpp b/packages/kokkos/core/src/Kokkos_Crs.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..09b0d666a1f8c10c0bd3f616fc333515ac688b1d
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Crs.hpp
@@ -0,0 +1,456 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CRS_HPP
+#define KOKKOS_CRS_HPP
+
+namespace Kokkos {
+
+/// \class Crs
+/// \brief Compressed row storage array.
+///
+/// \tparam DataType The type of stored entries.  If a Crs is
+///   used as the graph of a sparse matrix, then this is usually an
+///   integer type, the type of the column indices in the sparse
+///   matrix.
+///
+/// \tparam Arg1Type The second template parameter, corresponding
+///   either to the Device type (if there are no more template
+///   parameters) or to the Layout type (if there is at least one more
+///   template parameter).
+///
+/// \tparam Arg2Type The third template parameter, which if provided
+///   corresponds to the Device type.
+///
+/// \tparam SizeType The type of row offsets.  Usually the default
+///   parameter suffices.  However, setting a nondefault value is
+///   necessary in some cases, for example, if you want to have a
+///   sparse matrix with dimensions (and therefore column indices)
+///   that fit in \c int, but want to store more than <tt>INT_MAX</tt>
+///   entries in the sparse matrix.
+///
+/// A row has a range of entries:
+/// <ul>
+/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li>
+/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li>
+/// <li> <tt> entries( entry ,            i2 , i3 , ... ); </tt> </li>
+/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li>
+/// </ul>
+template< class DataType,
+          class Arg1Type,
+          class Arg2Type = void,
+          typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void >::size_type>
+class Crs {
+protected:
+  typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits;
+
+public:
+  typedef DataType                                            data_type;
+  typedef typename traits::array_layout                       array_layout;
+  typedef typename traits::execution_space                    execution_space;
+  typedef typename traits::memory_space                       memory_space;
+  typedef typename traits::device_type                        device_type;
+  typedef SizeType                                            size_type;
+
+  typedef Crs< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type;
+  typedef Crs< DataType , array_layout , typename traits::host_mirror_space , SizeType > HostMirror;
+  typedef View<size_type* , array_layout, device_type> row_map_type;
+  typedef View<DataType*  , array_layout, device_type> entries_type;
+
+  row_map_type row_map;
+  entries_type entries;
+
+  //! Construct an empty view.
+  Crs() : row_map(), entries() {}
+
+  //! Copy constructor (shallow copy).
+  Crs(const Crs& rhs) : row_map(rhs.row_map), entries(rhs.entries)
+  {}
+
+  template<class EntriesType, class RowMapType>
+  Crs(const RowMapType& row_map_, const EntriesType& entries_) : row_map(row_map_), entries(entries_)
+  {}
+
+  /** \brief  Assign to a view of the rhs array.
+   *          If the old view is the last view of its allocation,
+   *          then the allocated memory is deallocated.
+   */
+  Crs& operator= (const Crs& rhs) {
+    row_map = rhs.row_map;
+    entries = rhs.entries;
+    return *this;
+  }
+
+  /**  \brief  Destroy this view of the array.
+   *           If this is the last view, then the allocated memory is deallocated.
+   */
+  ~Crs() {}
+
+  /**  \brief  Return number of rows in the graph
+   */
+  KOKKOS_INLINE_FUNCTION
+  size_type numRows() const {
+    return (row_map.extent(0) != 0) ?
+      row_map.extent(0) - static_cast<size_type> (1) :
+      static_cast<size_type> (0);
+  }
+};
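+
+// Worked example of the storage scheme (illustrative only): a graph with
+// three rows, where row 0 -> {1,2}, row 1 -> {} and row 2 -> {0}, is stored as
+//
+//   row_map = { 0, 2, 2, 3 }   // numRows() == 3
+//   entries = { 1, 2, 0 }
+//
+// so the entries of row i0 are entries(row_map(i0)) .. entries(row_map(i0+1)-1).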
+
+/*--------------------------------------------------------------------------*/
+
+template< class OutCounts,
+          class DataType,
+          class Arg1Type,
+          class Arg2Type,
+          class SizeType>
+void get_crs_transpose_counts(
+    OutCounts& out,
+    Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in,
+    std::string const& name = "transpose_counts");
+
+template< class OutCounts,
+          class InCrs>
+typename OutCounts::value_type get_crs_row_map_from_counts(
+    OutCounts& out,
+    InCrs const& in,
+    std::string const& name = "row_map");
+
+template< class DataType,
+          class Arg1Type,
+          class Arg2Type,
+          class SizeType>
+void transpose_crs(
+    Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
+    Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in);
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template <class InCrs, class OutCounts>
+class GetCrsTransposeCounts {
+ public:
+  using execution_space = typename InCrs::execution_space;
+  using self_type = GetCrsTransposeCounts<InCrs, OutCounts>;
+  using index_type = typename InCrs::size_type;
+ private:
+  InCrs in;
+  OutCounts out;
+ public:
+  KOKKOS_INLINE_FUNCTION
+  void operator()(index_type i) const {
+    atomic_increment( &out[in.entries(i)] );
+  }
+  GetCrsTransposeCounts(InCrs const& arg_in, OutCounts const& arg_out):
+    in(arg_in),out(arg_out) {
+    using policy_type = RangePolicy<index_type, execution_space>;
+    using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
+    const closure_type closure(*this, policy_type(0, index_type(in.entries.size())));
+    closure.execute();
+    execution_space::fence();
+  }
+};
+
+template <class InCounts, class OutRowMap>
+class CrsRowMapFromCounts {
+ public:
+  using execution_space = typename InCounts::execution_space;
+  using value_type = typename OutRowMap::value_type;
+  using index_type = typename InCounts::size_type;
+  using last_value_type = Kokkos::View<value_type, execution_space>;
+ private:
+  InCounts m_in;
+  OutRowMap m_out;
+  last_value_type m_last_value;
+ public:
+  KOKKOS_INLINE_FUNCTION
+  void operator()(index_type i, value_type& update, bool final_pass) const {
+    if (i < m_in.size()) {
+      update += m_in(i);
+      if (final_pass) m_out(i + 1) = update;
+    } else if (final_pass) {
+      m_out(0) = 0;
+      m_last_value() = update;
+    }
+  }
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& update) const { update = 0; }
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& update, const volatile value_type& input) const {
+    update += input;
+  }
+  using self_type = CrsRowMapFromCounts<InCounts, OutRowMap>;
+  CrsRowMapFromCounts(InCounts const& arg_in, OutRowMap const& arg_out):
+    m_in(arg_in), m_out(arg_out), m_last_value("last_value") {
+  }
+  value_type execute() {
+    using policy_type = RangePolicy<index_type, execution_space>;
+    using closure_type = Kokkos::Impl::ParallelScan<self_type, policy_type>;
+    closure_type closure(*this, policy_type(0, m_in.size() + 1));
+    closure.execute();
+    auto last_value = Kokkos::create_mirror_view(m_last_value);
+    Kokkos::deep_copy(last_value, m_last_value);
+    return last_value();
+  }
+};
+
+template <class InCrs, class OutCrs>
+class FillCrsTransposeEntries {
+ public:
+  using execution_space = typename InCrs::execution_space;
+  using memory_space = typename InCrs::memory_space;
+  using value_type = typename OutCrs::entries_type::value_type;
+  using index_type = typename InCrs::size_type;
+ private:
+  using counters_type = View<index_type*, memory_space>;
+  InCrs in;
+  OutCrs out;
+  counters_type counters;
+ public:
+  KOKKOS_INLINE_FUNCTION
+  void operator()(index_type i) const {
+    auto begin = in.row_map(i);
+    auto end = in.row_map(i + 1);
+    for (auto j = begin; j < end; ++j) {
+      auto ti = in.entries(j);
+      auto tbegin = out.row_map(ti);
+      auto tj = atomic_fetch_add( &counters(ti), 1 );
+      out.entries( tbegin + tj ) = i;
+    }
+  }
+  using self_type = FillCrsTransposeEntries<InCrs, OutCrs>;
+  FillCrsTransposeEntries(InCrs const& arg_in, OutCrs const& arg_out):
+    in(arg_in),out(arg_out),
+    counters("counters", arg_out.numRows()) {
+    using policy_type = RangePolicy<index_type, execution_space>;
+    using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
+    const closure_type closure(*this, policy_type(0, index_type(in.numRows())));
+    closure.execute();
+    execution_space::fence();
+  }
+};
+
+}} // namespace Kokkos::Impl
+
+/*--------------------------------------------------------------------------*/
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+template< class OutCounts,
+          class DataType,
+          class Arg1Type,
+          class Arg2Type,
+          class SizeType>
+void get_crs_transpose_counts(
+    OutCounts& out,
+    Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in,
+    std::string const& name) {
+  using InCrs = Crs<DataType, Arg1Type, Arg2Type, SizeType>;
+  out = OutCounts(name, in.numRows());
+  Kokkos::Impl::GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
+}
+
+template< class OutRowMap,
+          class InCounts>
+typename OutRowMap::value_type get_crs_row_map_from_counts(
+    OutRowMap& out,
+    InCounts const& in,
+    std::string const& name) {
+  out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
+  Kokkos::Impl::CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
+  return functor.execute();
+}
+
+template< class DataType,
+          class Arg1Type,
+          class Arg2Type,
+          class SizeType>
+void transpose_crs(
+    Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
+    Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in)
+{
+  typedef Crs<DataType, Arg1Type, Arg2Type, SizeType> crs_type ;
+  typedef typename crs_type::memory_space             memory_space ;
+  typedef View<SizeType*, memory_space>               counts_type ;
+  {
+  counts_type counts;
+  Kokkos::get_crs_transpose_counts(counts, in);
+  Kokkos::get_crs_row_map_from_counts(out.row_map, counts,
+      "tranpose_row_map");
+  }
+  out.entries = decltype(out.entries)("transpose_entries", in.entries.size());
+  Kokkos::Impl::
+    FillCrsTransposeEntries<crs_type, crs_type> entries_functor(in, out);
+}
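+
+/* Hedged worked example (added commentary, not part of the original source):
+   transposing a 3-row graph with row_map = {0, 2, 3, 3} and entries = {1, 2, 0}
+   (row 0 -> {1, 2}, row 1 -> {0}, row 2 -> {}) yields counts {1, 1, 1},
+   row_map = {0, 1, 2, 3}, and entries = {1, 0, 0}
+   (row 0 -> {1}, row 1 -> {0}, row 2 -> {0}). */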
+
+template< class CrsType,
+          class Functor,
+          class ExecutionSpace = typename CrsType::execution_space>
+struct CountAndFillBase;
+
+template< class CrsType,
+          class Functor,
+          class ExecutionSpace>
+struct CountAndFillBase {
+  using data_type = typename CrsType::size_type;
+  using size_type = typename CrsType::size_type;
+  using row_map_type = typename CrsType::row_map_type;
+  using counts_type = row_map_type;
+  CrsType m_crs;
+  Functor m_functor;
+  counts_type m_counts;
+  struct Count {};
+  inline void operator()(Count, size_type i) const {
+    m_counts(i) = m_functor(i, nullptr);
+  }
+  struct Fill {};
+  inline void operator()(Fill, size_type i) const {
+    auto j = m_crs.row_map(i);
+    /* We don't want to access entries(entries.size()), even if it's just to get
+       its address and never use it.  This can happen when row (i) is empty and
+       all rows after it are also empty.  We could compare to row_map(i + 1),
+       but that is a read from global memory, whereas extent(0) should be part
+       of the View in registers (or constant memory). */
+    data_type* fill =
+      (j == static_cast<decltype(j)>(m_crs.entries.extent(0))) ?
+      nullptr : (&(m_crs.entries(j)));
+    m_functor(i, fill);
+  }
+  CountAndFillBase(CrsType& crs, Functor const& f):
+    m_crs(crs),
+    m_functor(f)
+  {}
+};
+
+#if defined( KOKKOS_ENABLE_CUDA )
+template< class CrsType,
+          class Functor>
+struct CountAndFillBase<CrsType, Functor, Kokkos::Cuda> {
+  using data_type = typename CrsType::size_type;
+  using size_type = typename CrsType::size_type;
+  using row_map_type = typename CrsType::row_map_type;
+  using counts_type = row_map_type;
+  CrsType m_crs;
+  Functor m_functor;
+  counts_type m_counts;
+  struct Count {};
+  __device__ inline void operator()(Count, size_type i) const {
+    m_counts(i) = m_functor(i, nullptr);
+  }
+  struct Fill {};
+  __device__ inline void operator()(Fill, size_type i) const {
+    auto j = m_crs.row_map(i);
+    /* We don't want to access entries(entries.size()), even if it's just to get
+       its address and never use it.  This can happen when row (i) is empty and
+       all rows after it are also empty.  We could compare to row_map(i + 1),
+       but that is a read from global memory, whereas extent(0) should be part
+       of the View in registers (or constant memory). */
+    data_type* fill =
+      (j == static_cast<decltype(j)>(m_crs.entries.extent(0))) ?
+      nullptr : (&(m_crs.entries(j)));
+    m_functor(i, fill);
+  }
+  CountAndFillBase(CrsType& crs, Functor const& f):
+    m_crs(crs),
+    m_functor(f)
+  {}
+};
+#endif
+
+template< class CrsType,
+          class Functor>
+struct CountAndFill : public CountAndFillBase<CrsType, Functor> {
+  using base_type = CountAndFillBase<CrsType, Functor>;
+  using typename base_type::data_type;
+  using typename base_type::size_type;
+  using typename base_type::counts_type;
+  using typename base_type::Count;
+  using typename base_type::Fill;
+  using entries_type = typename CrsType::entries_type;
+  using self_type = CountAndFill<CrsType, Functor>;
+  CountAndFill(CrsType& crs, size_type nrows, Functor const& f):
+    base_type(crs, f)
+  {
+    using execution_space = typename CrsType::execution_space;
+    this->m_counts = counts_type("counts", nrows);
+    {
+    using count_policy_type = RangePolicy<size_type, execution_space, Count>;
+    using count_closure_type =
+      Kokkos::Impl::ParallelFor<self_type, count_policy_type>;
+    const count_closure_type closure(*this, count_policy_type(0, nrows));
+    closure.execute();
+    }
+    auto nentries = Kokkos::
+      get_crs_row_map_from_counts(this->m_crs.row_map, this->m_counts);
+    this->m_counts = counts_type();
+    this->m_crs.entries = entries_type("entries", nentries);
+    {
+    using fill_policy_type = RangePolicy<size_type, execution_space, Fill>;
+    using fill_closure_type =
+      Kokkos::Impl::ParallelFor<self_type, fill_policy_type>;
+    const fill_closure_type closure(*this, fill_policy_type(0, nrows));
+    closure.execute();
+    }
+    crs = this->m_crs;
+  }
+};
+
+template< class CrsType,
+          class Functor>
+void count_and_fill_crs(
+    CrsType& crs,
+    typename CrsType::size_type nrows,
+    Functor const& f) {
+  Kokkos::CountAndFill<CrsType, Functor>(crs, nrows, f);
+}
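+
+/* Hedged usage sketch (added commentary, not part of the original source).
+   The functor is called twice per row: once with a null pointer, where it must
+   return the number of entries for that row, and once with a pointer to that
+   row's slot in the entries array, where it must write the entries.  Assuming
+   a Crs graph object `graph`, a row count `nrows`, and `count_type` standing
+   in for CrsType::size_type, a functor emitting one self-edge per row might be:
+
+     auto f = KOKKOS_LAMBDA(count_type row, count_type* fill) -> count_type {
+       if (fill != nullptr) fill[0] = row;  // fill pass: write this row's entries
+       return 1;                            // count pass: one entry in this row
+     };
+     Kokkos::count_and_fill_crs(graph, nrows, f);
+*/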
+
+} // namespace Kokkos
+
+#endif /* #define KOKKOS_CRS_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_Cuda.hpp b/packages/kokkos/core/src/Kokkos_Cuda.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c2db88fdc2aade42999d92ab2e09e15670b12e5c
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Cuda.hpp
@@ -0,0 +1,305 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_HPP
+#define KOKKOS_CUDA_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_CUDA )
+
+#include <Kokkos_Core_fwd.hpp>
+
+#include <iosfwd>
+#include <vector>
+
+#include <Kokkos_CudaSpace.hpp>
+
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_TaskScheduler.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+class CudaExec ;
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/// \class Cuda
+/// \brief Kokkos Execution Space that uses CUDA to run on GPUs.
+///
+/// An "execution space" represents a parallel execution model.  It tells Kokkos
+/// how to parallelize the execution of kernels in a parallel_for or
+/// parallel_reduce.  For example, the Threads execution space uses Pthreads or
+/// C++11 threads on a CPU, the OpenMP execution space uses the OpenMP language
+/// extensions, and the Serial execution space executes "parallel" kernels
+/// sequentially.  The Cuda execution space uses NVIDIA's CUDA programming
+/// model to execute kernels in parallel on GPUs.
+class Cuda {
+public:
+  //! \name Type declarations that all Kokkos execution spaces must provide.
+  //@{
+
+  //! Tag this class as a kokkos execution space
+  typedef Cuda                  execution_space ;
+
+#if defined( KOKKOS_ENABLE_CUDA_UVM )
+  //! This execution space's preferred memory space.
+  typedef CudaUVMSpace          memory_space ;
+#else
+  //! This execution space's preferred memory space.
+  typedef CudaSpace             memory_space ;
+#endif
+
+  //! This execution space preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  //! The size_type best suited for this execution space.
+  typedef memory_space::size_type  size_type ;
+
+  //! This execution space's preferred array layout.
+  typedef LayoutLeft            array_layout ;
+
+  //! This execution space's preferred scratch memory space.
+  typedef ScratchMemorySpace< Cuda >  scratch_memory_space ;
+
+  //@}
+  //--------------------------------------------------
+  //! \name Functions that all Kokkos devices must implement.
+  //@{
+
+  /// \brief True if and only if this method is being called in a
+  ///   thread-parallel function.
+  KOKKOS_INLINE_FUNCTION static int in_parallel() {
+#if defined( __CUDA_ARCH__ )
+    return true;
+#else
+    return false;
+#endif
+  }
+
+  /** \brief  Set the device in a "sleep" state.
+   *
+   * This function sets the device in a "sleep" state in which it is
+   * not ready for work.  This may consume fewer resources than if the
+   * device were in an "awake" state, but it may also take time to
+   * bring the device from a sleep state to be ready for work.
+   *
+   * \return True if the device is in the "sleep" state, else false if
+   *   the device is actively working and could not enter the "sleep"
+   *   state.
+   */
+  static bool sleep();
+
+  /// \brief Wake the device from the 'sleep' state so it is ready for work.
+  ///
+  /// \return True if the device is in the "ready" state, else "false"
+  ///  if the device is actively working (which also means that it's
+  ///  awake).
+  static bool wake();
+
+  /// \brief Wait until all dispatched functors complete.
+  ///
+  /// The parallel_for or parallel_reduce dispatch of a functor may
+  /// return asynchronously, before the functor completes.  This
+  /// method does not return until all dispatched functors on this
+  /// device have completed.
+  static void fence();
+
+  //! Free any resources being consumed by the device.
+  static void finalize();
+
+  //! Has been initialized
+  static int is_initialized();
+
+  /** \brief  Return the maximum amount of concurrency.  */
+  static int concurrency();
+
+  //! Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  //@}
+  //--------------------------------------------------
+  //! \name  Cuda space instances
+
+  ~Cuda() {}
+  Cuda();
+  explicit Cuda( const int instance_id );
+
+  Cuda( Cuda && ) = default ;
+  Cuda( const Cuda & ) = default ;
+  Cuda & operator = ( Cuda && ) = default ;
+  Cuda & operator = ( const Cuda & ) = default ;
+
+  //--------------------------------------------------------------------------
+  //! \name Device-specific functions
+  //@{
+
+  struct SelectDevice {
+    int cuda_device_id ;
+    SelectDevice() : cuda_device_id(0) {}
+    explicit SelectDevice( int id ) : cuda_device_id( id ) {}
+  };
+
+  //! Initialize, telling the CUDA run-time library which device to use.
+  static void initialize( const SelectDevice = SelectDevice()
+                        , const size_t num_instances = 1 );
+
+  /// \brief Cuda device architecture of the selected device.
+  ///
+  /// This matches the __CUDA_ARCH__ specification.
+  static size_type device_arch();
+
+  //! Query device count.
+  static size_type detect_device_count();
+
+  /** \brief  Detect the available devices and their architecture
+   *          as defined by the __CUDA_ARCH__ specification.
+   */
+  static std::vector<unsigned> detect_device_arch();
+
+  cudaStream_t cuda_stream() const { return m_stream ; }
+  int          cuda_device() const { return m_device ; }
+
+  //@}
+  //--------------------------------------------------------------------------
+
+  static const char* name();
+
+private:
+
+  int          m_device ;
+  cudaStream_t m_stream ;
+};
+
+} // namespace Kokkos
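+
+/* Hedged usage sketch (added commentary, not part of the original source):
+   with CUDA enabled, Cuda can be named explicitly as the execution space of a
+   policy, e.g.
+
+     Kokkos::parallel_for(
+         Kokkos::RangePolicy<Kokkos::Cuda>(0, n),
+         KOKKOS_LAMBDA(const int i) { y(i) = a * x(i) + y(i); });
+
+   where x and y are Views in a Cuda-accessible memory space and a, n are
+   host-side values captured by value. */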
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct MemorySpaceAccess
+  < Kokkos::CudaSpace
+  , Kokkos::Cuda::scratch_memory_space
+  >
+{
+  enum { assignable = false };
+  enum { accessible = true };
+  enum { deepcopy   = false };
+};
+
+#if defined( KOKKOS_ENABLE_CUDA_UVM )
+
+// If forcing use of UVM everywhere, then we must assume that
+// CudaUVMSpace can be a stand-in for CudaSpace.
+// This will fail for a strange host-side execution space
+// that defines CudaUVMSpace as its preferred memory space.
+
+template<>
+struct MemorySpaceAccess
+  < Kokkos::CudaUVMSpace
+  , Kokkos::Cuda::scratch_memory_space
+  >
+{
+  enum { assignable = false };
+  enum { accessible = true };
+  enum { deepcopy   = false };
+};
+
+#endif
+
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::CudaSpace
+  , Kokkos::Cuda::scratch_memory_space
+  >
+{
+  enum { value = true };
+  KOKKOS_INLINE_FUNCTION static void verify( void ) { }
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
+};
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::HostSpace
+  , Kokkos::Cuda::scratch_memory_space
+  >
+{
+  enum { value = false };
+  inline static void verify( void ) { CudaSpace::access_error(); }
+  inline static void verify( const void * p ) { CudaSpace::access_error(p); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#include <Cuda/Kokkos_CudaExec.hpp>
+#include <Cuda/Kokkos_Cuda_View.hpp>
+#include <Cuda/Kokkos_Cuda_Team.hpp>
+#include <Cuda/Kokkos_Cuda_Parallel.hpp>
+#include <Cuda/Kokkos_Cuda_Task.hpp>
+#include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
+#endif /* #ifndef KOKKOS_CUDA_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_CudaSpace.hpp b/packages/kokkos/core/src/Kokkos_CudaSpace.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f537dad2c4a28e3c878e34c604c07d3dca6de74b
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_CudaSpace.hpp
@@ -0,0 +1,945 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDASPACE_HPP
+#define KOKKOS_CUDASPACE_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_CUDA )
+
+#include <Kokkos_Core_fwd.hpp>
+
+#include <iosfwd>
+#include <typeinfo>
+#include <string>
+
+#include <Kokkos_HostSpace.hpp>
+
+#include <Cuda/Kokkos_Cuda_abort.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Cuda on-device memory management */
+
+class CudaSpace {
+public:
+
+  //! Tag this class as a kokkos memory space
+  typedef CudaSpace             memory_space ;
+  typedef Kokkos::Cuda          execution_space ;
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  typedef unsigned int          size_type ;
+
+  /*--------------------------------*/
+
+  CudaSpace();
+  CudaSpace( CudaSpace && rhs ) = default ;
+  CudaSpace( const CudaSpace & rhs ) = default ;
+  CudaSpace & operator = ( CudaSpace && rhs ) = default ;
+  CudaSpace & operator = ( const CudaSpace & rhs ) = default ;
+  ~CudaSpace() = default ;
+
+  /**\brief  Allocate untracked memory in the cuda space */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate untracked memory in the cuda space */
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const ;
+
+  /**\brief Return Name of the MemorySpace */
+  static constexpr const char* name() { return m_name; }
+
+  /*--------------------------------*/
+  /** \brief  Error reporting for HostSpace attempt to access CudaSpace */
+  static void access_error();
+  static void access_error( const void * const );
+
+private:
+
+  int  m_device ; ///< Which Cuda device
+
+  static constexpr const char* m_name = "Cuda";
+  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > ;
+};
+
+namespace Impl {
+/// \brief Initialize lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function initializes the locks to zero (unset).
+void init_lock_arrays_cuda_space();
+
+/// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+int* atomic_lock_array_cuda_space_ptr(bool deallocate = false);
+
+/// \brief Retrieve the pointer to the scratch array for team and thread private global memory.
+///
+/// Team and Thread private scratch allocations in
+/// global memory are acquired via locks.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+int* scratch_lock_array_cuda_space_ptr(bool deallocate = false);
+
+/// \brief Retrieve the pointer to the scratch array for unique identifiers.
+///
+/// Unique identifiers in the range 0-Cuda::concurrency
+/// are provided via locks.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+int* threadid_lock_array_cuda_space_ptr(bool deallocate = false);
+}
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Cuda memory that is accessible to Host execution space
+ *          through Cuda's unified virtual memory (UVM) runtime.
+ */
+class CudaUVMSpace {
+public:
+
+  //! Tag this class as a kokkos memory space
+  typedef CudaUVMSpace          memory_space ;
+  typedef Cuda                  execution_space ;
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+  typedef unsigned int          size_type ;
+
+  /** \brief  If UVM capability is available */
+  static bool available();
+
+
+  /*--------------------------------*/
+  /** \brief  CudaUVMSpace specific routine */
+  static int number_of_allocations();
+
+  /*--------------------------------*/
+
+
+  /*--------------------------------*/
+
+  CudaUVMSpace();
+  CudaUVMSpace( CudaUVMSpace && rhs ) = default ;
+  CudaUVMSpace( const CudaUVMSpace & rhs ) = default ;
+  CudaUVMSpace & operator = ( CudaUVMSpace && rhs ) = default ;
+  CudaUVMSpace & operator = ( const CudaUVMSpace & rhs ) = default ;
+  ~CudaUVMSpace() = default ;
+
+  /**\brief  Allocate untracked memory in the cuda space */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate untracked memory in the cuda space */
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const ;
+
+  /**\brief Return Name of the MemorySpace */
+  static constexpr const char* name() { return m_name; }
+
+  /*--------------------------------*/
+
+private:
+  int  m_device ; ///< Which Cuda device
+
+  static constexpr const char* m_name = "CudaUVM";
+
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Host memory that is accessible to Cuda execution space
+ *          through Cuda's host-pinned memory allocation.
+ */
+class CudaHostPinnedSpace {
+public:
+
+  //! Tag this class as a kokkos memory space
+  /** \brief  Memory is in HostSpace so use the HostSpace::execution_space */
+  typedef HostSpace::execution_space  execution_space ;
+  typedef CudaHostPinnedSpace         memory_space ;
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+  typedef unsigned int                size_type ;
+
+  /*--------------------------------*/
+
+  CudaHostPinnedSpace();
+  CudaHostPinnedSpace( CudaHostPinnedSpace && rhs ) = default ;
+  CudaHostPinnedSpace( const CudaHostPinnedSpace & rhs ) = default ;
+  CudaHostPinnedSpace & operator = ( CudaHostPinnedSpace && rhs ) = default ;
+  CudaHostPinnedSpace & operator = ( const CudaHostPinnedSpace & rhs ) = default ;
+  ~CudaHostPinnedSpace() = default ;
+
+  /**\brief  Allocate untracked memory in the space */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate untracked memory in the space */
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const ;
+
+  /**\brief Return Name of the MemorySpace */
+  static constexpr const char* name() { return m_name; }
+
+private:
+
+  static constexpr const char* m_name = "CudaHostPinned";
+
+  /*--------------------------------*/
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaSpace >::assignable , "" );
+static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaUVMSpace >::assignable , "" );
+static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaHostPinnedSpace >::assignable , "" );
+
+//----------------------------------------
+
+template<>
+struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaSpace > {
+  enum { assignable = false };
+  enum { accessible = false };
+  enum { deepcopy   = true };
+};
+
+template<>
+struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaUVMSpace > {
+  // HostSpace::execution_space != CudaUVMSpace::execution_space
+  enum { assignable = false };
+  enum { accessible = true };
+  enum { deepcopy   = true };
+};
+
+template<>
+struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace > {
+  // HostSpace::execution_space == CudaHostPinnedSpace::execution_space
+  enum { assignable = true };
+  enum { accessible = true };
+  enum { deepcopy   = true };
+};
+
+//----------------------------------------
+
+template<>
+struct MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::HostSpace > {
+  enum { assignable = false };
+  enum { accessible = false };
+  enum { deepcopy   = true };
+};
+
+template<>
+struct MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaUVMSpace > {
+  // CudaSpace::execution_space == CudaUVMSpace::execution_space
+  enum { assignable = true };
+  enum { accessible = true };
+  enum { deepcopy   = true };
+};
+
+template<>
+struct MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaHostPinnedSpace > {
+  // CudaSpace::execution_space != CudaHostPinnedSpace::execution_space
+  enum { assignable = false };
+  enum { accessible = true }; // CudaSpace::execution_space
+  enum { deepcopy   = true };
+};
+
+//----------------------------------------
+// CudaUVMSpace::execution_space == Cuda
+// CudaUVMSpace accessible to both Cuda and Host
+
+template<>
+struct MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::HostSpace > {
+  enum { assignable = false };
+  enum { accessible = false }; // Cuda cannot access HostSpace
+  enum { deepcopy   = true };
+};
+
+template<>
+struct MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaSpace > {
+  // CudaUVMSpace::execution_space == CudaSpace::execution_space
+  // Can access CudaUVMSpace from Host but cannot access CudaSpace from Host
+  enum { assignable = false };
+
+  // CudaUVMSpace::execution_space can access CudaSpace
+  enum { accessible = true };
+  enum { deepcopy   = true };
+};
+
+template<>
+struct MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaHostPinnedSpace > {
+  // CudaUVMSpace::execution_space != CudaHostPinnedSpace::execution_space
+  enum { assignable = false };
+  enum { accessible = true }; // CudaUVMSpace::execution_space
+  enum { deepcopy   = true };
+};
+
+
+//----------------------------------------
+// CudaHostPinnedSpace::execution_space == HostSpace::execution_space
+// CudaHostPinnedSpace accessible to both Cuda and Host
+
+template<>
+struct MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace > {
+  enum { assignable = false }; // Cannot access from Cuda
+  enum { accessible = true };  // CudaHostPinnedSpace::execution_space
+  enum { deepcopy   = true };
+};
+
+template<>
+struct MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaSpace > {
+  enum { assignable = false }; // Cannot access from Host
+  enum { accessible = false };
+  enum { deepcopy   = true };
+};
+
+template<>
+struct MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaUVMSpace > {
+  enum { assignable = false }; // different execution_space
+  enum { accessible = true };  // same accessibility
+  enum { deepcopy   = true };
+};
+
+//----------------------------------------
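+
+// Added summary of the specializations above (commentary only): "yes" means
+// the row space's default execution space can access memory in the column
+// space.
+//
+//                        HostSpace  CudaSpace  CudaUVMSpace  CudaHostPinnedSpace
+//   HostSpace               -          no          yes              yes
+//   CudaSpace               no         -           yes              yes
+//   CudaUVMSpace            no         yes         -                yes
+//   CudaHostPinnedSpace     yes        no          yes              -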
+
+}} // namespace Kokkos::Impl
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+void DeepCopyAsyncCuda( void * dst , const void * src , size_t n);
+
+template<> struct DeepCopy< CudaSpace , CudaSpace , Cuda>
+{
+  DeepCopy( void * dst , const void * src , size_t );
+  DeepCopy( const Cuda & , void * dst , const void * src , size_t );
+};
+
+template<> struct DeepCopy< CudaSpace , HostSpace , Cuda >
+{
+  DeepCopy( void * dst , const void * src , size_t );
+  DeepCopy( const Cuda & , void * dst , const void * src , size_t );
+};
+
+template<> struct DeepCopy< HostSpace , CudaSpace , Cuda >
+{
+  DeepCopy( void * dst , const void * src , size_t );
+  DeepCopy( const Cuda & , void * dst , const void * src , size_t );
+};
+
+template<class ExecutionSpace> struct DeepCopy< CudaSpace , CudaSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace> struct DeepCopy< CudaSpace , HostSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , HostSpace , Cuda>( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace>
+struct DeepCopy< HostSpace , CudaSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace>
+struct DeepCopy< CudaSpace , CudaUVMSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace>
+struct DeepCopy< CudaSpace , CudaHostPinnedSpace , ExecutionSpace>
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , HostSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+
+template<class ExecutionSpace>
+struct DeepCopy< CudaUVMSpace , CudaSpace , ExecutionSpace>
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace>
+struct DeepCopy< CudaUVMSpace , CudaUVMSpace , ExecutionSpace>
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace>
+struct DeepCopy< CudaUVMSpace , CudaHostPinnedSpace , ExecutionSpace>
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , HostSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace> struct DeepCopy< CudaUVMSpace , HostSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , HostSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+
+template<class ExecutionSpace> struct DeepCopy< CudaHostPinnedSpace , CudaSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace> struct DeepCopy< CudaHostPinnedSpace , CudaUVMSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace> struct DeepCopy< CudaHostPinnedSpace , CudaHostPinnedSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , HostSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace> struct DeepCopy< CudaHostPinnedSpace , HostSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , HostSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+
+template<class ExecutionSpace> struct DeepCopy< HostSpace , CudaUVMSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace> struct DeepCopy< HostSpace , CudaHostPinnedSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , HostSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** Running in CudaSpace attempting to access HostSpace: error */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::HostSpace >
+{
+  enum { value = false };
+  KOKKOS_INLINE_FUNCTION static void verify( void )
+    { Kokkos::abort("Cuda code attempted to access HostSpace memory"); }
+
+  KOKKOS_INLINE_FUNCTION static void verify( const void * )
+    { Kokkos::abort("Cuda code attempted to access HostSpace memory"); }
+};
+
+/** Running in CudaSpace accessing CudaUVMSpace: ok */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::CudaUVMSpace >
+{
+  enum { value = true };
+  KOKKOS_INLINE_FUNCTION static void verify( void ) { }
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
+};
+
+/** Running in CudaSpace accessing CudaHostPinnedSpace: ok */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::CudaHostPinnedSpace >
+{
+  enum { value = true };
+  KOKKOS_INLINE_FUNCTION static void verify( void ) { }
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
+};
+
+/** Running in CudaSpace attempting to access an unknown space: error */
+template< class OtherSpace >
+struct VerifyExecutionCanAccessMemorySpace<
+  typename enable_if< ! is_same<Kokkos::CudaSpace,OtherSpace>::value , Kokkos::CudaSpace >::type ,
+  OtherSpace >
+{
+  enum { value = false };
+  KOKKOS_INLINE_FUNCTION static void verify( void )
+    { Kokkos::abort("Cuda code attempted to access unknown Space memory"); }
+
+  KOKKOS_INLINE_FUNCTION static void verify( const void * )
+    { Kokkos::abort("Cuda code attempted to access unknown Space memory"); }
+};
+
+//----------------------------------------------------------------------------
+/** Running in HostSpace attempting to access CudaSpace */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaSpace >
+{
+  enum { value = false };
+  inline static void verify( void ) { CudaSpace::access_error(); }
+  inline static void verify( const void * p ) { CudaSpace::access_error(p); }
+};
+
+/** Running in HostSpace accessing CudaUVMSpace is OK */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaUVMSpace >
+{
+  enum { value = true };
+  inline static void verify( void ) { }
+  inline static void verify( const void * ) { }
+};
+
+/** Running in HostSpace accessing CudaHostPinnedSpace is OK */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace >
+{
+  enum { value = true };
+  KOKKOS_INLINE_FUNCTION static void verify( void ) {}
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+class SharedAllocationRecord< Kokkos::CudaSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+
+  friend class SharedAllocationRecord< Kokkos::CudaUVMSpace , void > ;
+
+  typedef SharedAllocationRecord< void , void >  RecordBase ;
+
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  static void deallocate( RecordBase * );
+
+  static ::cudaTextureObject_t
+  attach_texture_object( const unsigned sizeof_alias
+                       , void * const   alloc_ptr
+                       , const size_t   alloc_size );
+
+  static RecordBase s_root_record ;
+
+  ::cudaTextureObject_t   m_tex_obj ;
+  const Kokkos::CudaSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {}
+
+  SharedAllocationRecord( const Kokkos::CudaSpace        & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+
+  std::string get_label() const ;
+
+  static SharedAllocationRecord * allocate( const Kokkos::CudaSpace &  arg_space
+                                          , const std::string       &  arg_label
+                                          , const size_t               arg_alloc_size );
+
+  /**\brief  Allocate tracked memory in the space */
+  static
+  void * allocate_tracked( const Kokkos::CudaSpace & arg_space
+                         , const std::string & arg_label
+                         , const size_t arg_alloc_size );
+
+  /**\brief  Reallocate tracked memory in the space */
+  static
+  void * reallocate_tracked( void * const arg_alloc_ptr
+                           , const size_t arg_alloc_size );
+
+  /**\brief  Deallocate tracked memory in the space */
+  static
+  void deallocate_tracked( void * const arg_alloc_ptr );
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  template< typename AliasType >
+  inline
+  ::cudaTextureObject_t attach_texture_object()
+    {
+      static_assert( ( std::is_same< AliasType , int >::value ||
+                       std::is_same< AliasType , ::int2 >::value ||
+                       std::is_same< AliasType , ::int4 >::value )
+                   , "Cuda texture fetch only supported for alias types of int, ::int2, or ::int4" );
+
+      if ( m_tex_obj == 0 ) {
+        m_tex_obj = attach_texture_object( sizeof(AliasType)
+                                         , (void*) RecordBase::m_alloc_ptr
+                                         , RecordBase::m_alloc_size );
+      }
+
+      return m_tex_obj ;
+    }
+
+  template< typename AliasType >
+  inline
+  int attach_texture_object_offset( const AliasType * const ptr )
+    {
+      // Texture object is attached to the entire allocation range
+      return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr );
+    }
+
+  static void print_records( std::ostream & , const Kokkos::CudaSpace & , bool detail = false );
+};
+
+
+template<>
+class SharedAllocationRecord< Kokkos::CudaUVMSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+
+  typedef SharedAllocationRecord< void , void >  RecordBase ;
+
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  static void deallocate( RecordBase * );
+
+  static RecordBase s_root_record ;
+
+  ::cudaTextureObject_t      m_tex_obj ;
+  const Kokkos::CudaUVMSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {}
+
+  SharedAllocationRecord( const Kokkos::CudaUVMSpace     & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+
+  std::string get_label() const ;
+
+  static SharedAllocationRecord * allocate( const Kokkos::CudaUVMSpace &  arg_space
+                                          , const std::string          &  arg_label
+                                          , const size_t                  arg_alloc_size
+                                          );
+
+  /**\brief  Allocate tracked memory in the space */
+  static
+  void * allocate_tracked( const Kokkos::CudaUVMSpace & arg_space
+                         , const std::string & arg_label
+                         , const size_t arg_alloc_size );
+
+  /**\brief  Reallocate tracked memory in the space */
+  static
+  void * reallocate_tracked( void * const arg_alloc_ptr
+                           , const size_t arg_alloc_size );
+
+  /**\brief  Deallocate tracked memory in the space */
+  static
+  void deallocate_tracked( void * const arg_alloc_ptr );
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+
+  template< typename AliasType >
+  inline
+  ::cudaTextureObject_t attach_texture_object()
+    {
+      static_assert( ( std::is_same< AliasType , int >::value ||
+                       std::is_same< AliasType , ::int2 >::value ||
+                       std::is_same< AliasType , ::int4 >::value )
+                   , "Cuda texture fetch only supported for alias types of int, ::int2, or ::int4" );
+
+      if ( m_tex_obj == 0 ) {
+        m_tex_obj = SharedAllocationRecord< Kokkos::CudaSpace , void >::
+          attach_texture_object( sizeof(AliasType)
+                               , (void*) RecordBase::m_alloc_ptr
+                               , RecordBase::m_alloc_size );
+      }
+
+      return m_tex_obj ;
+    }
+
+  template< typename AliasType >
+  inline
+  int attach_texture_object_offset( const AliasType * const ptr )
+    {
+      // Texture object is attached to the entire allocation range
+      return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr );
+    }
+
+  static void print_records( std::ostream & , const Kokkos::CudaUVMSpace & , bool detail = false );
+};
+
+template<>
+class SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+
+  typedef SharedAllocationRecord< void , void >  RecordBase ;
+
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  static void deallocate( RecordBase * );
+
+  static RecordBase s_root_record ;
+
+  const Kokkos::CudaHostPinnedSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() : RecordBase(), m_space() {}
+
+  SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace     & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+
+  std::string get_label() const ;
+
+  static SharedAllocationRecord * allocate( const Kokkos::CudaHostPinnedSpace &  arg_space
+                                          , const std::string          &  arg_label
+                                          , const size_t                  arg_alloc_size
+                                          );
+  /**\brief  Allocate tracked memory in the space */
+  static
+  void * allocate_tracked( const Kokkos::CudaHostPinnedSpace & arg_space
+                         , const std::string & arg_label
+                         , const size_t arg_alloc_size );
+
+  /**\brief  Reallocate tracked memory in the space */
+  static
+  void * reallocate_tracked( void * const arg_alloc_ptr
+                           , const size_t arg_alloc_size );
+
+  /**\brief  Deallocate tracked memory in the space */
+  static
+  void deallocate_tracked( void * const arg_alloc_ptr );
+
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  static void print_records( std::ostream & , const Kokkos::CudaHostPinnedSpace & , bool detail = false );
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
+#endif /* #define KOKKOS_CUDASPACE_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6c9cc4b510cee035f8cd6a387ab7a8b93c1726eb
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
@@ -0,0 +1,820 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXECPOLICY_HPP
+#define KOKKOS_EXECPOLICY_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_StaticAssert.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_AnalyzePolicy.hpp>
+#include <Kokkos_Concepts.hpp>
+#include <iostream>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+struct ChunkSize {
+  int value;
+  ChunkSize(int value_):value(value_) {}
+};
+
+/** \brief  Execution policy for work over a range of an integral type.
+ *
+ * Valid template argument options:
+ *
+ *  With a specified execution space:
+ *    < ExecSpace , WorkTag , { IntConst | IntType } >
+ *    < ExecSpace , WorkTag , void >
+ *    < ExecSpace , { IntConst | IntType } , void >
+ *    < ExecSpace , void , void >
+ *
+ *  With the default execution space:
+ *    < WorkTag , { IntConst | IntType } , void >
+ *    < WorkTag , void , void >
+ *    < { IntConst | IntType } , void , void >
+ *    < void , void , void >
+ *
+ *  IntType  is a fundamental integral type
+ *  IntConst is an Impl::integral_constant< IntType , Blocking >
+ *
+ *  Blocking is the granularity of partitioning the range among threads.
+ */
+template<class ... Properties>
+class RangePolicy
+  : public Impl::PolicyTraits<Properties ... >
+{
+private:
+  typedef Impl::PolicyTraits<Properties ... > traits;
+
+  typename traits::execution_space m_space ;
+  typename traits::index_type  m_begin ;
+  typename traits::index_type  m_end ;
+  typename traits::index_type  m_granularity ;
+  typename traits::index_type  m_granularity_mask ;
+
+public:
+  //! Tag this class as an execution policy
+  typedef RangePolicy execution_policy;
+  typedef typename traits::index_type member_type ;
+
+  KOKKOS_INLINE_FUNCTION const typename traits::execution_space & space() const { return m_space ; }
+  KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
+  KOKKOS_INLINE_FUNCTION member_type end()   const { return m_end ; }
+
+  // TODO: find a better workaround for Clang's odd instantiation order.
+  // This operator exists because of an instantiation error: the RangePolicy is
+  // passed to FunctorValueTraits, which tries decltype on the call operator even
+  // though the first argument of parallel_for clearly doesn't match.
+  void operator()(const int&) const {}
+
+  RangePolicy(const RangePolicy&) = default;
+  RangePolicy(RangePolicy&&) = default;
+
+  inline RangePolicy() : m_space(), m_begin(0), m_end(0) {}
+
+  /** \brief  Total range */
+  inline
+  RangePolicy( const typename traits::execution_space & work_space
+             , const member_type work_begin
+             , const member_type work_end
+             )
+    : m_space( work_space )
+    , m_begin( work_begin < work_end ? work_begin : 0 )
+    , m_end(   work_begin < work_end ? work_end : 0 )
+    , m_granularity(0)
+    , m_granularity_mask(0)
+    {
+      set_auto_chunk_size();
+    }
+
+  /** \brief  Total range */
+  inline
+  RangePolicy( const member_type work_begin
+             , const member_type work_end
+             )
+    : RangePolicy( typename traits::execution_space()
+                 , work_begin , work_end )
+    {
+      set_auto_chunk_size();
+    }
+
+  /** \brief  Total range */
+  template<class ... Args>
+  inline
+  RangePolicy( const typename traits::execution_space & work_space
+             , const member_type work_begin
+             , const member_type work_end
+             , Args ... args
+             )
+    : m_space( work_space )
+    , m_begin( work_begin < work_end ? work_begin : 0 )
+    , m_end(   work_begin < work_end ? work_end : 0 )
+    , m_granularity(0)
+    , m_granularity_mask(0)
+    {
+      set_auto_chunk_size();
+      set(args...);
+    }
+
+  /** \brief  Total range */
+  template<class ... Args>
+  inline
+  RangePolicy( const member_type work_begin
+             , const member_type work_end
+             , Args ... args
+             )
+    : RangePolicy( typename traits::execution_space()
+                 , work_begin , work_end )
+    {
+      set_auto_chunk_size();
+      set(args...);
+    }
+
+private:
+  inline void set() {}
+
+public:
+  template<class ... Args>
+  inline void set(Args ...) {
+    static_assert( 0 == sizeof...(Args), "Kokkos::RangePolicy: unhandled constructor arguments encountered.");
+  }
+
+  template<class ... Args>
+  inline void set(const ChunkSize& chunksize, Args ... args) {
+    m_granularity = chunksize.value;
+    m_granularity_mask = m_granularity - 1;
+  }
+
+public:
+  /** \brief return chunk_size */
+  inline member_type chunk_size() const {
+    return m_granularity;
+  }
+
+  /** \brief set chunk_size to a discrete value*/
+  inline RangePolicy set_chunk_size(int chunk_size_) const {
+    RangePolicy p = *this;
+    p.m_granularity = chunk_size_;
+    p.m_granularity_mask = p.m_granularity - 1;
+    return p;
+  }
+
+private:
+  /** \brief finalize chunk_size if it was set to AUTO*/
+  inline void set_auto_chunk_size() {
+
+   typename traits::index_type concurrency = traits::execution_space::concurrency();
+   if( concurrency==0 ) concurrency=1;
+
+   if(m_granularity > 0) {
+     if(!Impl::is_integral_power_of_two( m_granularity ))
+       Kokkos::abort("RangePolicy blocking granularity must be power of two" );
+   }
+
+   member_type new_chunk_size = 1;
+   while(new_chunk_size*100*concurrency < m_end-m_begin)
+     new_chunk_size *= 2;
+   if(new_chunk_size < 128) {
+     new_chunk_size = 1;
+     while( (new_chunk_size*40*concurrency < m_end-m_begin ) && (new_chunk_size<128) )
+       new_chunk_size*=2;
+   }
+   m_granularity = new_chunk_size;
+   m_granularity_mask = m_granularity - 1;
+  }
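+
+  /* Hedged worked example (added commentary, not part of the original source):
+     with concurrency = 8 and a range of 100000 iterations, the first loop
+     doubles the chunk while chunk * 100 * 8 < 100000, stopping at 128; since
+     128 is not < 128 the second loop is skipped, so the granularity becomes
+     128 (mask 127). */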
+
+public:
+  /** \brief  Subrange for a partition's rank and size.
+   *
+   *  Typically used to partition a range over a group of threads.
+   */
+  struct WorkRange {
+    typedef typename RangePolicy::work_tag     work_tag ;
+    typedef typename RangePolicy::member_type  member_type ;
+
+    KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
+    KOKKOS_INLINE_FUNCTION member_type end()   const { return m_end ; }
+
+    /** \brief  Subrange for a partition's rank and size.
+     *
+     *  Typically used to partition a range over a group of threads.
+     */
+    KOKKOS_INLINE_FUNCTION
+    WorkRange( const RangePolicy & range
+             , const int part_rank
+             , const int part_size
+             )
+      : m_begin(0), m_end(0)
+      {
+        if ( part_size ) {
+
+          // Split evenly among partitions, then round up to the granularity.
+          const member_type work_part =
+            ( ( ( ( range.end() - range.begin() ) + ( part_size - 1 ) ) / part_size )
+              + range.m_granularity_mask ) & ~member_type(range.m_granularity_mask);
+
+          m_begin = range.begin() + work_part * part_rank ;
+          m_end   = m_begin       + work_part ;
+
+          if ( range.end() < m_begin ) m_begin = range.end() ;
+          if ( range.end() < m_end )   m_end   = range.end() ;
+        }
+      }
+
+  private:
+    member_type m_begin ;
+    member_type m_end ;
+    WorkRange();
+    WorkRange & operator = ( const WorkRange & );
+  };
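+
+  // A minimal usage sketch (policy, thread_rank and thread_count are
+  // placeholders): a backend typically splits a policy among its threads
+  // through WorkRange, e.g.
+  //
+  //   const RangePolicy::WorkRange range( policy , thread_rank , thread_count );
+  //   for ( member_type i = range.begin() ; i < range.end() ; ++i ) { /* ... */ }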
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+template< class ExecSpace, class ... Properties>
+class TeamPolicyInternal: public Impl::PolicyTraits<Properties ... > {
+private:
+  typedef Impl::PolicyTraits<Properties ... > traits;
+
+public:
+
+  //----------------------------------------
+  /** \brief  Query maximum team size for a given functor.
+   *
+   *  This size takes into account execution space concurrency limitations and
+   *  scratch memory space limitations for reductions, team reduce/scan, and
+   *  team shared memory.
+   *
+   *  This function only works for single-operator functors.
+   *  With multi-operator functors it cannot be determined
+   *  which operator will be called.
+   */
+  template< class FunctorType >
+  static int team_size_max( const FunctorType & );
+
+  /** \brief  Query recommended team size for a given functor.
+   *
+   *  This size takes into account execution space concurrency limitations and
+   *  scratch memory space limitations for reductions, team reduce/scan, and
+   *  team shared memory.
+   *
+   *  This function only works for single-operator functors.
+   *  With multi-operator functors it cannot be determined
+   *  which operator will be called.
+   */
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & );
+
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & , const int&);
+  //----------------------------------------
+  /** \brief  Construct policy with the given instance of the execution space */
+  TeamPolicyInternal( const typename traits::execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 );
+
+  TeamPolicyInternal( const typename traits::execution_space & , int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 );
+
+  /** \brief  Construct policy with the default instance of the execution space */
+  TeamPolicyInternal( int league_size_request , int team_size_request , int vector_length_request = 1 );
+
+  TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 );
+
+/*  TeamPolicyInternal( int league_size_request , int team_size_request );
+
+  TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & );*/
+
+  /** \brief  The actual league size (number of teams) of the policy.
+   *
+   *  This may be smaller than the requested league size due to limitations
+   *  of the execution space.
+   */
+  KOKKOS_INLINE_FUNCTION int league_size() const ;
+
+  /** \brief  The actual team size (number of threads per team) of the policy.
+   *
+   *  This may be smaller than the requested team size due to limitations
+   *  of the execution space.
+   */
+  KOKKOS_INLINE_FUNCTION int team_size() const ;
+
+  inline typename traits::index_type chunk_size() const ;
+
+  inline TeamPolicyInternal set_chunk_size(int chunk_size) const ;
+
+  /** \brief  Parallel execution of a functor calls the functor once with
+   *          each member of the execution policy.
+   */
+  struct member_type {
+
+    /** \brief  Handle to the currently executing team shared scratch memory */
+    KOKKOS_INLINE_FUNCTION
+    typename traits::execution_space::scratch_memory_space team_shmem() const ;
+
+    /** \brief  Rank of this team within the league of teams */
+    KOKKOS_INLINE_FUNCTION int league_rank() const ;
+
+    /** \brief  Number of teams in the league */
+    KOKKOS_INLINE_FUNCTION int league_size() const ;
+
+    /** \brief  Rank of this thread within this team */
+    KOKKOS_INLINE_FUNCTION int team_rank() const ;
+
+    /** \brief  Number of threads in this team */
+    KOKKOS_INLINE_FUNCTION int team_size() const ;
+
+    /** \brief  Barrier among the threads of this team */
+    KOKKOS_INLINE_FUNCTION void team_barrier() const ;
+
+    /** \brief  Intra-team reduction. Returns join of all values of the team members. */
+    template< class JoinOp >
+    KOKKOS_INLINE_FUNCTION
+    typename JoinOp::value_type team_reduce( const typename JoinOp::value_type
+                                           , const JoinOp & ) const ;
+
+    /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+     *
+     *  The highest rank thread can compute the reduction total as
+     *    reduction_total = dev.team_scan( value ) + value ;
+     */
+    template< typename Type >
+    KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const ;
+
+    /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+     *          with intra-team non-deterministic ordering accumulation.
+     *
+     *  The global inter-team accumulation value will, at the end of the
+     *  league's parallel execution, be the scan's total.
+     *  Parallel execution ordering of the league's teams is non-deterministic.
+     *  As such the base value for each team's scan operation is similarly
+     *  non-deterministic.
+     */
+    template< typename Type >
+    KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const ;
+  };
+};
+
+
+  struct PerTeamValue {
+    int value;
+    PerTeamValue(int arg);
+  };
+
+  struct PerThreadValue {
+    int value;
+    PerThreadValue(int arg);
+  };
+
+  template<class iType, class ... Args>
+  struct ExtractVectorLength {
+    static inline iType value(typename std::enable_if<std::is_integral<iType>::value,iType>::type val, Args...) {
+      return val;
+    }
+    static inline typename std::enable_if<!std::is_integral<iType>::value,int>::type value(typename std::enable_if<!std::is_integral<iType>::value,iType>::type, Args...) {
+      return 1;
+    }
+  };
+
+  template<class iType, class ... Args>
+  inline typename std::enable_if<std::is_integral<iType>::value,iType>::type extract_vector_length(iType val, Args...) {
+    return val;
+  }
+
+  template<class iType, class ... Args>
+  inline typename std::enable_if<!std::is_integral<iType>::value,int>::type extract_vector_length(iType, Args...) {
+    return 1;
+  }
+
+}
+
+Impl::PerTeamValue PerTeam(const int& arg);
+Impl::PerThreadValue PerThread(const int& arg);
+
+struct ScratchRequest {
+  int level;
+
+  int per_team;
+  int per_thread;
+
+  inline
+  ScratchRequest(const int& level_, const Impl::PerTeamValue& team_value) {
+    level = level_;
+    per_team = team_value.value;
+    per_thread = 0;
+  }
+
+  inline
+  ScratchRequest(const int& level_, const Impl::PerThreadValue& thread_value) {
+    level = level_;
+    per_team = 0;
+    per_thread = thread_value.value;
+  }
+
+  inline
+  ScratchRequest(const int& level_, const Impl::PerTeamValue& team_value, const Impl::PerThreadValue& thread_value) {
+    level = level_;
+    per_team = team_value.value;
+    per_thread = thread_value.value;
+  }
+
+  inline
+  ScratchRequest(const int& level_, const Impl::PerThreadValue& thread_value, const Impl::PerTeamValue& team_value) {
+    level = level_;
+    per_team = team_value.value;
+    per_thread = thread_value.value;
+  }
+
+};
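+
+// A minimal usage sketch (byte counts are placeholders): PerTeam and PerThread
+// wrap team-level and thread-level scratch sizes, consumed either through a
+// ScratchRequest constructor argument or through TeamPolicy::set_scratch_size:
+//
+//   auto policy = Kokkos::TeamPolicy<>( league_size, Kokkos::AUTO )
+//                   .set_scratch_size( 0, Kokkos::PerTeam(4096), Kokkos::PerThread(256) );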
+
+
+/** \brief  Execution policy for parallel work over a league of teams of threads.
+ *
+ *  The work functor is called for each thread of each team such that
+ *  the team's member threads are guaranteed to be concurrent.
+ *
+ *  The team's threads have access to team shared scratch memory and
+ *  team collective operations.
+ *
+ *  If the WorkTag is non-void then the first calling argument of the
+ *  work functor's parentheses operator is 'const WorkTag &'.
+ *  This allows a functor to have multiple work member functions.
+ *
+ *  Order of template arguments does not matter, since the implementation
+ *  uses variadic templates. Each and any of the template arguments can
+ *  be omitted.
+ *
+ *  Possible Template arguments and their default values:
+ *    ExecutionSpace (DefaultExecutionSpace): where to execute code. Must be enabled.
+ *    WorkTag (none): Tag which is used as the first argument for the functor operator.
+ *    Schedule<Type> (Schedule<Static>): Scheduling Policy (Dynamic, or Static).
+ *    IndexType<Type> (IndexType<ExecutionSpace::size_type>): Integer index type used to iterate over the index space.
+ *    LaunchBounds<unsigned,unsigned>: Launch bounds for CUDA compilation;
+ *    the default of LaunchBounds<0,0> indicates no launch bounds specified.
+ */
+template< class ... Properties>
+class TeamPolicy: public
+  Impl::TeamPolicyInternal<
+     typename Impl::PolicyTraits<Properties ... >::execution_space,
+     Properties ...> {
+  typedef Impl::TeamPolicyInternal<
+       typename Impl::PolicyTraits<Properties ... >::execution_space,
+       Properties ...> internal_policy;
+
+  typedef Impl::PolicyTraits<Properties ... > traits;
+
+public:
+  typedef TeamPolicy execution_policy;
+
+  TeamPolicy& operator = (const TeamPolicy&) = default;
+
+  /** \brief  Construct policy with the given instance of the execution space */
+  TeamPolicy( const typename traits::execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 )
+    : internal_policy(typename traits::execution_space(),league_size_request,team_size_request, vector_length_request) {first_arg = false;}
+
+  TeamPolicy( const typename traits::execution_space & , int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 )
+    : internal_policy(typename traits::execution_space(),league_size_request,Kokkos::AUTO(), vector_length_request) {first_arg = false;}
+
+  /** \brief  Construct policy with the default instance of the execution space */
+  TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 )
+    : internal_policy(league_size_request,team_size_request, vector_length_request) {first_arg = false;}
+
+  TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 )
+    : internal_policy(league_size_request,Kokkos::AUTO(), vector_length_request) {first_arg = false;}
+
+  /** \brief  Construct policy with the given instance of the execution space */
+  template<class ... Args>
+  TeamPolicy( const typename traits::execution_space & , int league_size_request , int team_size_request , int vector_length_request,
+              Args ... args)
+    : internal_policy(typename traits::execution_space(),league_size_request,team_size_request, vector_length_request) {
+    first_arg = false;
+    set(args...);
+  }
+
+  template<class ... Args>
+  TeamPolicy( const typename traits::execution_space & , int league_size_request , const Kokkos::AUTO_t & , int vector_length_request ,
+              Args ... args)
+    : internal_policy(typename traits::execution_space(),league_size_request,Kokkos::AUTO(), vector_length_request) {
+    first_arg = false;
+    set(args...);
+  }
+
+  /** \brief  Construct policy with the default instance of the execution space */
+  template<class ... Args>
+  TeamPolicy( int league_size_request , int team_size_request , int vector_length_request ,
+              Args ... args)
+    : internal_policy(league_size_request,team_size_request, vector_length_request) {
+    first_arg = false;
+    set(args...);
+  }
+
+  template<class ... Args>
+  TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request ,
+              Args ... args)
+    : internal_policy(league_size_request,Kokkos::AUTO(), vector_length_request) {
+    first_arg = false;
+    set(args...);
+  }
+
+  /** \brief  Construct policy with the given instance of the execution space */
+  template<class ... Args>
+  TeamPolicy( const typename traits::execution_space & , int league_size_request , int team_size_request ,
+              Args ... args)
+    : internal_policy(typename traits::execution_space(),league_size_request,team_size_request,
+                      Kokkos::Impl::extract_vector_length<Args...>(args...)) {
+    first_arg = true;
+    set(args...);
+  }
+
+  template<class ... Args>
+  TeamPolicy( const typename traits::execution_space & , int league_size_request , const Kokkos::AUTO_t & ,
+              Args ... args)
+    : internal_policy(typename traits::execution_space(),league_size_request,Kokkos::AUTO(),
+                      Kokkos::Impl::extract_vector_length<Args...>(args...)) {
+    first_arg = true;
+    set(args...);
+  }
+
+  /** \brief  Construct policy with the default instance of the execution space */
+  template<class ... Args>
+  TeamPolicy( int league_size_request , int team_size_request ,
+              Args ... args)
+    : internal_policy(league_size_request,team_size_request,
+                      Kokkos::Impl::extract_vector_length<Args...>(args...)) {
+    first_arg = true;
+    set(args...);
+  }
+
+  template<class ... Args>
+  TeamPolicy( int league_size_request , const Kokkos::AUTO_t & ,
+              Args ... args)
+    : internal_policy(league_size_request,Kokkos::AUTO(),
+                      Kokkos::Impl::extract_vector_length<Args...>(args...)) {
+    first_arg = true;
+    set(args...);
+  }
+
+private:
+  bool first_arg;
+  TeamPolicy(const internal_policy& p):internal_policy(p) {first_arg = false;}
+
+  inline void set() {}
+
+public:
+  template<class ... Args>
+  inline void set(Args ...) {
+    static_assert( 0 == sizeof...(Args), "Kokkos::TeamPolicy: unhandled constructor arguments encountered.");
+  }
+
+  template<class iType, class ... Args>
+  inline typename std::enable_if<std::is_integral<iType>::value>::type set(iType, Args ... args) {
+    if(first_arg) {
+      first_arg = false;
+      set(args...);
+    } else {
+      first_arg = false;
+      Kokkos::Impl::throw_runtime_exception("Kokkos::TeamPolicy: integer argument to constructor in illegal place.");
+    }
+  }
+
+  template<class ... Args>
+  inline void set(const ChunkSize& chunksize, Args ... args) {
+    first_arg = false;
+    internal_policy::internal_set_chunk_size(chunksize.value);
+    set(args...);
+  }
+
+  template<class ... Args>
+  inline void set(const ScratchRequest& scr_request, Args ... args) {
+    first_arg = false;
+    internal_policy::internal_set_scratch_size(scr_request.level,Impl::PerTeamValue(scr_request.per_team),
+        Impl::PerThreadValue(scr_request.per_thread));
+    set(args...);
+  }
+
+  inline TeamPolicy set_chunk_size(int chunk) const {
+    return TeamPolicy(internal_policy::set_chunk_size(chunk));
+  };
+
+  inline TeamPolicy set_scratch_size(const int& level, const Impl::PerTeamValue& per_team) const {
+    return TeamPolicy(internal_policy::set_scratch_size(level,per_team));
+  };
+  inline TeamPolicy set_scratch_size(const int& level, const Impl::PerThreadValue& per_thread) const {
+    return TeamPolicy(internal_policy::set_scratch_size(level,per_thread));
+  };
+  inline TeamPolicy set_scratch_size(const int& level, const Impl::PerTeamValue& per_team, const Impl::PerThreadValue& per_thread) const {
+    return TeamPolicy(internal_policy::set_scratch_size(level, per_team, per_thread));
+  };
+  inline TeamPolicy set_scratch_size(const int& level, const Impl::PerThreadValue& per_thread, const Impl::PerTeamValue& per_team) const {
+    return TeamPolicy(internal_policy::set_scratch_size(level, per_team, per_thread));
+  };
+
+};
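+
+// A minimal usage sketch (league_size is a placeholder): the template
+// arguments listed above may appear in any order, and the requested team
+// size can be deferred to the implementation with Kokkos::AUTO, e.g.
+//
+//   typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic> > policy_type;
+//   Kokkos::parallel_for( policy_type( league_size, Kokkos::AUTO ),
+//     KOKKOS_LAMBDA( const policy_type::member_type & team ) {
+//       const int i = team.league_rank();
+//       // team-collective work for item i
+//     } );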
+
+namespace Impl {
+
+template<typename iType, class TeamMemberType>
+struct TeamThreadRangeBoundariesStruct {
+private:
+
+  KOKKOS_INLINE_FUNCTION static
+  iType ibegin( const iType & arg_begin
+              , const iType & arg_end
+              , const iType & arg_rank
+              , const iType & arg_size
+              )
+    {
+      return arg_begin + ( ( arg_end - arg_begin + arg_size - 1 ) / arg_size ) * arg_rank ;
+    }
+
+  KOKKOS_INLINE_FUNCTION static
+  iType iend( const iType & arg_begin
+            , const iType & arg_end
+            , const iType & arg_rank
+            , const iType & arg_size
+            )
+    {
+      const iType end_ = arg_begin + ( ( arg_end - arg_begin + arg_size - 1 ) / arg_size ) * ( arg_rank + 1 );
+      return end_ < arg_end ? end_ : arg_end ;
+    }
+
+public:
+
+  typedef iType index_type;
+  const iType start;
+  const iType end;
+  enum {increment = 1};
+  const TeamMemberType& thread;
+
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct( const TeamMemberType& arg_thread
+                                 , const iType& arg_end
+                                 )
+    : start( ibegin( 0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
+    , end(   iend(   0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
+    , thread( arg_thread )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct( const TeamMemberType& arg_thread
+                                , const iType& arg_begin
+                                , const iType& arg_end
+                                )
+    : start( ibegin( arg_begin , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
+    , end(   iend(   arg_begin , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
+    , thread( arg_thread )
+    {}
+};
+
+template<typename iType, class TeamMemberType>
+struct ThreadVectorRangeBoundariesStruct {
+  typedef iType index_type;
+  enum {start = 0};
+  const iType end;
+  enum {increment = 1};
+
+  KOKKOS_INLINE_FUNCTION
+  ThreadVectorRangeBoundariesStruct ( const TeamMemberType, const iType& count ) : end( count ) {}
+  KOKKOS_INLINE_FUNCTION
+  ThreadVectorRangeBoundariesStruct ( const iType& count ) : end( count ) {}
+};
+
+template<class TeamMemberType>
+struct ThreadSingleStruct {
+  const TeamMemberType& team_member;
+  KOKKOS_INLINE_FUNCTION
+  ThreadSingleStruct( const TeamMemberType& team_member_ ) : team_member( team_member_ ) {}
+};
+
+template<class TeamMemberType>
+struct VectorSingleStruct {
+  const TeamMemberType& team_member;
+  KOKKOS_INLINE_FUNCTION
+  VectorSingleStruct( const TeamMemberType& team_member_ ) : team_member( team_member_ ) {}
+};
+
+} // namespace Impl
+
+/** \brief  Execution policy for parallel work over the threads within a team.
+ *
+ *  The range is split over all threads in a team. The mapping scheme depends on the architecture.
+ *  This policy is used together with a parallel pattern as a nested layer within a kernel launched
+ *  with the TeamPolicy. This variant expects a single count, so the range is [0,count).
+ */
+template<typename iType, class TeamMemberType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType>
+TeamThreadRange( const TeamMemberType&, const iType& count );
+
+/** \brief  Execution policy for parallel work over the threads within a team.
+ *
+ *  The range is split over all threads in a team. The mapping scheme depends on the architecture.
+ *  This policy is used together with a parallel pattern as a nested layer within a kernel launched
+ *  with the TeamPolicy. This variant expects a begin and an end, so the range is [begin,end).
+ */
+template<typename iType1, typename iType2, class TeamMemberType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<typename std::common_type<iType1, iType2>::type, TeamMemberType>
+TeamThreadRange( const TeamMemberType&, const iType1& begin, const iType2& end );
+
+/** \brief  Execution policy for a vector parallel loop.
+ *
+ *  The range is split over all vector lanes in a thread. The mapping scheme depends on the architecture.
+ *  This policy is used together with a parallel pattern as a nested layer within a kernel launched
+ *  with the TeamPolicy. This variant expects a single count, so the range is [0,count).
+ */
+template<typename iType, class TeamMemberType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType>
+ThreadVectorRange( const TeamMemberType&, const iType& count );
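+
+// A minimal usage sketch (nrows, ncols and veclen are placeholders): these
+// nested policies are used inside a team-level kernel launched with a TeamPolicy:
+//
+//   Kokkos::parallel_for( Kokkos::TeamPolicy<>( nrows, Kokkos::AUTO ),
+//     KOKKOS_LAMBDA( const Kokkos::TeamPolicy<>::member_type & team ) {
+//       const int row = team.league_rank();
+//       Kokkos::parallel_for( Kokkos::TeamThreadRange( team, ncols ), [&]( const int col ) {
+//         Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, veclen ), [&]( const int k ) {
+//           // innermost, vectorized work on (row, col, k)
+//         });
+//       });
+//     });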
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+namespace Impl {
+
+template<typename FunctorType, typename TagType,
+  bool HasTag = !std::is_same<TagType, void>::value >
+struct ParallelConstructName;
+
+template<typename FunctorType, typename TagType>
+struct ParallelConstructName<FunctorType, TagType, true> {
+  ParallelConstructName(std::string const& label):label_ref(label) {
+    if (label.empty()) {
+      default_name = std::string(typeid(FunctorType).name()) + "/" +
+        typeid(TagType).name();
+    }
+  }
+  std::string const& get() {
+    return (label_ref.empty()) ? default_name : label_ref;
+  }
+  std::string const& label_ref;
+  std::string default_name;
+};
+
+template<typename FunctorType, typename TagType>
+struct ParallelConstructName<FunctorType, TagType, false> {
+  ParallelConstructName(std::string const& label):label_ref(label) {
+    if (label.empty()) {
+      default_name = std::string(typeid(FunctorType).name());
+    }
+  }
+  std::string const& get() {
+    return (label_ref.empty()) ? default_name : label_ref;
+  }
+  std::string const& label_ref;
+  std::string default_name;
+};
+
+} // namespace Impl
+#endif /* defined KOKKOS_ENABLE_PROFILING */
+
+} // namespace Kokkos
+
+#endif /* #define KOKKOS_EXECPOLICY_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b7c65f36a304f700c4d74cfe52f1d841d01e438d
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -0,0 +1,342 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HBWSPACE_HPP
+#define KOKKOS_HBWSPACE_HPP
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_HBWSPACE
+
+#include <Kokkos_HostSpace.hpp>
+
+namespace Kokkos {
+
+namespace Experimental {
+
+namespace Impl {
+
+/// \brief Initialize lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function initializes the locks to zero (unset).
+void init_lock_array_hbw_space();
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+bool lock_address_hbw_space( void* ptr );
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully acquiring a lock with
+/// lock_address.
+void unlock_address_hbw_space( void* ptr );
+
+} // namespace Impl
+
+} // namespace Experimental
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+namespace Experimental {
+
+/// \class HBWSpace
+/// \brief Memory management for high-bandwidth memory.
+///
+/// HBWSpace is a memory space that governs high-bandwidth memory.
+/// Like HostSpace it is CPU-accessible, but allocations are placed in
+/// the high-bandwidth memory pool when one is available.
+class HBWSpace {
+public:
+  //! Tag this class as a kokkos memory space
+  typedef HBWSpace  memory_space;
+  typedef size_t     size_type;
+
+  /// \typedef execution_space
+  /// \brief Default execution space for this memory space.
+  ///
+  /// Every memory space has a default execution space.  This is
+  /// useful for things like initializing a View (which happens in
+  /// parallel using the View's default execution space).
+#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef Kokkos::OpenMP    execution_space;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Kokkos::Threads   execution_space;
+//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+//  typedef Kokkos::Qthreads  execution_space;
+#elif defined( KOKKOS_ENABLE_OPENMP )
+  typedef Kokkos::OpenMP    execution_space;
+#elif defined( KOKKOS_ENABLE_THREADS )
+  typedef Kokkos::Threads   execution_space;
+//#elif defined( KOKKOS_ENABLE_QTHREADS )
+//  typedef Kokkos::Qthreads  execution_space;
+#elif defined( KOKKOS_ENABLE_SERIAL )
+  typedef Kokkos::Serial    execution_space;
+#else
+#  error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial.  You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
+#endif
+
+  //! This memory space preferred device_type
+  typedef Kokkos::Device< execution_space, memory_space > device_type;
+
+  /**\brief  Default memory space instance */
+  HBWSpace();
+  HBWSpace( const HBWSpace & rhs ) = default;
+  HBWSpace & operator = ( const HBWSpace & ) = default;
+  ~HBWSpace() = default;
+
+  /**\brief  Non-default memory space instance to choose allocation mechanism, if available */
+
+  enum AllocationMechanism { STD_MALLOC, POSIX_MEMALIGN, POSIX_MMAP, INTEL_MM_ALLOC };
+
+  explicit
+  HBWSpace( const AllocationMechanism & );
+
+  /**\brief  Allocate untracked memory in the space */
+  void * allocate( const size_t arg_alloc_size ) const;
+
+  /**\brief  Deallocate untracked memory in the space */
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const;
+
+  /**\brief Return Name of the MemorySpace */
+  static constexpr const char* name() { return "HBW"; }
+
+private:
+
+  AllocationMechanism  m_alloc_mech;
+  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >;
+};
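+
+// A minimal usage sketch (n is a placeholder), assuming Kokkos was configured
+// with high-bandwidth memory support: a View is placed in this space by naming
+// it explicitly as the memory space template argument, e.g.
+//
+//   Kokkos::View< double*, Kokkos::Experimental::HBWSpace > x( "x", n );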
+
+} // namespace Experimental
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+template<>
+class SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >
+  : public SharedAllocationRecord< void, void >
+{
+private:
+
+  friend Kokkos::Experimental::HBWSpace;
+
+  typedef SharedAllocationRecord< void, void >  RecordBase;
+
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete;
+
+  static void deallocate( RecordBase * );
+
+  /**\brief  Root record for tracked allocations from this HBWSpace instance */
+  static RecordBase s_root_record;
+
+  const Kokkos::Experimental::HBWSpace m_space;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() = default;
+
+  SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
+                        , const std::string                    & arg_label
+                        , const size_t                           arg_alloc_size
+                        , const RecordBase::function_type        arg_dealloc = & deallocate
+                        );
+
+public:
+
+  inline
+  std::string get_label() const
+    {
+      return std::string( RecordBase::head()->m_label );
+    }
+
+  KOKKOS_INLINE_FUNCTION static
+  SharedAllocationRecord * allocate( const Kokkos::Experimental::HBWSpace & arg_space
+                                   , const std::string                    & arg_label
+                                   , const size_t                           arg_alloc_size
+                                   )
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      return new SharedAllocationRecord( arg_space, arg_label, arg_alloc_size );
+#else
+      return (SharedAllocationRecord *) 0;
+#endif
+    }
+
+  /**\brief  Allocate tracked memory in the space */
+  static
+  void * allocate_tracked( const Kokkos::Experimental::HBWSpace & arg_space
+                         , const std::string                    & arg_label
+                         , const size_t                           arg_alloc_size );
+
+  /**\brief  Reallocate tracked memory in the space */
+  static
+  void * reallocate_tracked( void * const arg_alloc_ptr
+                           , const size_t arg_alloc_size );
+
+  /**\brief  Deallocate tracked memory in the space */
+  static
+  void deallocate_tracked( void * const arg_alloc_ptr );
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  static void print_records( std::ostream &, const Kokkos::Experimental::HBWSpace &, bool detail = false );
+};
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::HBWSpace, Kokkos::Experimental::HBWSpace >::assignable, "" );
+
+template<>
+struct MemorySpaceAccess< Kokkos::HostSpace, Kokkos::Experimental::HBWSpace > {
+  enum { assignable = true };
+  enum { accessible = true };
+  enum { deepcopy   = true };
+};
+
+template<>
+struct MemorySpaceAccess< Kokkos::Experimental::HBWSpace, Kokkos::HostSpace > {
+  enum { assignable = false };
+  enum { accessible = true };
+  enum { deepcopy   = true };
+};
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+template< class ExecutionSpace >
+struct DeepCopy< Experimental::HBWSpace, Experimental::HBWSpace, ExecutionSpace > {
+  DeepCopy( void * dst, const void * src, size_t n ) {
+    memcpy( dst, src, n );
+  }
+
+  DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
+    exec.fence();
+    memcpy( dst, src, n );
+  }
+};
+
+template< class ExecutionSpace >
+struct DeepCopy< HostSpace, Experimental::HBWSpace, ExecutionSpace > {
+  DeepCopy( void * dst, const void * src, size_t n ) {
+    memcpy( dst, src, n );
+  }
+
+  DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
+    exec.fence();
+    memcpy( dst, src, n );
+  }
+};
+
+template< class ExecutionSpace >
+struct DeepCopy< Experimental::HBWSpace, HostSpace, ExecutionSpace > {
+  DeepCopy( void * dst, const void * src, size_t n ) {
+    memcpy( dst, src, n );
+  }
+
+  DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
+    exec.fence();
+    memcpy( dst, src, n );
+  }
+};
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+namespace Impl {
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace, Kokkos::Experimental::HBWSpace >
+{
+  enum { value = true };
+  inline static void verify( void ) { }
+  inline static void verify( const void * ) { }
+};
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace, Kokkos::HostSpace >
+{
+  enum { value = true };
+  inline static void verify( void ) { }
+  inline static void verify( const void * ) { }
+};
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+#endif
+#endif // #define KOKKOS_HBWSPACE_HPP
+
diff --git a/packages/kokkos/core/src/Kokkos_HostSpace.hpp b/packages/kokkos/core/src/Kokkos_HostSpace.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c84146ecf5e127936ef3fcd21499562eef26476f
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_HostSpace.hpp
@@ -0,0 +1,311 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HOSTSPACE_HPP
+#define KOKKOS_HOSTSPACE_HPP
+
+#include <cstring>
+#include <string>
+#include <iosfwd>
+#include <typeinfo>
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Concepts.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_SharedAlloc.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+namespace Impl {
+
+/// \brief Initialize lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function initializes the locks to zero (unset).
+void init_lock_array_host_space();
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+bool lock_address_host_space(void* ptr);
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully acquiring a lock with
+/// lock_address.
+void unlock_address_host_space( void* ptr );
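+
+// A rough sketch of how these locks back atomics on oversized types
+// (dest, op and value are placeholders):
+//
+//   while ( ! Kokkos::Impl::lock_address_host_space( (void*) dest ) ) ; // spin until acquired
+//   *dest = op( *dest, value );                                         // update under the lock
+//   Kokkos::Impl::unlock_address_host_space( (void*) dest );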
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+/// \class HostSpace
+/// \brief Memory management for host memory.
+///
+/// HostSpace is a memory space that governs host memory.  "Host"
+/// memory means the usual CPU-accessible memory.
+class HostSpace {
+public:
+  //! Tag this class as a kokkos memory space
+  typedef HostSpace  memory_space;
+  typedef size_t     size_type;
+
+  /// \typedef execution_space
+  /// \brief Default execution space for this memory space.
+  ///
+  /// Every memory space has a default execution space.  This is
+  /// useful for things like initializing a View (which happens in
+  /// parallel using the View's default execution space).
+#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef Kokkos::OpenMP    execution_space;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Kokkos::Threads   execution_space;
+//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+//  typedef Kokkos::Qthreads  execution_space;
+#elif defined( KOKKOS_ENABLE_OPENMP )
+  typedef Kokkos::OpenMP    execution_space;
+#elif defined( KOKKOS_ENABLE_THREADS )
+  typedef Kokkos::Threads   execution_space;
+//#elif defined( KOKKOS_ENABLE_QTHREADS )
+//  typedef Kokkos::Qthreads  execution_space;
+#elif defined( KOKKOS_ENABLE_SERIAL )
+  typedef Kokkos::Serial    execution_space;
+#else
+#  error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial.  You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
+#endif
+
+  //! This memory space preferred device_type
+  typedef Kokkos::Device< execution_space, memory_space > device_type;
+
+  /**\brief  Default memory space instance */
+  HostSpace();
+  HostSpace( HostSpace && rhs ) = default;
+  HostSpace( const HostSpace & rhs ) = default;
+  HostSpace & operator = ( HostSpace && ) = default;
+  HostSpace & operator = ( const HostSpace & ) = default;
+  ~HostSpace() = default;
+
+  /**\brief  Non-default memory space instance to choose allocation mechanism, if available */
+
+  enum AllocationMechanism { STD_MALLOC, POSIX_MEMALIGN, POSIX_MMAP, INTEL_MM_ALLOC };
+
+  explicit
+  HostSpace( const AllocationMechanism & );
+
+  /**\brief  Allocate untracked memory in the space */
+  void * allocate( const size_t arg_alloc_size ) const;
+
+  /**\brief  Deallocate untracked memory in the space */
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const;
+
+  /**\brief Return Name of the MemorySpace */
+  static constexpr const char* name() { return m_name; }
+
+private:
+  AllocationMechanism  m_alloc_mech;
+  static constexpr const char* m_name = "Host";
+  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace, void >;
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::HostSpace >::assignable, "" );
+
+template< typename S >
+struct HostMirror {
+private:
+  // If input execution space can access HostSpace then keep it.
+  // Example: Kokkos::OpenMP can access, Kokkos::Cuda cannot
+  enum { keep_exe = Kokkos::Impl::MemorySpaceAccess
+                      < typename S::execution_space::memory_space, Kokkos::HostSpace >::accessible };
+
+  // If HostSpace can access memory space then keep it.
+  // Example:  Cannot access Kokkos::CudaSpace, can access Kokkos::CudaUVMSpace
+  enum { keep_mem = Kokkos::Impl::MemorySpaceAccess
+                      < Kokkos::HostSpace, typename S::memory_space >::accessible };
+
+public:
+
+  typedef typename std::conditional
+    < keep_exe && keep_mem /* Can keep whole space */
+    , S
+    , typename std::conditional
+        < keep_mem /* Can keep memory space, use default Host execution space */
+        , Kokkos::Device< Kokkos::HostSpace::execution_space
+                        , typename S::memory_space >
+        , Kokkos::HostSpace
+        >::type
+    >::type  Space;
+};
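+
+// A minimal usage sketch (n is a placeholder): HostMirror selects the
+// host-accessible space used when mirroring a view, e.g. via create_mirror_view:
+//
+//   Kokkos::View< double* > d_x( "x", n );
+//   auto h_x = Kokkos::create_mirror_view( d_x );  // host-accessible mirror
+//   Kokkos::deep_copy( h_x, d_x );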
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+template<>
+class SharedAllocationRecord< Kokkos::HostSpace, void >
+  : public SharedAllocationRecord< void, void >
+{
+private:
+  friend Kokkos::HostSpace;
+
+  typedef SharedAllocationRecord< void, void >  RecordBase;
+
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete;
+
+  static void deallocate( RecordBase * );
+
+  /**\brief  Root record for tracked allocations from this HostSpace instance */
+  static RecordBase s_root_record;
+
+  const Kokkos::HostSpace m_space;
+
+protected:
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() = default;
+
+  SharedAllocationRecord( const Kokkos::HostSpace        & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+
+  inline
+  std::string get_label() const
+  {
+    return std::string( RecordBase::head()->m_label );
+  }
+
+  KOKKOS_INLINE_FUNCTION static
+  SharedAllocationRecord * allocate( const Kokkos::HostSpace &  arg_space
+                                   , const std::string       &  arg_label
+                                   , const size_t               arg_alloc_size
+                                   )
+  {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    return new SharedAllocationRecord( arg_space, arg_label, arg_alloc_size );
+#else
+    return (SharedAllocationRecord *) 0;
+#endif
+  }
+
+
+  /**\brief  Allocate tracked memory in the space */
+  static
+  void * allocate_tracked( const Kokkos::HostSpace & arg_space
+                         , const std::string & arg_label
+                         , const size_t arg_alloc_size );
+
+  /**\brief  Reallocate tracked memory in the space */
+  static
+  void * reallocate_tracked( void * const arg_alloc_ptr
+                           , const size_t arg_alloc_size );
+
+  /**\brief  Deallocate tracked memory in the space */
+  static
+  void deallocate_tracked( void * const arg_alloc_ptr );
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  static void print_records( std::ostream &, const Kokkos::HostSpace &, bool detail = false );
+};
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+template< class DstSpace, class SrcSpace, class ExecutionSpace = typename DstSpace::execution_space > struct DeepCopy;
+
+template< class ExecutionSpace >
+struct DeepCopy< HostSpace, HostSpace, ExecutionSpace > {
+  DeepCopy( void * dst, const void * src, size_t n ) {
+    memcpy( dst, src, n );
+  }
+
+  DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
+    exec.fence();
+    memcpy( dst, src, n );
+  }
+};
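+
+// A minimal usage sketch (n is a placeholder): these Impl::DeepCopy
+// specializations back Kokkos::deep_copy between views living in the
+// corresponding memory spaces, e.g.
+//
+//   Kokkos::View< double*, Kokkos::HostSpace > a( "a", n ), b( "b", n );
+//   Kokkos::deep_copy( a, b );  // dispatches to DeepCopy< HostSpace, HostSpace >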
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+#endif // #define KOKKOS_HOSTSPACE_HPP
+
diff --git a/packages/kokkos/core/src/Kokkos_Layout.hpp b/packages/kokkos/core/src/Kokkos_Layout.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b0f92d8cf10cec45ff5d9e7130af8d0d682aa26f
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Layout.hpp
@@ -0,0 +1,241 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Layout.hpp
+/// \brief Declaration of various \c MemoryLayout options.
+
+#ifndef KOKKOS_LAYOUT_HPP
+#define KOKKOS_LAYOUT_HPP
+
+#include <cstddef>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+namespace Kokkos {
+
+enum { ARRAY_LAYOUT_MAX_RANK = 8 };
+
+//----------------------------------------------------------------------------
+/// \struct LayoutLeft
+/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
+///   striding of multi-indices.
+///
+/// This is an example of a \c MemoryLayout template parameter of
+/// View.  The memory layout describes how View maps from a
+/// multi-index (i0, i1, ..., ik) to a memory location.
+///
+/// "Layout left" indicates a mapping where the leftmost index i0
+/// refers to contiguous access, and strides increase for dimensions
+/// going right from there (i1, i2, ...).  This layout imitates how
+/// Fortran stores multi-dimensional arrays.  For the special case of
+/// a two-dimensional array, "layout left" is also called "column
+/// major."
+struct LayoutLeft {
+  //! Tag this class as a kokkos array layout
+  typedef LayoutLeft array_layout ;
+
+  size_t dimension[ ARRAY_LAYOUT_MAX_RANK ];
+
+  LayoutLeft( LayoutLeft const & ) = default ;
+  LayoutLeft( LayoutLeft && ) = default ;
+  LayoutLeft & operator = ( LayoutLeft const & ) = default ;
+  LayoutLeft & operator = ( LayoutLeft && ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  explicit constexpr
+  LayoutLeft( size_t N0 = 0 , size_t N1 = 0 , size_t N2 = 0 , size_t N3 = 0
+            , size_t N4 = 0 , size_t N5 = 0 , size_t N6 = 0 , size_t N7 = 0 )
+    : dimension { N0 , N1 , N2 , N3 , N4 , N5 , N6 , N7 } {}
+};
+
+//----------------------------------------------------------------------------
+/// \struct LayoutRight
+/// \brief Memory layout tag indicating right-to-left (C or
+///   lexicographical scheme) striding of multi-indices.
+///
+/// This is an example of a \c MemoryLayout template parameter of
+/// View.  The memory layout describes how View maps from a
+/// multi-index (i0, i1, ..., ik) to a memory location.
+///
+/// "Right layout" indicates a mapping where the rightmost index ik
+/// refers to contiguous access, and strides increase for dimensions
+/// going left from there.  This layout imitates how C stores
+/// multi-dimensional arrays.  For the special case of a
+/// two-dimensional array, "layout right" is also called "row major."
+struct LayoutRight {
+  //! Tag this class as a kokkos array layout
+  typedef LayoutRight array_layout ;
+
+  size_t dimension[ ARRAY_LAYOUT_MAX_RANK ];
+
+  LayoutRight( LayoutRight const & ) = default ;
+  LayoutRight( LayoutRight && ) = default ;
+  LayoutRight & operator = ( LayoutRight const & ) = default ;
+  LayoutRight & operator = ( LayoutRight && ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  explicit constexpr
+  LayoutRight( size_t N0 = 0 , size_t N1 = 0 , size_t N2 = 0 , size_t N3 = 0
+             , size_t N4 = 0 , size_t N5 = 0 , size_t N6 = 0 , size_t N7 = 0 )
+    : dimension { N0 , N1 , N2 , N3 , N4 , N5 , N6 , N7 } {}
+};
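+
+// A worked sketch of the two mappings: for a rank-2 view with dimensions
+// (N0, N1), the index pair (i0, i1) maps to a flat offset as
+//
+//   LayoutLeft  (column major):  offset = i0 + N0 * i1
+//   LayoutRight (row major):     offset = i1 + N1 * i0
+//
+// so the leftmost index is stride-1 for LayoutLeft and the rightmost index
+// is stride-1 for LayoutRight.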
+
+//----------------------------------------------------------------------------
+/// \struct LayoutStride
+/// \brief  Memory layout tag indicating arbitrarily strided
+///         multi-index mapping into contiguous memory.
+struct LayoutStride {
+
+  //! Tag this class as a kokkos array layout
+  typedef LayoutStride array_layout ;
+
+  size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
+  size_t stride[ ARRAY_LAYOUT_MAX_RANK ] ;
+
+  LayoutStride( LayoutStride const & ) = default ;
+  LayoutStride( LayoutStride && ) = default ;
+  LayoutStride & operator = ( LayoutStride const & ) = default ;
+  LayoutStride & operator = ( LayoutStride && ) = default ;
+
+  /** \brief  Compute strides from ordered dimensions.
+   *
+   *  Values of order uniquely form the set [0..rank)
+   *  and specify ordering of the dimensions.
+   *  Order = {0,1,2,...} is LayoutLeft
+   *  Order = {...,2,1,0} is LayoutRight
+   */
+  template< typename iTypeOrder , typename iTypeDimen >
+  KOKKOS_INLINE_FUNCTION static
+  LayoutStride order_dimensions( int const rank
+                               , iTypeOrder const * const order
+                               , iTypeDimen const * const dimen )
+    {
+      LayoutStride tmp ;
+      // Verify valid rank order:
+      int check_input = ARRAY_LAYOUT_MAX_RANK < rank ? 0 : int( 1 << rank ) - 1 ;
+      for ( int r = 0 ; r < ARRAY_LAYOUT_MAX_RANK ; ++r ) {
+        tmp.dimension[r] = 0 ;
+        tmp.stride[r]    = 0 ;
+      }
+      for ( int r = 0 ; r < rank ; ++r ) {
+        check_input &= ~int( 1 << order[r] );
+      }
+      if ( 0 == check_input ) {
+        size_t n = 1 ;
+        for ( int r = 0 ; r < rank ; ++r ) {
+          tmp.stride[ order[r] ] = n ;
+          n *= ( dimen[order[r]] );
+          tmp.dimension[r] = dimen[r];
+        }
+      }
+      return tmp ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  explicit constexpr
+  LayoutStride( size_t N0 = 0 , size_t S0 = 0
+              , size_t N1 = 0 , size_t S1 = 0
+              , size_t N2 = 0 , size_t S2 = 0
+              , size_t N3 = 0 , size_t S3 = 0
+              , size_t N4 = 0 , size_t S4 = 0
+              , size_t N5 = 0 , size_t S5 = 0
+              , size_t N6 = 0 , size_t S6 = 0
+              , size_t N7 = 0 , size_t S7 = 0
+              )
+    : dimension { N0 , N1 , N2 , N3 , N4 , N5 , N6 , N7 }
+    , stride    { S0 , S1 , S2 , S3 , S4 , S5 , S6 , S7 }
+    {}
+};
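+
+// A minimal usage sketch (N0, N1, N2 are placeholders): with the identity
+// order {0,1,2}, order_dimensions reproduces LayoutLeft-style strides:
+//
+//   const int order[3] = { 0, 1, 2 };
+//   const int dimen[3] = { N0, N1, N2 };
+//   Kokkos::LayoutStride layout = Kokkos::LayoutStride::order_dimensions( 3, order, dimen );
+//   // layout.stride == { 1, N0, N0*N1, 0, 0, 0, 0, 0 }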
+
+//----------------------------------------------------------------------------
+/// \struct LayoutTileLeft
+/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
+///   striding of multi-indices by tiles.
+///
+/// This is an example of a \c MemoryLayout template parameter of
+/// View.  The memory layout describes how View maps from a
+/// multi-index (i0, i1, ..., ik) to a memory location.
+///
+/// "Tiled layout" indicates a mapping to contiguously stored
+/// <tt>ArgN0</tt> by <tt>ArgN1</tt> tiles for the rightmost two
+/// dimensions.  Indices are LayoutLeft within each tile, and the
+/// tiles themselves are arranged using LayoutLeft.  Note that the
+/// dimensions <tt>ArgN0</tt> and <tt>ArgN1</tt> of the tiles must be
+/// compile-time constants.  This speeds up index calculations.  If
+/// both tile dimensions are powers of two, Kokkos can optimize
+/// further.
+template < unsigned ArgN0 , unsigned ArgN1 ,
+           bool IsPowerOfTwo = ( Impl::is_integral_power_of_two(ArgN0) &&
+                                 Impl::is_integral_power_of_two(ArgN1) )
+         >
+struct LayoutTileLeft {
+
+  static_assert( Impl::is_integral_power_of_two(ArgN0) &&
+                 Impl::is_integral_power_of_two(ArgN1)
+               , "LayoutTileLeft must be given power-of-two tile dimensions" );
+
+  //! Tag this class as a kokkos array layout
+  typedef LayoutTileLeft<ArgN0,ArgN1,IsPowerOfTwo> array_layout ;
+
+  enum { N0 = ArgN0 };
+  enum { N1 = ArgN1 };
+
+  size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
+
+  LayoutTileLeft( LayoutTileLeft const & ) = default ;
+  LayoutTileLeft( LayoutTileLeft && ) = default ;
+  LayoutTileLeft & operator = ( LayoutTileLeft const & ) = default ;
+  LayoutTileLeft & operator = ( LayoutTileLeft && ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  explicit constexpr
+  LayoutTileLeft( size_t argN0 = 0 , size_t argN1 = 0 , size_t argN2 = 0 , size_t argN3 = 0
+                , size_t argN4 = 0 , size_t argN5 = 0 , size_t argN6 = 0 , size_t argN7 = 0
+                )
+    : dimension { argN0 , argN1 , argN2 , argN3 , argN4 , argN5 , argN6 , argN7 } {}
+};
+
+} // namespace Kokkos
+
+#endif // #ifndef KOKKOS_LAYOUT_HPP
+
diff --git a/packages/kokkos/core/src/Kokkos_Macros.hpp b/packages/kokkos/core/src/Kokkos_Macros.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..696bdcd520f5119e27876c4833b8a407ed040ce4
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Macros.hpp
@@ -0,0 +1,532 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MACROS_HPP
+#define KOKKOS_MACROS_HPP
+
+//----------------------------------------------------------------------------
+/** Pick up configure / build options via #define macros:
+ *
+ *  KOKKOS_ENABLE_CUDA                Kokkos::Cuda execution and memory spaces
+ *  KOKKOS_ENABLE_THREADS             Kokkos::Threads execution space
+ *  KOKKOS_ENABLE_QTHREADS            Kokkos::Qthreads execution space
+ *  KOKKOS_ENABLE_OPENMP              Kokkos::OpenMP execution space
+ *  KOKKOS_ENABLE_OPENMPTARGET        Kokkos::Experimental::OpenMPTarget execution space
+ *  KOKKOS_ENABLE_HWLOC               HWLOC library is available.
+ *  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK  Insert array bounds checks; this is expensive!
+ *  KOKKOS_ENABLE_MPI                 Negotiate MPI/execution space interactions.
+ *  KOKKOS_ENABLE_CUDA_UVM            Use CUDA UVM for Cuda memory space.
+ */
+
+#ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
+  #include <KokkosCore_config.h>
+#endif
+
+#include <impl/Kokkos_OldMacros.hpp>
+
+//----------------------------------------------------------------------------
+/** Pick up compiler specific #define macros:
+ *
+ *  Macros for known compilers evaluate to an integral version value
+ *
+ *  KOKKOS_COMPILER_NVCC
+ *  KOKKOS_COMPILER_GNU
+ *  KOKKOS_COMPILER_INTEL
+ *  KOKKOS_COMPILER_IBM
+ *  KOKKOS_COMPILER_CRAYC
+ *  KOKKOS_COMPILER_APPLECC
+ *  KOKKOS_COMPILER_CLANG
+ *  KOKKOS_COMPILER_PGI
+ *
+ *  Macros for which compiler extension to use for atomics on intrinsic types
+ *
+ *  KOKKOS_ENABLE_CUDA_ATOMICS
+ *  KOKKOS_ENABLE_GNU_ATOMICS
+ *  KOKKOS_ENABLE_INTEL_ATOMICS
+ *  KOKKOS_ENABLE_OPENMP_ATOMICS
+ *
+ *  A suite of 'KOKKOS_ENABLE_PRAGMA_...' macros is defined for internal use.
+ *
+ *  Macros for marking functions to run in an execution space:
+ *
+ *  KOKKOS_FUNCTION
+ *  KOKKOS_INLINE_FUNCTION        request compiler to inline
+ *  KOKKOS_FORCEINLINE_FUNCTION   force compiler to inline, use with care!
+ */
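+//
+//  For example (illustrative sketch only), a functor meant to run in any
+//  enabled execution space marks its call operator with one of these macros:
+//
+//    struct AXPY {
+//      double a ; double * x ; double * y ;
+//      KOKKOS_INLINE_FUNCTION
+//      void operator()( const int i ) const { y[i] += a * x[i] ; }
+//    };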
+
+//----------------------------------------------------------------------------
+
+#if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_THREADS) || \
+    defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_QTHREADS) || \
+    defined(KOKKOS_ENABLE_ROCM) || defined(KOKKOS_ENABLE_OPENMPTARGET)
+  #define KOKKOS_INTERNAL_ENABLE_NON_CUDA_BACKEND
+#endif
+
+#if !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_CUDA) && \
+    !defined(KOKKOS_ENABLE_OPENMP) && !defined(KOKKOS_ENABLE_QTHREADS) && \
+    !defined(KOKKOS_ENABLE_ROCM) && !defined(KOKKOS_ENABLE_OPENMPTARGET)
+  #define KOKKOS_INTERNAL_NOT_PARALLEL
+#endif
+
+#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+
+#if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
+  // Compiling with a CUDA compiler.
+  //
+  //  Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
+  //    CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
+  //
+  //  When generating device code the __CUDA_ARCH__ macro is defined as:
+  //    __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
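+  //
+  //  For example, CUDA 7.5 reports CUDA_VERSION == 7050 and a device of
+  //  compute capability 3.5 reports __CUDA_ARCH__ == 350.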
+
+  #include <cuda_runtime.h>
+  #include <cuda.h>
+
+  #if !defined( CUDA_VERSION )
+    #error "#include <cuda.h> did not define CUDA_VERSION."
+  #endif
+
+  #if ( CUDA_VERSION < 7000 )
+    // CUDA supports C++11 in device code starting with version 7.0.
+    // This includes auto type and device code internal lambdas.
+    #error "Cuda version 7.0 or greater required."
+  #endif
+
+  #if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 )
+    // Compiling with CUDA compiler for device code.
+    #error "Cuda device capability >= 3.0 is required."
+  #endif
+
+  #ifdef KOKKOS_ENABLE_CUDA_LAMBDA
+    #if ( CUDA_VERSION < 7050 )
+      // CUDA supports passing C++11 lambdas generated in host code to the
+      // device starting with version 7.5. However, the 7.5 release candidate
+      // (7.5.6) still identifies itself as version 7.0.
+      #error "Cuda version 7.5 or greater required for host-to-device Lambda support."
+    #endif
+
+    #if ( CUDA_VERSION < 8000 ) && defined( __NVCC__ )
+      #define KOKKOS_LAMBDA [=]__device__
+      #if defined( KOKKOS_INTERNAL_ENABLE_NON_CUDA_BACKEND )
+        #undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+      #endif
+    #else
+      #define KOKKOS_LAMBDA [=]__host__ __device__
+
+      #if defined( KOKKOS_ENABLE_CXX1Z )
+        #define KOKKOS_CLASS_LAMBDA        [=,*this] __host__ __device__
+      #endif
+    #endif
+
+    #if defined( __NVCC__ )
+      #define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
+    #endif
+  #else // !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
+    #undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+  #endif // !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
+#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
+
+//----------------------------------------------------------------------------
+// Language info: C++, CUDA, OPENMP
+
+#if defined( KOKKOS_ENABLE_CUDA )
+  // Compiling Cuda code to 'ptx'
+
+  #define KOKKOS_FORCEINLINE_FUNCTION  __device__  __host__  __forceinline__
+  #define KOKKOS_INLINE_FUNCTION       __device__  __host__  inline
+  #define KOKKOS_FUNCTION              __device__  __host__
+  #ifdef KOKKOS_COMPILER_CLANG
+  #define KOKKOS_INLINE_FUNCTION_DEFAULTED KOKKOS_INLINE_FUNCTION
+  #define KOKKOS_FUNCTION_DEFAULTED KOKKOS_FUNCTION
+  #endif
+#endif // #if defined( KOKKOS_ENABLE_CUDA )
+
+#if defined( KOKKOS_ENABLE_ROCM ) && defined( __HCC__ )
+
+  #define KOKKOS_FORCEINLINE_FUNCTION  __attribute__((amp,cpu)) inline
+  #define KOKKOS_INLINE_FUNCTION       __attribute__((amp,cpu)) inline
+  #define KOKKOS_FUNCTION              __attribute__((amp,cpu))
+  #define KOKKOS_LAMBDA                [=] __attribute__((amp,cpu))
+  #define KOKKOS_INLINE_FUNCTION_DEFAULTED KOKKOS_INLINE_FUNCTION
+  #define KOKKOS_FUNCTION_DEFAULTED    KOKKOS_FUNCTION
+#endif
+
+#if defined( _OPENMP )
+  //  Compiling with OpenMP.
+  //  The value of _OPENMP is an integer value YYYYMM
+  //  where YYYY and MM are the year and month designation
+  //  of the supported OpenMP API version.
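+  //
+  //  For example, an OpenMP 4.0 implementation defines _OPENMP as 201307
+  //  and an OpenMP 4.5 implementation defines it as 201511.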
+#endif // #if defined( _OPENMP )
+
+//----------------------------------------------------------------------------
+// Mapping compiler built-ins to KOKKOS_COMPILER_*** macros
+
+#if defined( __NVCC__ )
+  // NVIDIA compiler is being used.
+  // Code is parsed and separated into host and device code.
+  // Host code is compiled again with another compiler.
+  // Device code is compiled to 'ptx'.
+  #define KOKKOS_COMPILER_NVCC __NVCC__
+#endif // #if defined( __NVCC__ )
+
+#if !defined( KOKKOS_LAMBDA )
+  #define KOKKOS_LAMBDA [=]
+#endif
+
+#if defined( KOKKOS_ENABLE_CXX1Z ) && !defined( KOKKOS_CLASS_LAMBDA )
+  #define KOKKOS_CLASS_LAMBDA [=,*this]
+#endif
+
+//#if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'.
+
+// Intel compiler for host code.
+
+#if defined( __INTEL_COMPILER )
+  #define KOKKOS_COMPILER_INTEL __INTEL_COMPILER
+#elif defined( __ICC )
+  // Old define
+  #define KOKKOS_COMPILER_INTEL __ICC
+#elif defined( __ECC )
+  // Very old define
+  #define KOKKOS_COMPILER_INTEL __ECC
+#endif
+
+// CRAY compiler for host code
+#if defined( _CRAYC )
+  #define KOKKOS_COMPILER_CRAYC _CRAYC
+#endif
+
+#if defined( __IBMCPP__ )
+  // IBM C++
+  #define KOKKOS_COMPILER_IBM __IBMCPP__
+#elif defined( __IBMC__ )
+  #define KOKKOS_COMPILER_IBM __IBMC__
+#endif
+
+#if defined( __APPLE_CC__ )
+  #define KOKKOS_COMPILER_APPLECC __APPLE_CC__
+#endif
+
+#if defined( __clang__ ) && !defined( KOKKOS_COMPILER_INTEL )
+  #define KOKKOS_COMPILER_CLANG __clang_major__*100+__clang_minor__*10+__clang_patchlevel__
+#endif
+
+#if !defined( __clang__ ) && !defined( KOKKOS_COMPILER_INTEL ) &&defined( __GNUC__ )
+  #define KOKKOS_COMPILER_GNU __GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__
+
+  #if ( 472 > KOKKOS_COMPILER_GNU )
+    #error "Compiling with GCC version earlier than 4.7.2 is not supported."
+  #endif
+#endif
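+
+// For example (illustrative), GCC 5.4.1 yields KOKKOS_COMPILER_GNU == 541,
+// which the check above compares against the 4.7.2 minimum ( == 472 ).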
+
+#if defined( __PGIC__ ) 
+  #define KOKKOS_COMPILER_PGI __PGIC__*100+__PGIC_MINOR__*10+__PGIC_PATCHLEVEL__
+
+  #if ( 1540 > KOKKOS_COMPILER_PGI )
+    #error "Compiling with PGI version earlier than 15.4 is not supported."
+  #endif
+#endif
+
+//#endif // #if !defined( __CUDA_ARCH__ )
+
+//----------------------------------------------------------------------------
+// Intel compiler macros
+
+#if defined( KOKKOS_COMPILER_INTEL )
+  #define KOKKOS_ENABLE_PRAGMA_UNROLL 1
+  #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
+  #define KOKKOS_ENABLE_PRAGMA_VECTOR 1
+  #if ( 1800 > KOKKOS_COMPILER_INTEL )
+    #define KOKKOS_ENABLE_PRAGMA_SIMD 1
+  #endif
+
+  #if ( __INTEL_COMPILER > 1400 )
+    #define KOKKOS_ENABLE_PRAGMA_IVDEP 1
+  #endif
+
+  #if ! defined( KOKKOS_MEMORY_ALIGNMENT )
+    #define KOKKOS_MEMORY_ALIGNMENT 64
+  #endif
+
+  #define KOKKOS_RESTRICT __restrict__
+
+  #ifndef KOKKOS_IMPL_ALIGN_PTR
+    #define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((align_value(size)))
+  #endif
+
+  #if ( 1400 > KOKKOS_COMPILER_INTEL )
+    #if ( 1300 > KOKKOS_COMPILER_INTEL )
+      #error "Compiling with Intel version earlier than 13.0 is not supported. Official minimal version is 14.0."
+    #else
+      #warning "Compiling with Intel version 13.x probably works but is not officially supported. Official minimal version is 14.0."
+    #endif
+  #endif
+
+  #if !defined( KOKKOS_ENABLE_ASM ) && !defined( _WIN32 )
+    #define KOKKOS_ENABLE_ASM 1
+  #endif
+
+  #if !defined( KOKKOS_FORCEINLINE_FUNCTION )
+    #if !defined( _WIN32 )
+      #define KOKKOS_FORCEINLINE_FUNCTION  inline __attribute__((always_inline))
+    #else
+      #define KOKKOS_FORCEINLINE_FUNCTION inline
+    #endif
+  #endif
+
+  #if defined( KOKKOS_ARCH_AVX512MIC )
+      #define KOKKOS_ENABLE_RFO_PREFETCH 1
+  #endif 
+
+  #if defined( __MIC__ )
+    // Compiling for Xeon Phi
+  #endif
+#endif
+
+//----------------------------------------------------------------------------
+// Cray compiler macros
+
+#if defined( KOKKOS_COMPILER_CRAYC )
+#endif
+
+//----------------------------------------------------------------------------
+// IBM Compiler macros
+
+#if defined( KOKKOS_COMPILER_IBM )
+  #define KOKKOS_ENABLE_PRAGMA_UNROLL 1
+  //#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
+  //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
+  //#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
+  //#define KOKKOS_ENABLE_PRAGMA_SIMD 1
+
+  #if ! defined( KOKKOS_ENABLE_ASM )
+    #define KOKKOS_ENABLE_ASM 1
+  #endif
+#endif
+
+//----------------------------------------------------------------------------
+// CLANG compiler macros
+
+#if defined( KOKKOS_COMPILER_CLANG )
+  //#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
+  //#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
+  //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
+  //#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
+  //#define KOKKOS_ENABLE_PRAGMA_SIMD 1
+
+  #if !defined( KOKKOS_FORCEINLINE_FUNCTION )
+    #define KOKKOS_FORCEINLINE_FUNCTION  inline __attribute__((always_inline))
+  #endif
+
+  #if !defined( KOKKOS_IMPL_ALIGN_PTR )
+    #define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((aligned(size)))
+  #endif
+
+#endif
+
+//----------------------------------------------------------------------------
+// GNU Compiler macros
+
+#if defined( KOKKOS_COMPILER_GNU )
+  //#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
+  //#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
+  //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
+  //#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
+  //#define KOKKOS_ENABLE_PRAGMA_SIMD 1
+
+  #if defined( KOKKOS_ARCH_AVX512MIC )
+      #define KOKKOS_ENABLE_RFO_PREFETCH 1
+  #endif
+
+  #if !defined( KOKKOS_FORCEINLINE_FUNCTION )
+    #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
+  #endif
+
+  #if !defined( KOKKOS_ENABLE_ASM ) && !defined( __PGIC__ ) && \
+      ( defined( __amd64 ) || defined( __amd64__ ) || \
+        defined( __x86_64 ) || defined( __x86_64__ ) || \
+	defined(__PPC64__) )
+    #define KOKKOS_ENABLE_ASM 1
+  #endif
+#endif
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_COMPILER_PGI )
+  #define KOKKOS_ENABLE_PRAGMA_UNROLL 1
+  #define KOKKOS_ENABLE_PRAGMA_IVDEP 1
+  //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
+  #define KOKKOS_ENABLE_PRAGMA_VECTOR 1
+  //#define KOKKOS_ENABLE_PRAGMA_SIMD 1
+#endif
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_COMPILER_NVCC )
+  #if defined( __CUDA_ARCH__ )
+    #define KOKKOS_ENABLE_PRAGMA_UNROLL 1
+  #endif
+#endif
+
+//----------------------------------------------------------------------------
+// Define function marking macros if compiler specific macros are undefined:
+
+#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
+  #define KOKKOS_FORCEINLINE_FUNCTION  inline
+#endif
+
+#if !defined( KOKKOS_INLINE_FUNCTION )
+  #define KOKKOS_INLINE_FUNCTION  inline
+#endif
+
+#if !defined( KOKKOS_FUNCTION )
+  #define KOKKOS_FUNCTION /**/
+#endif
+
+#if !defined( KOKKOS_FUNCTION_DEFAULTED )
+  #define KOKKOS_INLINE_FUNCTION_DEFAULTED inline
+  #define KOKKOS_FUNCTION_DEFAULTED /**/
+#endif
+
+//----------------------------------------------------------------------------
+// Define empty macro for restrict if necessary:
+
+#if !defined( KOKKOS_RESTRICT )
+  #define KOKKOS_RESTRICT
+#endif
+
+//----------------------------------------------------------------------------
+// Define Macro for alignment:
+
+#if ! defined( KOKKOS_MEMORY_ALIGNMENT )
+  #define KOKKOS_MEMORY_ALIGNMENT 64
+#endif
+
+#if ! defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
+  #define KOKKOS_MEMORY_ALIGNMENT_THRESHOLD 1
+#endif
+
+#if !defined( KOKKOS_IMPL_ALIGN_PTR )
+  #define KOKKOS_IMPL_ALIGN_PTR(size) /* */
+#endif
+
+//----------------------------------------------------------------------------
+// Determine the default execution space for parallel dispatch.
+// There is zero or one default execution space specified.
+
+#if 1 < ( ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
+          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM ) ? 1 : 0 ) + \
+          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET ) ? 1 : 0 ) + \
+          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
+          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
+          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS ) ? 1 : 0 ) + \
+          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
+  #error "More than one KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_* specified."
+#endif
+
+// If a default is not specified then choose from the enabled execution spaces.
+// Priority: CUDA, ROCM, OPENMPTARGET, OPENMP, THREADS, QTHREADS, SERIAL
+#if   defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM )
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
+//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
+#elif defined( KOKKOS_ENABLE_CUDA )
+  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
+#elif defined( KOKKOS_ENABLE_ROCM )
+  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM
+#elif defined( KOKKOS_ENABLE_OPENMPTARGET )
+  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET
+#elif defined( KOKKOS_ENABLE_OPENMP )
+  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
+#elif defined( KOKKOS_ENABLE_THREADS )
+  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
+//#elif defined( KOKKOS_ENABLE_QTHREADS )
+//  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
+#else
+  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
+#endif
+
+//----------------------------------------------------------------------------
+// Determine for what space the code is being compiled:
+
+#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_ENABLE_CUDA )
+  #define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
+#elif   defined( __HCC__ ) && defined( __HCC_ACCELERATOR__ ) && defined( KOKKOS_ENABLE_ROCM )
+  #define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU
+#else
+  #define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+#endif
+
+//----------------------------------------------------------------------------
+
+#if ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
+    ( defined( _XOPEN_SOURCE )   && _XOPEN_SOURCE   >= 600 )
+  #if defined( KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN )
+    #define KOKKOS_ENABLE_POSIX_MEMALIGN 1
+  #endif
+#endif
+
+//----------------------------------------------------------------------------
+// If compiling with CUDA, then CUDA 8 or better and relocatable device code
+// are required to enable the task policy (KOKKOS_ENABLE_TASKDAG).
+// nvcc relocatable device code option: --relocatable-device-code=true
+
+#if ( defined( KOKKOS_ENABLE_CUDA ) )
+  #if ( 8000 <= CUDA_VERSION ) && defined( KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE )
+  #define KOKKOS_ENABLE_TASKDAG
+  #endif
+#else
+  #define KOKKOS_ENABLE_TASKDAG
+#endif
+
+
+#if defined ( KOKKOS_ENABLE_CUDA )
+  #if ( 9000 <= CUDA_VERSION )
+  #define KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND
+  #endif
+#endif
+#endif // #ifndef KOKKOS_MACROS_HPP
+
diff --git a/packages/kokkos/core/src/Kokkos_MasterLock.hpp b/packages/kokkos/core/src/Kokkos_MasterLock.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2db22d2fdd6ce4131ca7c536b872130e8be7d0e3
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_MasterLock.hpp
@@ -0,0 +1,73 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MASTER_LOCK_HPP
+#define KOKKOS_MASTER_LOCK_HPP
+
+#include <Kokkos_Macros.hpp>
+
+namespace Kokkos { namespace Experimental {
+
+// may be used to coordinate work between master instances
+// SHOULD NOT be used within a parallel algorithm
+//
+// This lock should be used with a scoped lock guard,
+// i.e. std::unique_lock<Lock>, std::lock_guard
+//
+// cannot be copied or moved
+// has the following functions available
+//
+// Lock()
+// ~Lock()
+//
+// void lock()
+// void unlock()
+// bool try_lock()
+//
+template <typename ExecutionSpace>
+class MasterLock;
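+
+// Usage sketch (illustrative only; assumes a build providing a MasterLock
+// specialization for the chosen execution space, e.g. OpenMP):
+//
+//   using Lock = Kokkos::Experimental::MasterLock< Kokkos::OpenMP >;
+//   Lock lock ;
+//   {
+//     std::lock_guard< Lock > guard( lock );
+//     // ... work coordinated between master instances ...
+//   }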
+
+}} // namespace Kokkos::Experimental
+
+#endif //KOKKOS_MASTER_LOCK_HPP
+
diff --git a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cf80004c22bde62af420e33c6e3dbb7b051a8763
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
@@ -0,0 +1,887 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MEMORYPOOL_HPP
+#define KOKKOS_MEMORYPOOL_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_ConcurrentBitset.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_SharedAlloc.hpp>
+
+namespace Kokkos {
+namespace Impl {
+/* Report violation of size constraints:
+ *   min_block_alloc_size <= max_block_alloc_size
+ *   max_block_alloc_size <= min_superblock_size 
+ *   min_superblock_size  <= max_superblock_size
+ *   min_superblock_size  <= min_total_alloc_size
+ *   min_superblock_size  <= min_block_alloc_size * 
+ *                           max_block_per_superblock
+ */
+void memory_pool_bounds_verification
+  ( size_t min_block_alloc_size
+  , size_t max_block_alloc_size
+  , size_t min_superblock_size
+  , size_t max_superblock_size
+  , size_t max_block_per_superblock
+  , size_t min_total_alloc_size
+  );
+}
+}
+
+namespace Kokkos {
+
+template< typename DeviceType >
+class MemoryPool {
+private:
+
+  typedef typename Kokkos::Impl::concurrent_bitset CB ;
+
+  enum : uint32_t { bits_per_int_lg2  = CB::bits_per_int_lg2 };
+  enum : uint32_t { state_shift       = CB::state_shift };
+  enum : uint32_t { state_used_mask   = CB::state_used_mask };
+  enum : uint32_t { state_header_mask = CB::state_header_mask };
+  enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
+  enum : uint32_t { max_bit_count     = CB::max_bit_count };
+
+  enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
+
+  /*  Each superblock has a concurrent bitset state
+   *  which is an array of uint32_t integers.
+   *    [ { block_count_lg2  : state_shift bits
+   *      , used_block_count : ( 32 - state_shift ) bits
+   *      }
+   *    , { block allocation bit set }* ]
+   *
+   *  As superblocks are assigned (allocated) to a block size
+   *  and released (deallocated) back to empty the superblock state
+   *  is concurrently updated.
+   */
+
+  /*  Mapping between block_size <-> block_state
+   *
+   *  block_state = ( m_sb_size_lg2 - block_size_lg2 ) << state_shift
+   *  block_size_lg2 = m_sb_size_lg2 - ( block_state >> state_shift )
+   *
+   *  Thus A_block_size < B_block_size  <=>  A_block_state > B_block_state
+   */
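+
+  /*  Worked example (illustrative): with a 1 MiB superblock
+   *  ( m_sb_size_lg2 == 20 ) and 64 byte blocks ( block_size_lg2 == 6 ),
+   *  block_state == ( 20 - 6 ) << state_shift and the superblock
+   *  contains 2^14 blocks.
+   */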
+
+  typedef typename DeviceType::memory_space base_memory_space ;
+
+  enum { accessible =
+           Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace 
+                                          , base_memory_space >::accessible };
+
+  typedef Kokkos::Impl::SharedAllocationTracker Tracker ;
+  typedef Kokkos::Impl::SharedAllocationRecord
+    < base_memory_space >  Record ;
+
+  Tracker    m_tracker ;
+  uint32_t * m_sb_state_array ;
+  uint32_t   m_sb_state_size ;
+  uint32_t   m_sb_size_lg2 ;
+  uint32_t   m_max_block_size_lg2 ;
+  uint32_t   m_min_block_size_lg2 ;
+  int32_t    m_sb_count ;
+  int32_t    m_hint_offset ;   // Offset to K * #block_size array of hints
+  int32_t    m_data_offset ;   // Offset to 0th superblock data
+  int32_t    m_unused_padding ;
+
+public:
+
+  /**\brief  The maximum size of a superblock and block */
+  enum : uint32_t { max_superblock_size      = 1LU << 31 /* 2 gigabytes */ };
+  enum : uint32_t { max_block_per_superblock = max_bit_count };
+
+  //--------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  size_t capacity() const noexcept
+    { return size_t(m_sb_count) << m_sb_size_lg2 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t min_block_size() const noexcept
+    { return ( 1LU << m_min_block_size_lg2 ); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t max_block_size() const noexcept
+    { return ( 1LU << m_max_block_size_lg2 ); }
+
+  struct usage_statistics {
+    size_t capacity_bytes ;       ///<  Capacity in bytes
+    size_t superblock_bytes ;     ///<  Superblock size in bytes
+    size_t max_block_bytes ;      ///<  Maximum block size in bytes
+    size_t min_block_bytes ;      ///<  Minimum block size in bytes
+    size_t capacity_superblocks ; ///<  Number of superblocks
+    size_t consumed_superblocks ; ///<  Superblocks assigned to allocations
+    size_t consumed_blocks ;  ///<  Number of allocations
+    size_t consumed_bytes ;   ///<  Bytes allocated
+    size_t reserved_blocks ;  ///<  Unallocated blocks in assigned superblocks
+    size_t reserved_bytes ;   ///<  Unallocated bytes in assigned superblocks
+  };
+
+  void get_usage_statistics( usage_statistics & stats ) const
+    {
+      Kokkos::HostSpace host ;
+
+      const size_t alloc_size = m_hint_offset * sizeof(uint32_t);
+
+      uint32_t * const sb_state_array = 
+        accessible ? m_sb_state_array : (uint32_t *) host.allocate(alloc_size);
+
+      if ( ! accessible ) {
+        Kokkos::Impl::DeepCopy< Kokkos::HostSpace , base_memory_space >
+          ( sb_state_array , m_sb_state_array , alloc_size );
+      }
+
+      stats.superblock_bytes = ( 1LU << m_sb_size_lg2 );
+      stats.max_block_bytes  = ( 1LU << m_max_block_size_lg2 );
+      stats.min_block_bytes  = ( 1LU << m_min_block_size_lg2 );
+      stats.capacity_bytes   = stats.superblock_bytes * m_sb_count ;
+      stats.capacity_superblocks = m_sb_count ;
+      stats.consumed_superblocks = 0 ;
+      stats.consumed_blocks = 0 ;
+      stats.consumed_bytes  = 0 ;
+      stats.reserved_blocks = 0 ;
+      stats.reserved_bytes  = 0 ;
+
+      const uint32_t * sb_state_ptr = sb_state_array ;
+
+      for ( int32_t i = 0 ; i < m_sb_count
+          ; ++i , sb_state_ptr += m_sb_state_size ) {
+
+        const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ;
+
+        if ( block_count_lg2 ) {
+          const uint32_t block_count    = 1u << block_count_lg2 ;
+          const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2 ;
+          const uint32_t block_size     = 1u << block_size_lg2 ;
+          const uint32_t block_used     = (*sb_state_ptr) & state_used_mask ;
+
+          stats.consumed_superblocks++ ;
+          stats.consumed_blocks += block_used ;
+          stats.consumed_bytes  += block_used * block_size ;
+          stats.reserved_blocks += block_count - block_used ;
+          stats.reserved_bytes  += (block_count - block_used ) * block_size ;
+        }
+      }
+
+      if ( ! accessible ) {
+        host.deallocate( sb_state_array, alloc_size );
+      }
+    }
+
+  void print_state( std::ostream & s ) const
+    {
+      Kokkos::HostSpace host ;
+
+      const size_t alloc_size = m_hint_offset * sizeof(uint32_t);
+
+      uint32_t * const sb_state_array = 
+        accessible ? m_sb_state_array : (uint32_t *) host.allocate(alloc_size);
+
+      if ( ! accessible ) {
+        Kokkos::Impl::DeepCopy< Kokkos::HostSpace , base_memory_space >
+          ( sb_state_array , m_sb_state_array , alloc_size );
+      }
+
+      const uint32_t * sb_state_ptr = sb_state_array ;
+
+      s << "pool_size(" << ( size_t(m_sb_count) << m_sb_size_lg2 ) << ")"
+        << " superblock_size(" << ( 1LU << m_sb_size_lg2 ) << ")" << std::endl ;
+
+      for ( int32_t i = 0 ; i < m_sb_count
+          ; ++i , sb_state_ptr += m_sb_state_size ) {
+
+        if ( *sb_state_ptr ) {
+
+          const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ;
+          const uint32_t block_size_lg2  = m_sb_size_lg2 - block_count_lg2 ;
+          const uint32_t block_count     = 1u << block_count_lg2 ;
+          const uint32_t block_used      = (*sb_state_ptr) & state_used_mask ;
+
+          s << "Superblock[ " << i << " / " << m_sb_count << " ] {"
+            << " block_size(" << ( 1 << block_size_lg2 ) << ")"
+            << " block_count( " << block_used
+            << " / " << block_count  << " )"
+            << std::endl ;
+        }
+      }
+
+      if ( ! accessible ) {
+        host.deallocate( sb_state_array, alloc_size );
+      }
+    }
+
+  //--------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION_DEFAULTED MemoryPool( MemoryPool && ) = default ;
+  KOKKOS_INLINE_FUNCTION_DEFAULTED MemoryPool( const MemoryPool & ) = default ;
+  KOKKOS_INLINE_FUNCTION_DEFAULTED MemoryPool & operator = ( MemoryPool && ) = default ;
+  KOKKOS_INLINE_FUNCTION_DEFAULTED MemoryPool & operator = ( const MemoryPool & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION MemoryPool()
+    : m_tracker()
+    , m_sb_state_array(0)
+    , m_sb_state_size(0)
+    , m_sb_size_lg2(0)
+    , m_max_block_size_lg2(0)
+    , m_min_block_size_lg2(0)
+    , m_sb_count(0)
+    , m_hint_offset(0)
+    , m_data_offset(0)
+    , m_unused_padding(0)
+    {}
+
+  /**\brief  Allocate a memory pool from 'memspace'.
+   *
+   *  The memory pool will have at least 'min_total_alloc_size' bytes
+   *  of memory to allocate divided among superblocks of at least
+   *  'min_superblock_size' bytes.  A single allocation must fit
+   *  within a single superblock, so 'min_superblock_size' must be
+   *  at least as large as the maximum single allocation.
+   *  Both 'min_total_alloc_size' and 'min_superblock_size'
+   *  are rounded up to the smallest power-of-two value that
+   *  contains the corresponding sizes.
+   *  Individual allocations will always consume a block of memory that
+   *  is also a power-of-two.  These roundings are made to enable
+   *  significant runtime performance improvements.
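+   *
+   *  Example (illustrative): requesting min_total_alloc_size = 1000000 with
+   *  all other arguments defaulted yields one 2^20 byte superblock, a 64 byte
+   *  minimum block size, and a 4096 byte maximum block size, per the
+   *  defaulting and rounding rules above.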
+   */
+  MemoryPool( const base_memory_space & memspace
+            , const size_t min_total_alloc_size
+            , size_t min_block_alloc_size = 0
+            , size_t max_block_alloc_size = 0
+            , size_t min_superblock_size  = 0
+            )
+    : m_tracker()
+    , m_sb_state_array(0)
+    , m_sb_state_size(0)
+    , m_sb_size_lg2(0)
+    , m_max_block_size_lg2(0)
+    , m_min_block_size_lg2(0)
+    , m_sb_count(0)
+    , m_hint_offset(0)
+    , m_data_offset(0)
+    , m_unused_padding(0)
+    {
+      const uint32_t int_align_lg2   = 3 ; /* align as int[8] */
+      const uint32_t int_align_mask  = ( 1u << int_align_lg2 ) - 1 ;
+      const uint32_t default_min_block_size       = 1u << 6  ; /* 64 bytes */
+      const uint32_t default_max_block_size       = 1u << 12 ;/* 4k bytes */
+      const uint32_t default_min_superblock_size  = 1u << 20 ;/* 1M bytes */
+
+      //--------------------------------------------------
+      // Default block and superblock sizes:
+
+      if ( 0 == min_block_alloc_size ) {
+        // Default all sizes:
+
+        min_superblock_size =
+          std::min( size_t(default_min_superblock_size)
+                  , min_total_alloc_size );
+
+        min_block_alloc_size =
+          std::min( size_t(default_min_block_size)
+                  , min_superblock_size );
+
+        max_block_alloc_size =
+          std::min( size_t(default_max_block_size)
+                  , min_superblock_size );
+      }
+      else if ( 0 == min_superblock_size ) {
+
+        // Choose superblock size as minimum of:
+        //   max_block_per_superblock * min_block_size
+        //   max_superblock_size
+        //   min_total_alloc_size
+
+        const size_t max_superblock =
+          min_block_alloc_size * max_block_per_superblock ;
+
+        min_superblock_size =
+          std::min( max_superblock ,
+          std::min( size_t(max_superblock_size)
+                  , min_total_alloc_size ) );
+      }
+
+      if ( 0 == max_block_alloc_size ) {
+        max_block_alloc_size = min_superblock_size ;
+      }
+
+      //--------------------------------------------------
+
+      /* Enforce size constraints:
+       *   min_block_alloc_size <= max_block_alloc_size
+       *   max_block_alloc_size <= min_superblock_size 
+       *   min_superblock_size  <= max_superblock_size
+       *   min_superblock_size  <= min_total_alloc_size
+       *   min_superblock_size  <= min_block_alloc_size * 
+       *                           max_block_per_superblock
+       */
+
+      Kokkos::Impl::memory_pool_bounds_verification
+        ( min_block_alloc_size
+        , max_block_alloc_size
+        , min_superblock_size
+        , max_superblock_size
+        , max_block_per_superblock
+        , min_total_alloc_size
+        );
+
+      //--------------------------------------------------
+      // Block and superblock size is power of two:
+      // Maximum value is 'max_superblock_size'
+
+      m_min_block_size_lg2 =
+        Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size);
+
+      m_max_block_size_lg2 =
+        Kokkos::Impl::integral_power_of_two_that_contains(max_block_alloc_size);
+  
+      m_sb_size_lg2 =
+        Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size);
+
+      {
+        // The number of superblocks is the smallest count of superblocks
+        // whose combined size can hold min_total_alloc_size.
+
+        const uint64_t sb_size_mask = ( 1LU << m_sb_size_lg2 ) - 1 ;
+
+        m_sb_count = ( min_total_alloc_size + sb_size_mask ) >> m_sb_size_lg2 ;
+      }
+
+      {
+        // Any superblock can be assigned to the smallest size block
+        // Size the block bitset to maximum number of blocks
+
+        const uint32_t max_block_count_lg2 =
+          m_sb_size_lg2 - m_min_block_size_lg2 ;
+
+        m_sb_state_size =
+          ( CB::buffer_bound_lg2( max_block_count_lg2 ) + int_align_mask ) & ~int_align_mask ;
+      }
+
+      // Array of all superblock states
+
+      const size_t all_sb_state_size =
+        ( m_sb_count * m_sb_state_size + int_align_mask ) & ~int_align_mask ;
+
+      // Number of block sizes
+
+      const int32_t number_block_sizes =
+         1 + m_max_block_size_lg2 - m_min_block_size_lg2 ;
+
+      // Array length for possible block sizes
+      // Hint array is HINT_PER_BLOCK_SIZE uint32_t entries per block size
+
+      const int32_t block_size_array_size =
+        ( number_block_sizes + int_align_mask ) & ~int_align_mask ;
+
+      m_hint_offset = all_sb_state_size ;
+      m_data_offset = m_hint_offset +
+                      block_size_array_size * HINT_PER_BLOCK_SIZE ;
+
+      // Allocation:
+
+      const size_t header_size = m_data_offset * sizeof(uint32_t);
+      const size_t alloc_size  = header_size +
+                                 ( size_t(m_sb_count) << m_sb_size_lg2 );
+
+      Record * rec = Record::allocate( memspace , "MemoryPool" , alloc_size );
+
+      m_tracker.assign_allocated_record_to_uninitialized( rec );
+
+      m_sb_state_array = (uint32_t *) rec->data();
+
+      Kokkos::HostSpace host ;
+
+      uint32_t * const sb_state_array = 
+        accessible ? m_sb_state_array
+                   : (uint32_t *) host.allocate(header_size);
+
+      for ( int32_t i = 0 ; i < m_data_offset ; ++i ) sb_state_array[i] = 0 ;
+
+      // Initial assignment of empty superblocks to block sizes:
+
+      for ( int32_t i = 0 ; i < number_block_sizes ; ++i ) {
+        const uint32_t block_size_lg2  = i + m_min_block_size_lg2 ;
+        const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
+        const uint32_t block_state     = block_count_lg2 << state_shift ;
+        const uint32_t hint_begin = m_hint_offset + i * HINT_PER_BLOCK_SIZE ;
+
+        // for block size index 'i':
+        //   sb_id_hint  = sb_state_array[ hint_begin ];
+        //   sb_id_begin = sb_state_array[ hint_begin + 1 ];
+
+        const int32_t jbeg = ( i * m_sb_count ) / number_block_sizes ;
+        const int32_t jend = ( ( i + 1 ) * m_sb_count ) / number_block_sizes ;
+
+        sb_state_array[ hint_begin ] = uint32_t(jbeg);
+        sb_state_array[ hint_begin + 1 ] = uint32_t(jbeg);
+
+        for ( int32_t j = jbeg ; j < jend ; ++j ) {
+          sb_state_array[ j * m_sb_state_size ] = block_state ;
+        }
+      }
+
+      // Write out initialized state:
+
+      if ( ! accessible ) {
+        Kokkos::Impl::DeepCopy< base_memory_space , Kokkos::HostSpace >
+          ( m_sb_state_array , sb_state_array , header_size );
+
+        host.deallocate( sb_state_array, header_size );
+      }
+      else {
+        Kokkos::memory_fence();
+      }
+    }
+
+  //--------------------------------------------------------------------------
+
+private:
+
+  /* Given a size 'n' get the block size in which it can be allocated.
+   * Restrict lower bound to minimum block size.
+   */
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint32_t get_block_size_lg2( uint32_t n ) const noexcept
+    {
+      const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains( n );
+
+      return i < m_min_block_size_lg2 ? m_min_block_size_lg2 : i ;
+    }
+
+public:
+
+  /* Return 0 for invalid block size */
+  KOKKOS_INLINE_FUNCTION
+  uint32_t allocate_block_size( uint64_t alloc_size ) const noexcept
+    {
+      return alloc_size <= (1UL << m_max_block_size_lg2)
+           ? ( 1UL << get_block_size_lg2( uint32_t(alloc_size) ) )
+           : 0 ;
+    }
+
+  //--------------------------------------------------------------------------
+  /**\brief  Allocate a block of memory that is at least 'alloc_size'
+   *
+   *  The block of memory is aligned to the minimum block size,
+   *  which is currently 64 bytes and will never be less than 32 bytes.
+   *
+   *  If concurrent allocations and deallocations are taking place
+   *  then a single allocation attempt may fail due to lack of available space.
+   *  The allocation attempt will try up to 'attempt_limit' times.
+   */
+  KOKKOS_FUNCTION
+  void * allocate( size_t alloc_size
+                 , int32_t attempt_limit = 1 ) const noexcept
+    {
+      if ( size_t(1LU << m_max_block_size_lg2) < alloc_size ) {
+        Kokkos::abort("Kokkos MemoryPool allocation request exceeded specified maximum allocation size");
+      }
+
+      if ( 0 == alloc_size ) return (void*) 0 ;
+
+      void * p = 0 ;
+
+      const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size );
+
+      // Allocation will fit within a superblock
+      // that has block sizes ( 1 << block_size_lg2 )
+
+      const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
+      const uint32_t block_state     = block_count_lg2 << state_shift ;
+      const uint32_t block_count     = 1u << block_count_lg2 ;
+
+      // Superblock hints for this block size:
+      //   hint_sb_id_ptr[0] is the dynamically changing hint
+      //   hint_sb_id_ptr[1] is the static start point
+
+      volatile uint32_t * const hint_sb_id_ptr
+        = m_sb_state_array     /* memory pool state array */
+        + m_hint_offset        /* offset to hint portion of array */
+        + HINT_PER_BLOCK_SIZE  /* number of hints per block size */
+          * ( block_size_lg2 - m_min_block_size_lg2 ); /* block size id */
+
+      const int32_t sb_id_begin = int32_t( hint_sb_id_ptr[1] );
+
+      // Fast query clock register 'tic' to pseudo-randomize
+      // the guess for which block within a superblock should
+      // be claimed.  If not available then a search occurs.
+
+      const uint32_t block_id_hint =
+        (uint32_t)( Kokkos::Impl::clock_tic()
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
+        // Spread out potentially concurrent access
+        // by threads within a warp or thread block.
+        + ( threadIdx.x + blockDim.x * threadIdx.y )
+#endif
+        );
+
+      // expected state of superblock for allocation
+      uint32_t sb_state = block_state ;
+
+      int32_t sb_id = -1 ;
+
+      volatile uint32_t * sb_state_array = 0 ;
+
+      while ( attempt_limit ) {
+
+        int32_t hint_sb_id = -1 ;
+
+        if ( sb_id < 0 ) {
+
+          // No superblock specified, try the hint for this block size
+
+          sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );
+
+          sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
+        }
+
+        // Require:
+        //   0 <= sb_id
+        //   sb_state_array == m_sb_state_array + m_sb_state_size * sb_id
+
+        if ( sb_state == ( state_header_mask & *sb_state_array ) ) {
+
+          // This superblock state is as expected, for the moment.
+          // Attempt to claim a bit.  The attempt updates the state
+          // so have already made sure the state header is as expected.
+
+          const uint32_t count_lg2 = sb_state >> state_shift ;
+          const uint32_t mask      = ( 1u << count_lg2 ) - 1 ;
+
+          const Kokkos::pair<int,int> result =
+            CB::acquire_bounded_lg2( sb_state_array
+                                   , count_lg2
+                                   , block_id_hint & mask
+                                   , sb_state
+                                   );
+
+          // If result.first < 0 then the acquire failed, either because the
+          // superblock was full or because its state header was wrong.
+          // The state could be wrong if a deallocation raced the superblock
+          // to empty before the acquire could succeed.
+
+          if ( 0 <= result.first ) { // acquired a bit
+
+            const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ;
+
+            // Set the allocated block pointer
+
+            p = ((char*)( m_sb_state_array + m_data_offset ))
+              + ( uint64_t(sb_id) << m_sb_size_lg2 ) // superblock memory
+              + ( uint64_t(result.first) << size_lg2 ); // block memory
+
+#if 0
+  printf( "  MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
+        , (uintptr_t)m_sb_state_array
+        , (uintptr_t)p
+        , alloc_size
+        , sb_id
+        , sb_state 
+        , (1u << size_lg2)
+        , (1u << count_lg2)
+        , result.first 
+        , result.second );
+#endif
+
+            break ; // Success
+          }
+        }
+        //------------------------------------------------------------------
+        //  Arrive here if failed to acquire a block.
+        //  Must find a new superblock.
+
+        //  Start searching at designated index for this block size.
+        //  Look for superblock that, in preferential order,
+        //  1) part-full superblock of this block size
+        //  2) empty superblock to claim for this block size
+        //  3) part-full superblock of the next larger block size
+
+        sb_state = block_state ; // Expect to find the desired state
+        sb_id = -1 ;
+
+        bool update_hint = false ;
+        int32_t sb_id_empty = -1 ;
+        int32_t sb_id_large = -1 ;
+        uint32_t sb_state_large = 0 ;
+
+        sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;
+
+        for ( int32_t i = 0 , id = sb_id_begin ; i < m_sb_count ; ++i ) {
+
+          //  Query state of the candidate superblock.
+          //  Note that the state may change at any moment
+          //  as concurrent allocations and deallocations occur.
+          
+          const uint32_t full_state = *sb_state_array ;
+          const uint32_t used       = full_state & state_used_mask ;
+          const uint32_t state      = full_state & state_header_mask ;
+
+          if ( state == block_state ) {
+
+            //  Superblock is assigned to this block size
+
+            if ( used < block_count ) {
+
+              // There is room to allocate one block
+
+              sb_id = id ;
+
+              // Is there room to allocate more than one block?
+
+              update_hint = used + 1 < block_count ;
+
+              break ;
+            }
+          }
+          else if ( 0 == used ) {
+
+            // Superblock is empty
+
+            if ( -1 == sb_id_empty ) {
+
+              // Superblock is not assigned to this block size
+              // and is the first empty superblock encountered.
+              // Save this id to use if a partfull superblock is not found.
+
+              sb_id_empty = id ;
+            }
+          }
+          else if ( ( -1 == sb_id_empty /* have not found an empty */ ) &&
+                    ( -1 == sb_id_large /* have not found a larger */ ) &&
+                    ( state < block_state /* a larger block */ ) &&
+                    // is not full:
+                    ( used < ( 1u << ( state >> state_shift ) ) ) ) {
+            //  First superblock encountered that is
+            //  larger than this block size and
+            //  has room for an allocation.
+            //  Save this id to use if a partfull or empty superblock is not found.
+            sb_id_large    = id ;
+            sb_state_large = state ;
+          }
+
+          // Iterate around the superblock array:
+
+          if ( ++id < m_sb_count ) {
+            sb_state_array += m_sb_state_size ;
+          }
+          else {
+            id = 0 ;
+            sb_state_array = m_sb_state_array ;
+          }
+        }
+
+ // printf("  search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);
+
+        if ( sb_id < 0 ) {
+
+          //  Did not find a partfull superblock for this block size.
+
+          if ( 0 <= sb_id_empty ) {
+
+            //  Found first empty superblock following designated superblock
+            //  Attempt to claim it for this block size.
+            //  If the claim fails assume that another thread claimed it
+            //  for this block size and try to use it anyway,
+            //  but do not update hint.
+
+            sb_id = sb_id_empty ;
+
+            sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
+
+            //  If successfully changed assignment of empty superblock 'sb_id'
+            //  to this block_size then update the hint.
+
+            const uint32_t state_empty = state_header_mask & *sb_state_array ;
+
+            // If this thread claims the empty block then update the hint
+            update_hint =
+              state_empty ==
+                Kokkos::atomic_compare_exchange
+                  (sb_state_array,state_empty,block_state);
+          }
+          else if ( 0 <= sb_id_large ) {
+
+            // Found a larger superblock with space available
+
+            sb_id    = sb_id_large ;
+            sb_state = sb_state_large ;
+
+            sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
+          }
+          else {
+            // Did not find a potentially usable superblock
+            --attempt_limit ;
+          }
+        }
+
+        if ( update_hint ) {
+          Kokkos::atomic_compare_exchange
+            ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
+        }
+      } // end allocation attempt loop
+      //--------------------------------------------------------------------
+
+      return p ;
+    }
+  // end allocate
+  //--------------------------------------------------------------------------
+
+  /**\brief  Return an allocated block of memory to the pool.
+   *
+   *  Requires: p is return value from allocate( alloc_size );
+   *
+   *  For now the alloc_size is ignored.
+   */
+  KOKKOS_INLINE_FUNCTION
+  void deallocate( void * p , size_t /* alloc_size */ ) const noexcept
+    {
+      if ( 0 == p ) return ;
+
+      // Determine which superblock and block
+      const ptrdiff_t d =
+        ((char*)p) - ((char*)( m_sb_state_array + m_data_offset ));
+
+      // Verify contained within the memory pool's superblocks:
+      const int ok_contains =
+        ( 0 <= d ) && ( size_t(d) < ( size_t(m_sb_count) << m_sb_size_lg2 ) );
+
+      int ok_block_aligned = 0 ;
+      int ok_dealloc_once  = 0 ;
+
+      if ( ok_contains ) {
+
+        const int sb_id = d >> m_sb_size_lg2 ;
+
+        // State array for the superblock.
+        volatile uint32_t * const sb_state_array =
+          m_sb_state_array + ( sb_id * m_sb_state_size );
+
+        const uint32_t block_state    = (*sb_state_array) & state_header_mask ;
+        const uint32_t block_size_lg2 =
+          m_sb_size_lg2 - ( block_state >> state_shift );
+
+        ok_block_aligned = 0 == ( d & ( ( 1UL << block_size_lg2 ) - 1 ) );
+
+        if ( ok_block_aligned ) {
+
+          // Map address to block's bit
+          // mask into superblock and then shift down for block index
+
+          const uint32_t bit =
+            ( d & ( ptrdiff_t( 1LU << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ;
+
+          const int result =
+            CB::release( sb_state_array , bit , block_state );
+
+          ok_dealloc_once = 0 <= result ;
+
+#if 0
+  printf( "  MemoryPool(0x%lx) pointer(0x%lx) deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
+        , (uintptr_t)m_sb_state_array
+        , (uintptr_t)p
+        , sb_id
+        , (1u << block_size_lg2)
+        , (1u << (m_sb_size_lg2 - block_size_lg2))
+        , bit
+        , result );
+#endif
+        }
+      }
+
+      if ( ! ok_contains || ! ok_block_aligned || ! ok_dealloc_once ) {
+#if 0
+  printf( "  MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n"
+        , (uintptr_t)m_sb_state_array
+        , (uintptr_t)p
+        , int(ok_contains)
+        , int(ok_block_aligned)
+        , int(ok_dealloc_once) );
+#endif
+        Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer");
+      }
+    }
+  // end deallocate
+  //--------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  int number_of_superblocks() const noexcept { return m_sb_count ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void superblock_state( int sb_id
+                       , int & block_size
+                       , int & block_count_capacity
+                       , int & block_count_used ) const noexcept
+    {
+      block_size           = 0 ;
+      block_count_capacity = 0 ;
+      block_count_used     = 0 ;
+
+      if ( Kokkos::Impl::MemorySpaceAccess
+             < Kokkos::Impl::ActiveExecutionMemorySpace
+             , base_memory_space >::accessible ) {
+       // Can access the state array
+       
+        const uint32_t state =
+          ((uint32_t volatile *)m_sb_state_array)[sb_id*m_sb_state_size];
+
+        const uint32_t block_count_lg2 = state >> state_shift ;
+        const uint32_t block_used      = state & state_used_mask ;
+
+        block_size           = 1LU << ( m_sb_size_lg2 - block_count_lg2 );
+        block_count_capacity = 1LU << block_count_lg2 ;
+        block_count_used     = block_used ;
+      }
+    }
+};
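+
+// Usage sketch (illustrative only, names hypothetical):
+//
+//   using Space = Kokkos::DefaultExecutionSpace ;
+//   Kokkos::MemoryPool< Space > pool( Space::memory_space() , 1 << 26 );
+//
+//   // Within a kernel body:
+//   //   void * p = pool.allocate( 128 );
+//   //   ...
+//   //   pool.deallocate( p , 128 );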
+
+} // namespace Kokkos 
+
+#endif /* #ifndef KOKKOS_MEMORYPOOL_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp b/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..eebc83cf3ddc12a745f7aa273321add249716d2a
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp
@@ -0,0 +1,120 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MEMORYTRAITS_HPP
+#define KOKKOS_MEMORYTRAITS_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Memory access traits for views, an extension point.
+ *
+ *  These traits should be orthogonal.  If there are dependencies then
+ *  the MemoryTraits template must detect and enforce dependencies.
+ *
+ *  A zero value is the default for a View, indicating that none of
+ *  these traits are present.
+ */
+enum MemoryTraitsFlags
+  { Unmanaged  = 0x01
+  , RandomAccess = 0x02
+  , Atomic = 0x04
+  , Restrict = 0x08
+  , Aligned = 0x10
+  };
+
+template < unsigned T >
+struct MemoryTraits {
+  //! Tag this class as a kokkos memory traits:
+  typedef MemoryTraits memory_traits ;
+
+  enum : bool { Unmanaged    = (unsigned(0) != (T & unsigned(Kokkos::Unmanaged))) };
+  enum : bool { RandomAccess = (unsigned(0) != (T & unsigned(Kokkos::RandomAccess))) };
+  enum : bool { Atomic       = (unsigned(0) != (T & unsigned(Kokkos::Atomic))) };
+  enum : bool { Restrict     = (unsigned(0) != (T & unsigned(Kokkos::Restrict))) };
+  enum : bool { Aligned      = (unsigned(0) != (T & unsigned(Kokkos::Aligned))) };
+
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+typedef Kokkos::MemoryTraits<0> MemoryManaged ;
+typedef Kokkos::MemoryTraits< Kokkos::Unmanaged > MemoryUnmanaged ;
+typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomAccess > MemoryRandomAccess ;
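+
+// Usage sketch (illustrative only): memory traits are supplied as the
+// optional traits argument of a View, e.g.
+//
+//   Kokkos::View< const double * , Kokkos::MemoryTraits< Kokkos::RandomAccess > > x ;
+//   Kokkos::View< double * , Kokkos::MemoryUnmanaged > y( raw_ptr , n );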
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+static_assert(
+  ( 0 < int(KOKKOS_MEMORY_ALIGNMENT) ) &&
+  ( 0 == ( int(KOKKOS_MEMORY_ALIGNMENT) & (int(KOKKOS_MEMORY_ALIGNMENT)-1))) ,
+  "KOKKOS_MEMORY_ALIGNMENT must be a power of two" );
+
+/** \brief Memory alignment settings
+ *
+ *  Sets the global value for memory alignment.  Must be a power of two!
+ *  Enables compatibility of views from different devices with static stride.
+ *  Use a compiler flag to override the default.
+ */
+enum : unsigned
+  { MEMORY_ALIGNMENT           = KOKKOS_MEMORY_ALIGNMENT
+  , MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD
+  };
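+
+// For example (an illustrative assumption about the build setup): the default
+// can be overridden at compile time, e.g. with -DKOKKOS_MEMORY_ALIGNMENT=64.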
+
+} //namespace Impl
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_MEMORYTRAITS_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..baac78254526d9fc375c05be3ae468c6d2171006
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp
@@ -0,0 +1,219 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_NUMERICTRAITS_HPP
+#define KOKKOS_NUMERICTRAITS_HPP
+
+#include<climits>
+#include<cfloat>
+
+namespace Kokkos {
+
+template<class T>
+struct reduction_identity; /*{
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T sum() { return T(); }  // 0
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T prod()  // 1
+    { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom prod reduction type"); return T(); }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T max()   // minimum value
+    { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom max reduction type"); return T(); }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T min()   // maximum value
+    { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom min reduction type"); return T(); }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T bor()   // 0, only for integer type
+    { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom bor reduction type"); return T(); }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T band()  // !0, only for integer type
+    { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom band reduction type"); return T(); }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T lor()   // 0, only for integer type
+    { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom lor reduction type"); return T(); }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static T land()  // !0, only for integer type
+    { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom land reduction type"); return T(); }
+};*/
+
+template<>
+struct reduction_identity<signed char> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char sum()  {return static_cast<signed char>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char prod() {return static_cast<signed char>(1);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char max()  {return SCHAR_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char min()  {return SCHAR_MAX;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char bor()  {return static_cast<signed char>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char band() {return ~static_cast<signed char>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char lor()  {return static_cast<signed char>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char land() {return static_cast<signed char>(1);}
+};
+
+template<>
+struct reduction_identity<short> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short sum()  {return static_cast<short>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short prod() {return static_cast<short>(1);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short max()  {return SHRT_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short min()  {return SHRT_MAX;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short bor()  {return static_cast<short>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short band() {return ~static_cast<short>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short lor()  {return static_cast<short>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static short land() {return static_cast<short>(1);}
+};
+
+template<>
+struct reduction_identity<int> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int sum()  {return static_cast<int>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int prod() {return static_cast<int>(1);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int max()  {return INT_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int min()  {return INT_MAX;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int bor()  {return static_cast<int>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int band() {return ~static_cast<int>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int lor()  {return static_cast<int>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static int land() {return static_cast<int>(1);}
+};
+
+template<>
+struct reduction_identity<long> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long sum()  {return static_cast<long>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long prod() {return static_cast<long>(1);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long max()  {return LONG_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long min()  {return LONG_MAX;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long bor()  {return static_cast<long>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long band() {return ~static_cast<long>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long lor()  {return static_cast<long>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long land() {return static_cast<long>(1);}
+};
+
+template<>
+struct reduction_identity<long long> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long sum()  {return static_cast<long long>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long prod() {return static_cast<long long>(1);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long max()  {return LLONG_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long min()  {return LLONG_MAX;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long bor()  {return static_cast<long long>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long band() {return ~static_cast<long long>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long lor()  {return static_cast<long long>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long long land() {return static_cast<long long>(1);}
+};
+
+template<>
+struct reduction_identity<unsigned char> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char sum()  {return static_cast<unsigned char>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char prod() {return static_cast<unsigned char>(1);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char max()  {return static_cast<unsigned char>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char min()  {return UCHAR_MAX;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char bor()  {return static_cast<unsigned char>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char band() {return ~static_cast<unsigned char>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char lor()  {return static_cast<unsigned char>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char land() {return static_cast<unsigned char>(1);}
+};
+
+template<>
+struct reduction_identity<unsigned short> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short sum()  {return static_cast<unsigned short>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short prod() {return static_cast<unsigned short>(1);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short max()  {return static_cast<unsigned short>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short min()  {return USHRT_MAX;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short bor()  {return static_cast<unsigned short>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short band() {return ~static_cast<unsigned short>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short lor()  {return static_cast<unsigned short>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short land() {return static_cast<unsigned short>(1);}
+};
+
+template<>
+struct reduction_identity<unsigned int> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int sum()  {return static_cast<unsigned int>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int prod() {return static_cast<unsigned int>(1);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int max()  {return static_cast<unsigned int>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int min()  {return UINT_MAX;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int bor()  {return static_cast<unsigned int>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int band() {return ~static_cast<unsigned int>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int lor()  {return static_cast<unsigned int>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int land() {return static_cast<unsigned int>(1);}
+};
+
+template<>
+struct reduction_identity<unsigned long> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long sum()  {return static_cast<unsigned long>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long prod() {return static_cast<unsigned long>(1);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long max()  {return static_cast<unsigned long>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long min()  {return ULONG_MAX;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long bor()  {return static_cast<unsigned long>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long band() {return ~static_cast<unsigned long>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long lor()  {return static_cast<unsigned long>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long land() {return static_cast<unsigned long>(1);}
+};
+
+template<>
+struct reduction_identity<unsigned long long> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long sum()  {return static_cast<unsigned long long>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long prod() {return static_cast<unsigned long long>(1);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long max()  {return static_cast<unsigned long long>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long min()  {return ULLONG_MAX;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long bor()  {return static_cast<unsigned long long>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long band() {return ~static_cast<unsigned long long>(0x0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long lor()  {return static_cast<unsigned long long>(0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long land() {return static_cast<unsigned long long>(1);}
+};
+
+template<>
+struct reduction_identity<float> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum()  {return static_cast<float>(0.0f);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast<float>(1.0f);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max()  {return -FLT_MAX;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float min()  {return FLT_MAX;}
+};
+
+template<>
+struct reduction_identity<double> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum()  {return static_cast<double>(0.0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast<double>(1.0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static double max()  {return -DBL_MAX;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static double min()  {return DBL_MAX;}
+};
+
+#if !defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
+template<>
+struct reduction_identity<long double> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum()  {return static_cast<long double>(0.0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast<long double>(1.0);}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max()  {return -LDBL_MAX;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min()  {return LDBL_MAX;}
+};
+#endif
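+
+// A hedged usage sketch (illustrative only): code performing a custom reduction
+// typically seeds its result with the matching identity, e.g.
+//
+//   double result = Kokkos::reduction_identity<double>::max();  // == -DBL_MAX
+//   // ... then combines contributions, e.g. result = fmax( result, value ); ...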
+
+}
+
+#endif
diff --git a/packages/kokkos/core/src/Kokkos_OpenMP.hpp b/packages/kokkos/core/src/Kokkos_OpenMP.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..769252e8b98c6bad942413a6d9837e15ce6572c7
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_OpenMP.hpp
@@ -0,0 +1,252 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMP_HPP
+#define KOKKOS_OPENMP_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_OPENMP )
+
+#include <Kokkos_Core_fwd.hpp>
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_HostSpace.hpp>
+
+#ifdef KOKKOS_ENABLE_HBWSPACE
+#include <Kokkos_HBWSpace.hpp>
+#endif
+
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_TaskScheduler.hpp>
+#include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+#include <vector>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+namespace Impl {
+class OpenMPExec;
+}
+
+/// \class OpenMP
+/// \brief Kokkos device for multicore processors in the host memory space.
+class OpenMP {
+public:
+  //! Tag this class as a kokkos execution space
+  using execution_space = OpenMP;
+
+  using memory_space =
+  #ifdef KOKKOS_ENABLE_HBWSPACE
+    Experimental::HBWSpace;
+  #else
+    HostSpace;
+  #endif
+
+  //! This execution space preferred device_type
+  using device_type          = Kokkos::Device< execution_space, memory_space >;
+  using array_layout         = LayoutRight;
+  using size_type            = memory_space::size_type;
+  using scratch_memory_space = ScratchMemorySpace< OpenMP >;
+
+  /// \brief Get a handle to the default execution space instance
+  inline
+  OpenMP() noexcept;
+
+  // Using omp_get_max_threads() is problematic.
+  // On Intel compilers, an initial call into the OpenMP runtime without a
+  // preceding parallel region sets a process mask for a single core.  On
+  // entering the first parallel region the runtime then binds the threads of
+  // that region to other cores and makes the process mask the aggregate of
+  // the thread masks.  The intent seems to be to make serial code run fast
+  // if you compile with OpenMP enabled but never actually use parallel regions.
+  // static int omp_max_threads = omp_get_max_threads();
+  static int get_current_max_threads() noexcept;
+
+  /// \brief Initialize the default execution space
+  ///
+  /// if ( thread_count == -1 )
+  ///   then use the number of threads that openmp defaults to
+  /// if ( thread_count == 0 && Kokkos::hwloc_available() )
+  ///   then use hwloc to choose the number of threads and change
+  ///   the default number of threads
+  /// if ( thread_count > 0 )
+  ///   then force openmp to use the given number of threads and change
+  ///   the default number of threads
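+  ///
+  /// A hedged example (illustrative only):
+  /// \code
+  ///   Kokkos::OpenMP::initialize( 8 );   // force eight threads
+  ///   // ... dispatch parallel work ...
+  ///   Kokkos::OpenMP::finalize();
+  /// \endcode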
+  static void initialize( int thread_count = -1 );
+
+  /// \brief Free any resources being consumed by the default execution space
+  static void finalize();
+
+  /// \brief is the default execution space initialized for current 'master' thread
+  static bool is_initialized() noexcept;
+
+  /// \brief Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool verbose = false );
+
+  /// \brief is the instance running a parallel algorithm
+  inline
+  static bool in_parallel( OpenMP const& = OpenMP() ) noexcept;
+
+  /// \brief Wait until all dispatched functors complete on the given instance
+  ///
+  ///  This is a no-op on OpenMP
+  inline
+  static void fence( OpenMP const& = OpenMP() ) noexcept;
+
+  /// \brief Does the given instance return immediately after launching
+  /// a parallel algorithm
+  ///
+  /// This always returns false on OpenMP
+  inline
+  static bool is_asynchronous( OpenMP const& = OpenMP() ) noexcept;
+
+
+  /// \brief Partition the default instance into new instances without creating
+  ///  new masters
+  ///
+  /// This is a no-op on OpenMP since the default instance cannot be partitioned
+  /// without promoting other threads to 'master'
+  static std::vector<OpenMP> partition(...);
+
+  /// Non-default instances should be ref-counted so that when the last
+  /// is destroyed the instance resources are released
+  ///
+  /// This is a no-op on OpenMP since a non default instance cannot be created
+  static OpenMP create_instance(...);
+
+  /// \brief Partition the default instance and call 'f' on each new 'master' thread
+  ///
+  /// 'f' is a functor with the following signature:
+  ///   void( int partition_id, int num_partitions )
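+  ///
+  /// A hedged usage sketch (illustrative only):
+  /// \code
+  ///   Kokkos::OpenMP::partition_master(
+  ///     []( int partition_id, int num_partitions ) {
+  ///       // work executed by the 'master' thread of each partition
+  ///     }, /*requested_num_partitions=*/ 2 );
+  /// \endcode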
+  template <typename F>
+  static void partition_master( F const& f
+                              , int requested_num_partitions = 0
+                              , int requested_partition_size = 0
+                              );
+
+  inline
+  static int thread_pool_size() noexcept;
+
+  /** \brief  The rank of the executing thread in this thread pool */
+  KOKKOS_INLINE_FUNCTION
+  static int thread_pool_rank() noexcept;
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+  /// \brief Initialize the default execution space
+  static void initialize( int thread_count,
+                          int use_numa_count,
+                          int use_cores_per_numa = 0);
+
+  inline
+  static int thread_pool_size( int depth );
+
+  static void sleep() {};
+  static void wake() {};
+
+  // use UniqueToken
+  static int concurrency();
+
+  // use UniqueToken
+  inline
+  static int max_hardware_threads() noexcept;
+
+  // use UniqueToken
+  KOKKOS_INLINE_FUNCTION
+  static int hardware_thread_id() noexcept;
+#endif
+
+  static constexpr const char* name() noexcept { return "OpenMP"; }
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct MemorySpaceAccess
+  < Kokkos::OpenMP::memory_space
+  , Kokkos::OpenMP::scratch_memory_space
+  >
+{
+  enum { assignable = false };
+  enum { accessible = true };
+  enum { deepcopy   = false };
+};
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::OpenMP::memory_space
+  , Kokkos::OpenMP::scratch_memory_space
+  >
+{
+  enum { value = true };
+  inline static void verify( void ) { }
+  inline static void verify( const void * ) { }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
+#include <OpenMP/Kokkos_OpenMP_Team.hpp>
+#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
+#include <OpenMP/Kokkos_OpenMP_Task.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+/*--------------------------------------------------------------------------*/
+
+#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) */
+#endif /* #ifndef KOKKOS_OPENMP_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp b/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e2dd249ff7897cf1dea0c58ca1e10307845dfa64
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
@@ -0,0 +1,186 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMPTARGET_HPP
+#define KOKKOS_OPENMPTARGET_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( _OPENMP )
+
+#include <omp.h>
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_OpenMPTargetSpace.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_TaskPolicy.hpp>
+#include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Experimental {
+/// \class OpenMPTarget
+/// \brief Kokkos execution space for OpenMP target offload devices.
+class OpenMPTarget {
+public:
+  //------------------------------------
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+
+  //! Tag this class as a kokkos execution space
+  typedef OpenMPTarget                execution_space ;
+  typedef OpenMPTargetSpace           memory_space ;
+  //! This execution space preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  typedef LayoutLeft           array_layout ;
+  typedef memory_space::size_type  size_type ;
+
+  typedef ScratchMemorySpace< OpenMPTarget > scratch_memory_space ;
+
+  //@}
+  //------------------------------------
+  //! \name Functions that all Kokkos execution spaces must implement.
+  //@{
+
+  inline static bool in_parallel() { return omp_in_parallel(); }
+
+  /** \brief  Set the device in a "sleep" state. A noop for OpenMPTarget.  */
+  static bool sleep();
+
+  /** \brief Wake the device from the 'sleep' state. A noop for OpenMPTarget. */
+  static bool wake();
+
+  /** \brief Wait until all dispatched functors complete. A noop for OpenMPTarget. */
+  static void fence() {}
+
+  /// \brief Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  /// \brief Free any resources being consumed by the device.
+  static void finalize();
+
+  /** \brief  Initialize the device.
+   *
+   *  1) If the hardware locality library is enabled and OpenMPTarget has not
+   *     already bound threads then bind OpenMPTarget threads to maximize
+   *     core utilization and group for memory hierarchy locality.
+   *
+   *  2) Allocate a HostThread for each OpenMPTarget thread to hold its
+   *     topology and fan in/out data.
+   */
+  static void initialize( unsigned thread_count = 0 ,
+                          unsigned use_numa_count = 0 ,
+                          unsigned use_cores_per_numa = 0 );
+
+  static int is_initialized();
+
+  /** \brief  Return the maximum amount of concurrency.  */
+  static int concurrency();
+
+  //@}
+  //------------------------------------
+  /** \brief  This execution space has a topological thread pool which can be queried.
+   *
+   *  All threads within a pool have a common memory space for which they are cache coherent.
+   *    depth = 0  gives the number of threads in the whole pool.
+   *    depth = 1  gives the number of threads in a NUMA region, typically sharing L3 cache.
+   *    depth = 2  gives the number of threads at the finest granularity, typically sharing L1 cache.
+   */
+  inline static int thread_pool_size( int depth = 0 );
+
+  /** \brief  The rank of the executing thread in this thread pool */
+  KOKKOS_INLINE_FUNCTION static int thread_pool_rank();
+
+  //------------------------------------
+
+  inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
+
+  KOKKOS_INLINE_FUNCTION static
+  unsigned hardware_thread_id() { return thread_pool_rank(); }
+
+  static const char* name();
+private:
+  static bool m_is_initialized;
+};
+} // namespace Experimental
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::Experimental::OpenMPTarget::memory_space
+  , Kokkos::Experimental::OpenMPTarget::scratch_memory_space
+  >
+{
+  enum { value = true };
+  inline static void verify( void ) { }
+  inline static void verify( const void * ) { }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( _OPENMP ) */
+#endif /* #ifndef KOKKOS_OPENMPTARGET_HPP */
+
+
diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f2491900ff6c5448b34f27986542669a36474e6c
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
@@ -0,0 +1,265 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMPTARGETSPACE_HPP
+#define KOKKOS_OPENMPTARGETSPACE_HPP
+
+#include <cstring>
+#include <string>
+#include <iosfwd>
+#include <typeinfo>
+
+#include <Kokkos_Core_fwd.hpp>
+
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+
+#include <Kokkos_HostSpace.hpp>
+#include <omp.h>
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+/// \brief Initialize lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function initializes the locks to zero (unset).
+//void init_lock_array_host_space();
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+//bool lock_address_host_space(void* ptr);
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully acquiring a lock with
+/// lock_address.
+//void unlock_address_host_space(void* ptr);
+
+} // namespace Impl
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+
+/// \class OpenMPTargetSpace
+/// \brief Memory management for OpenMP target device memory.
+///
+/// OpenMPTargetSpace is a memory space that governs memory accessible to
+/// the OpenMP target offload device.
+class OpenMPTargetSpace {
+public:
+
+  //! Tag this class as a kokkos memory space
+  typedef OpenMPTargetSpace  memory_space ;
+  typedef size_t     size_type ;
+
+  /// \typedef execution_space
+  /// \brief Default execution space for this memory space.
+  ///
+  /// Every memory space has a default execution space.  This is
+  /// useful for things like initializing a View (which happens in
+  /// parallel using the View's default execution space).
+  typedef Kokkos::Experimental::OpenMPTarget   execution_space ;
+
+  //! This memory space preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  /*--------------------------------*/
+
+  /**\brief  Default memory space instance */
+  OpenMPTargetSpace();
+  OpenMPTargetSpace( OpenMPTargetSpace && rhs ) = default ;
+  OpenMPTargetSpace( const OpenMPTargetSpace & rhs ) = default ;
+  OpenMPTargetSpace & operator = ( OpenMPTargetSpace && ) = default ;
+  OpenMPTargetSpace & operator = ( const OpenMPTargetSpace & ) = default ;
+  ~OpenMPTargetSpace() = default ;
+
+  /**\brief  Allocate untracked memory in the space */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate untracked memory in the space */
+  void deallocate( void * const arg_alloc_ptr 
+                 , const size_t arg_alloc_size ) const ;
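+
+  // A hedged usage sketch (illustrative only; 'nbytes' is a placeholder):
+  //   Kokkos::Experimental::OpenMPTargetSpace space;
+  //   void * ptr = space.allocate( nbytes );
+  //   /* ... use ptr in target regions ... */
+  //   space.deallocate( ptr, nbytes );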
+
+private:
+
+  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void > ;
+};
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+class SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+
+  friend Kokkos::Experimental::OpenMPTargetSpace ;
+
+  typedef SharedAllocationRecord< void , void >  RecordBase ;
+
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  static void deallocate( RecordBase * );
+
+  /**\brief  Root record for tracked allocations from this OpenMPTargetSpace instance */
+  static RecordBase s_root_record ;
+
+  const Kokkos::Experimental::OpenMPTargetSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() = default ;
+
+  SharedAllocationRecord( const Kokkos::Experimental::OpenMPTargetSpace        & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+
+  std::string get_label() const;
+
+  KOKKOS_INLINE_FUNCTION static
+  SharedAllocationRecord * allocate( const Kokkos::Experimental::OpenMPTargetSpace &  arg_space
+                                   , const std::string       &  arg_label
+                                   , const size_t               arg_alloc_size
+                                   );
+
+  /**\brief  Allocate tracked memory in the space */
+  static
+  void * allocate_tracked( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
+                         , const std::string & arg_label
+                         , const size_t arg_alloc_size );
+
+  /**\brief  Reallocate tracked memory in the space */
+  static
+  void * reallocate_tracked( void * const arg_alloc_ptr
+                           , const size_t arg_alloc_size );
+
+  /**\brief  Deallocate tracked memory in the space */
+  static
+  void deallocate_tracked( void * const arg_alloc_ptr );
+
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  static void print_records( std::ostream & , const Kokkos::Experimental::OpenMPTargetSpace & , bool detail = false );
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//TODO: implement all possible deep_copies
+template<class ExecutionSpace>
+struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace,Kokkos::Experimental::OpenMPTargetSpace,ExecutionSpace> {
+  DeepCopy( void * dst , const void * src , size_t n ) {
+    omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_default_device());
+  }
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
+    exec.fence();
+    omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_default_device());
+  }
+};
+
+
+template<class ExecutionSpace>
+struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace,HostSpace,ExecutionSpace> {
+  DeepCopy( void * dst , const void * src , size_t n ) {
+    omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_initial_device());
+  }
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
+    exec.fence();
+    omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_initial_device());
+  }
+};
+
+template<class ExecutionSpace>
+struct DeepCopy<HostSpace,Kokkos::Experimental::OpenMPTargetSpace,ExecutionSpace> {
+  DeepCopy( void * dst , const void * src , size_t n ) {
+    omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_initial_device(), omp_get_default_device());
+  }
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
+    exec.fence();
+    omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_initial_device(), omp_get_default_device());
+  }
+};
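+
+// A hedged note (illustrative only): these specializations back Kokkos::deep_copy
+// between OpenMPTarget-allocated views and host views, e.g.
+//   Kokkos::deep_copy( host_mirror, device_view );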
+
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::OpenMPTargetSpace >
+{
+  enum { value = false };
+  inline static void verify( void ) { }
+  inline static void verify( const void * ) { }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif
+#endif /* #ifndef KOKKOS_OPENMPTARGETSPACE_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_Pair.hpp b/packages/kokkos/core/src/Kokkos_Pair.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..eaa85122f701bb95dac056a452a7fb08551aa08f
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Pair.hpp
@@ -0,0 +1,527 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+/// \file Kokkos_Pair.hpp
+/// \brief Declaration and definition of Kokkos::pair.
+///
+/// This header file declares and defines Kokkos::pair and its related
+/// nonmember functions.
+
+#ifndef KOKKOS_PAIR_HPP
+#define KOKKOS_PAIR_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <utility>
+
+namespace Kokkos {
+/// \struct pair
+/// \brief Replacement for std::pair that works on CUDA devices.
+///
+/// The instance methods of std::pair, including its constructors, are
+/// not marked as <tt>__device__</tt> functions.  Thus, they cannot be
+/// called on a CUDA device, such as an NVIDIA GPU.  This struct
+/// implements the same interface as std::pair, but can be used on a
+/// CUDA device as well as on the host.
+template <class T1, class T2>
+struct pair
+{
+  //! The first template parameter of this class.
+  typedef T1 first_type;
+  //! The second template parameter of this class.
+  typedef T2 second_type;
+
+  //! The first element of the pair.
+  first_type  first;
+  //! The second element of the pair.
+  second_type second;
+
+  /// \brief Default constructor.
+  ///
+  /// This calls the default constructors of T1 and T2.  It won't
+  /// compile if those default constructors are not defined and
+  /// public.
+  KOKKOS_FUNCTION_DEFAULTED constexpr
+  pair() = default ;
+
+  /// \brief Constructor that takes both elements of the pair.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair(first_type const& f, second_type const& s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Copy constructor.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Copy constructor.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair( const volatile pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Assignment operator.
+  ///
+  /// This calls the assignment operators of T1 and T2.  It won't
+  /// compile if the assignment operators are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<T1, T2> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+
+  /// \brief Assignment operator, for volatile <tt>*this</tt>.
+  ///
+  /// \param p [in] Input; right-hand side of the assignment.
+  ///
+  /// This calls the assignment operators of T1 and T2.  It will not
+  /// compile if the assignment operators are not defined and public.
+  ///
+  /// This operator returns \c void instead of <tt>volatile pair<T1,
+  /// T2>& </tt>.  See Kokkos Issue #177 for the explanation.  In
+  /// practice, this means that you should not chain assignments with
+  /// volatile lvalues.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator=(const volatile pair<U,V> &p) volatile
+  {
+    first = p.first;
+    second = p.second;
+    // We deliberately do not return anything here.  See explanation
+    // in public documentation above.
+  }
+
+  // from std::pair<U,V>
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Return the std::pair version of this object.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+template <class T1, class T2>
+struct pair<T1&, T2&>
+{
+  //! The first template parameter of this class.
+  typedef T1& first_type;
+  //! The second template parameter of this class.
+  typedef T2& second_type;
+
+  //! The first element of the pair.
+  first_type  first;
+  //! The second element of the pair.
+  second_type second;
+
+  /// \brief Constructor that takes both elements of the pair.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair(first_type f, second_type s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Copy constructor.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  // from std::pair<U,V>
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Assignment operator.
+  ///
+  /// This calls the assignment operators of T1 and T2.  It won't
+  /// compile if the assignment operators are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<first_type, second_type> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+  /// \brief Return the std::pair version of this object.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+template <class T1, class T2>
+struct pair<T1, T2&>
+{
+  //! The first template parameter of this class.
+  typedef T1  first_type;
+  //! The second template parameter of this class.
+  typedef T2& second_type;
+
+  //! The first element of the pair.
+  first_type  first;
+  //! The second element of the pair.
+  second_type second;
+
+  /// \brief Constructor that takes both elements of the pair.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair(first_type const& f, second_type s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Copy constructor.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  // from std::pair<U,V>
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Assignment operator.
+  ///
+  /// This calls the assignment operators of T1 and T2.  It won't
+  /// compile if the assignment operators are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<first_type, second_type> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+  /// \brief Return the std::pair version of this object.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+template <class T1, class T2>
+struct pair<T1&, T2>
+{
+  //! The first template parameter of this class.
+  typedef T1&  first_type;
+  //! The second template parameter of this class.
+  typedef T2 second_type;
+
+  //! The first element of the pair.
+  first_type  first;
+  //! The second element of the pair.
+  second_type second;
+
+  /// \brief Constructor that takes both elements of the pair.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair(first_type f, second_type const& s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Copy constructor.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  // from std::pair<U,V>
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Assignment operator.
+  ///
+  /// This calls the assignment operators of T1 and T2.  It won't
+  /// compile if the assignment operators are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<first_type, second_type> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+  /// \brief Return the std::pair version of this object.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+//! Equality operator for Kokkos::pair.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator== (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return lhs.first==rhs.first && lhs.second==rhs.second; }
+
+//! Inequality operator for Kokkos::pair.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION constexpr
+bool operator!= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return !(lhs==rhs); }
+
+//! Less-than operator for Kokkos::pair.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION constexpr
+bool operator<  (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return lhs.first<rhs.first || (!(rhs.first<lhs.first) && lhs.second<rhs.second); }
+
+//! Less-than-or-equal-to operator for Kokkos::pair.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION constexpr
+bool operator<= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return !(rhs<lhs); }
+
+//! Greater-than operator for Kokkos::pair.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION constexpr
+bool operator>  (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return rhs<lhs; }
+
+//! Greater-than-or-equal-to operator for Kokkos::pair.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION constexpr
+bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return !(lhs<rhs); }
+
+/// \brief Return a new pair.
+///
+/// This is a "nonmember constructor" for Kokkos::pair.  It works just
+/// like std::make_pair.
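+///
+/// A small illustrative example:
+/// \code
+///   Kokkos::pair<int, double> p = Kokkos::make_pair( 1, 2.5 );
+/// \endcode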
+template <class T1,class T2>
+KOKKOS_FORCEINLINE_FUNCTION constexpr
+pair<T1,T2> make_pair (T1 x, T2 y)
+{ return ( pair<T1,T2>(x,y) ); }
+
+/// \brief Return a pair of references to the input arguments.
+///
+/// This is analogous to std::tie (new in C++11).  You can use it to
+/// assign to two variables at once, from the result of a function
+/// that returns a pair.  For example (<tt>__device__</tt> and
+/// <tt>__host__</tt> attributes omitted for brevity):
+/// \code
+/// // Declaration of the function to call.
+/// // First return value: operation count.
+/// // Second return value: whether all operations succeeded.
+/// Kokkos::pair<int, bool> someFunction ();
+///
+/// // Code that uses Kokkos::tie.
+/// int myFunction () {
+///   int count = 0;
+///   bool success = false;
+///
+///   // This assigns to both count and success.
+///   Kokkos::tie (count, success) = someFunction ();
+///
+///   if (! success) {
+///     // ... Some operation failed;
+///     //     take corrective action ...
+///   }
+///   return count;
+/// }
+/// \endcode
+///
+/// The line that uses tie() could have been written like this:
+/// \code
+///   Kokkos::pair<int, bool> result = someFunction ();
+///   count = result.first;
+///   success = result.second;
+/// \endcode
+///
+/// Using tie() saves two lines of code and avoids a copy of each
+/// element of the pair.  The latter could be significant if one or
+/// both elements of the pair are more substantial objects than \c int
+/// or \c bool.
+template <class T1,class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+pair<T1 &,T2 &> tie (T1 & x, T2 & y)
+{ return ( pair<T1 &,T2 &>(x,y) ); }
+
+//
+// Specialization of Kokkos::pair for a \c void second argument.  This
+// is not actually a "pair"; it only contains one element, the first.
+//
+template <class T1>
+struct pair<T1,void>
+{
+  typedef T1 first_type;
+  typedef void second_type;
+
+  first_type  first;
+  enum { second = 0 };
+
+  KOKKOS_FUNCTION_DEFAULTED constexpr
+  pair() = default ;
+
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair(const first_type & f)
+    : first(f)
+  {}
+
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair(const first_type & f, int)
+    : first(f)
+  {}
+
+  template <class U>
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair( const pair<U,void> &p)
+    : first(p.first)
+  {}
+
+  template <class U>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<T1, void> & operator=(const pair<U,void> &p)
+  {
+    first = p.first;
+    return *this;
+  }
+};
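+
+// Example (illustrative, not part of the original header): the void
+// specialization stores only 'first'; 'second' is just a placeholder constant.
+//
+//   Kokkos::pair<int, void> p(42);
+//   // p.first == 42; p.second is the enum constant 0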
+
+//
+// Specialization of relational operators for Kokkos::pair<T1,void>.
+//
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION constexpr
+bool operator== (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return lhs.first==rhs.first; }
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION constexpr
+bool operator!= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return !(lhs==rhs); }
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION constexpr
+bool operator<  (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return lhs.first<rhs.first; }
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION constexpr
+bool operator<= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return !(rhs<lhs); }
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION constexpr
+bool operator>  (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return rhs<lhs; }
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION constexpr
+bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return !(lhs<rhs); }
+
+} // namespace Kokkos
+
+
+#endif //KOKKOS_PAIR_HPP
+
diff --git a/packages/kokkos/core/src/Kokkos_Parallel.hpp b/packages/kokkos/core/src/Kokkos_Parallel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..783cc7fde4a64e5800e055eb493fc246d5a4e26f
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Parallel.hpp
@@ -0,0 +1,532 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Parallel.hpp
+/// \brief Declaration of parallel operators
+
+#ifndef KOKKOS_PARALLEL_HPP
+#define KOKKOS_PARALLEL_HPP
+
+#include <cstddef>
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_View.hpp>
+#include <Kokkos_ExecPolicy.hpp>
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <typeinfo>
+#endif
+
+#include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_FunctorAnalysis.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+#ifdef KOKKOS_DEBUG
+#include<iostream>
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+/** \brief  Given a Functor and Execution Policy query an execution space.
+ *
+ *  if       the Policy has an execution space use that
+ *  else if  the Functor has an execution_space use that
+ *  else if  the Functor has a device_type use that for backward compatibility
+ *  else     use the default
+ */
+template< class Functor
+        , class Policy
+        , class EnableFunctor
+        , class EnablePolicy
+        >
+struct FunctorPolicyExecutionSpace {
+  typedef Kokkos::DefaultExecutionSpace execution_space ;
+};
+
+template< class Functor , class Policy >
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , typename enable_if_type< typename Functor::device_type     >::type
+  , typename enable_if_type< typename Policy ::execution_space >::type
+  >
+{
+  typedef typename Policy ::execution_space execution_space ;
+};
+
+template< class Functor , class Policy >
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , typename enable_if_type< typename Functor::execution_space >::type
+  , typename enable_if_type< typename Policy ::execution_space >::type
+  >
+{
+  typedef typename Policy ::execution_space execution_space ;
+};
+
+template< class Functor , class Policy , class EnableFunctor >
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , EnableFunctor
+  , typename enable_if_type< typename Policy::execution_space >::type
+  >
+{
+  typedef typename Policy ::execution_space execution_space ;
+};
+
+template< class Functor , class Policy , class EnablePolicy >
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , typename enable_if_type< typename Functor::device_type >::type
+  , EnablePolicy
+  >
+{
+  typedef typename Functor::device_type execution_space ;
+};
+
+template< class Functor , class Policy , class EnablePolicy >
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , typename enable_if_type< typename Functor::execution_space >::type
+  , EnablePolicy
+  >
+{
+  typedef typename Functor::execution_space execution_space ;
+};
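+
+// Illustrative sketch (not part of the original header) of how the trait
+// resolves.  Assuming a functor such as
+//
+//   struct MyFunctor {
+//     typedef Kokkos::DefaultExecutionSpace execution_space;
+//     KOKKOS_INLINE_FUNCTION void operator()( const int i ) const { /* ... */ }
+//   };
+//
+// FunctorPolicyExecutionSpace< MyFunctor, void >::execution_space is the
+// functor's execution_space, while a functor without an execution_space (or
+// device_type) typedef falls back to Kokkos::DefaultExecutionSpace.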
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief Execute \c functor in parallel according to the execution \c policy.
+ *
+ * A "functor" is a class containing the function to execute in parallel,
+ * data needed for that execution, and an optional \c execution_space
+ * typedef.  Here is an example functor for parallel_for:
+ *
+ * \code
+ *  class FunctorType {
+ *  public:
+ *    typedef  ...  execution_space ;
+ *    void operator() ( WorkType iwork ) const ;
+ *  };
+ * \endcode
+ *
+ * In the above example, \c WorkType is any integer type for which a
+ * valid conversion from \c size_t to \c WorkType exists.  Its
+ * <tt>operator()</tt> method defines the operation to parallelize,
+ * over the range of integer indices <tt>iwork=[0,work_count-1]</tt>.
+ * This corresponds to a single iteration \c iwork of a \c for loop.
+ * If \c execution_space is not defined, \c DefaultExecutionSpace will be used.
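+ *
+ * A call might look as follows (illustrative sketch; \c FunctorType is the
+ * schematic functor above and \c N is a placeholder):
+ * \code
+ *  FunctorType f ;
+ *  // Explicit execution policy:
+ *  Kokkos::parallel_for( "MyKernel" , Kokkos::RangePolicy<>(0,N) , f );
+ *  // Implicit RangePolicy over [0,N) in the default execution space:
+ *  Kokkos::parallel_for( N , f );
+ * \endcode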
+ */
+template< class ExecPolicy , class FunctorType >
+inline
+void parallel_for( const ExecPolicy  & policy
+                 , const FunctorType & functor
+                 , const std::string& str = ""
+                 , typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
+                 )
+{
+#if defined(KOKKOS_ENABLE_PROFILING)
+  uint64_t kpID = 0;
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecPolicy::work_tag> name(str);
+    Kokkos::Profiling::beginParallelFor(name.get(), 0, &kpID);
+  }
+#endif
+
+  Kokkos::Impl::shared_allocation_tracking_disable();
+  Impl::ParallelFor< FunctorType , ExecPolicy > closure( functor , policy );
+  Kokkos::Impl::shared_allocation_tracking_enable();
+
+  closure.execute();
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::endParallelFor(kpID);
+  }
+#endif
+}
+
+template< class FunctorType >
+inline
+void parallel_for( const size_t        work_count
+                 , const FunctorType & functor
+                 , const std::string& str = ""
+                 )
+{
+  typedef typename
+    Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
+      execution_space ;
+  typedef RangePolicy< execution_space > policy ;
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+  uint64_t kpID = 0;
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Impl::ParallelConstructName<FunctorType, void> name(str);
+    Kokkos::Profiling::beginParallelFor(name.get(), 0, &kpID);
+  }
+#endif
+
+  Kokkos::Impl::shared_allocation_tracking_disable();
+  Impl::ParallelFor< FunctorType , policy > closure( functor , policy(0,work_count) );
+  Kokkos::Impl::shared_allocation_tracking_enable();
+
+  closure.execute();
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::endParallelFor(kpID);
+  }
+#endif
+}
+
+template< class ExecPolicy , class FunctorType >
+inline
+void parallel_for( const std::string & str
+                 , const ExecPolicy  & policy
+                 , const FunctorType & functor )
+{
+  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG Start parallel_for kernel: " << str << std::endl;
+  #endif
+
+  ::Kokkos::parallel_for(policy,functor,str);
+
+  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG End   parallel_for kernel: " << str << std::endl;
+  #endif
+  (void) str;
+}
+
+} // namespace Kokkos
+
+#include <Kokkos_Parallel_Reduce.hpp>
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/// \fn parallel_scan
+/// \tparam ExecutionPolicy The execution policy type.
+/// \tparam FunctorType     The scan functor type.
+///
+/// \param policy  [in] The execution policy.
+/// \param functor [in] The scan functor.
+///
+/// This function implements a parallel scan pattern.  The scan can
+/// be either inclusive or exclusive, depending on how you implement
+/// the scan functor.
+///
+/// A scan functor looks almost exactly like a reduce functor, except
+/// that its operator() takes a third \c bool argument, \c final_pass,
+/// which indicates whether this is the last pass of the scan
+/// operation.  We will show below how to use the \c final_pass
+/// argument to control whether the scan is inclusive or exclusive.
+///
+/// Here is the minimum required interface of a scan functor for a POD
+/// (plain old data) value type \c PodType.  That is, the result is a
+/// View of zero or more PodType.  It is also possible for the result
+/// to be an array of (same-sized) arrays of PodType, but we do not
+/// show the required interface for that here.
+/// \code
+/// template< class ExecPolicy >
+/// class ScanFunctor {
+/// public:
+///   // The Kokkos device type
+///   typedef ... execution_space;
+///   // Type of an entry of the array containing the result;
+///   // also the type of each of the entries combined using
+///   // operator() or join().
+///   typedef PodType value_type;
+///
+///   void operator () (const ExecPolicy::member_type & i, value_type& update, const bool final_pass) const;
+///   void init (value_type& update) const;
+///   void join (volatile value_type& update, volatile const value_type& input) const;
+/// };
+/// \endcode
+///
+/// Here is an example of a functor which computes an inclusive plus-scan
+/// of an array of \c int, in place.  If given an array [1, 2, 3, 4], this
+/// scan will overwrite that array with [1, 3, 6, 10].
+///
+/// \code
+/// template<class SpaceType>
+/// class InclScanFunctor {
+/// public:
+///   typedef SpaceType execution_space;
+///   typedef int value_type;
+///   typedef typename SpaceType::size_type size_type;
+///
+///   InclScanFunctor( Kokkos::View<value_type*, execution_space> x
+///                  , Kokkos::View<value_type*, execution_space> y ) : m_x(x), m_y(y) {}
+///
+///   void operator () (const size_type i, value_type& update, const bool final_pass) const {
+///     update += m_x(i);
+///     if (final_pass) {
+///       m_y(i) = update;
+///     }
+///   }
+///   void init (value_type& update) const {
+///     update = 0;
+///   }
+///   void join (volatile value_type& update, volatile const value_type& input) const {
+///     update += input;
+///   }
+///
+/// private:
+///   Kokkos::View<value_type*, execution_space> m_x;
+///   Kokkos::View<value_type*, execution_space> m_y;
+/// };
+/// \endcode
+///
+/// Here is an example of a functor which computes an <i>exclusive</i>
+/// scan of an array of \c int, in place.  In operator(), note both
+/// that the final_pass test and the update have switched places, and
+/// the use of a temporary.  If given an array [1, 2, 3, 4], this scan
+/// will overwrite that array with [0, 1, 3, 6].
+///
+/// \code
+/// template<class SpaceType>
+/// class ExclScanFunctor {
+/// public:
+///   typedef SpaceType execution_space;
+///   typedef int value_type;
+///   typedef typename SpaceType::size_type size_type;
+///
+///   ExclScanFunctor (Kokkos::View<value_type*, execution_space> x) : x_ (x) {}
+///
+///   void operator () (const size_type i, value_type& update, const bool final_pass) const {
+///     const value_type x_i = x_(i);
+///     if (final_pass) {
+///       x_(i) = update;
+///     }
+///     update += x_i;
+///   }
+///   void init (value_type& update) const {
+///     update = 0;
+///   }
+///   void join (volatile value_type& update, volatile const value_type& input) const {
+///     update += input;
+///   }
+///
+/// private:
+///   Kokkos::View<value_type*, execution_space> x_;
+/// };
+/// \endcode
+///
+/// Here is an example of a functor which builds on the above
+/// exclusive scan example, to compute an offsets array from a
+/// population count array, in place.  We assume that the pop count
+/// array has an extra entry at the end to store the final count.  If
+/// given an array [1, 2, 3, 4, 0], this scan will overwrite that
+/// array with [0, 1, 3, 6, 10].
+///
+/// \code
+/// template<class SpaceType>
+/// class OffsetScanFunctor {
+/// public:
+///   typedef SpaceType execution_space;
+///   typedef int value_type;
+///   typedef typename SpaceType::size_type size_type;
+///
+///   // last_index_ is the last valid index (zero-based) of x.
+///   // If x has length zero, then last_index_ won't be used anyway.
+///   OffsetScanFunctor( Kokkos::View<value_type*, execution_space> x
+///                    , Kokkos::View<value_type*, execution_space> y )
+///      : m_x(x), m_y(y), last_index_ (x.dimension_0 () == 0 ? 0 : x.dimension_0 () - 1)
+///   {}
+///
+///   void operator () (const size_type i, int& update, const bool final_pass) const {
+///     if (final_pass) {
+///       m_y(i) = update;
+///     }
+///     update += m_x(i);
+///     // The last entry of m_y gets the final sum.
+///     if (final_pass && i == last_index_) {
+///       m_y(i+1) = update;
+///     }
+///   }
+///   void init (value_type& update) const {
+///     update = 0;
+///   }
+///   void join (volatile value_type& update, volatile const value_type& input) const {
+///     update += input;
+///   }
+///
+/// private:
+///   Kokkos::View<value_type*, execution_space> m_x;
+///   Kokkos::View<value_type*, execution_space> m_y;
+///   const size_type last_index_;
+/// };
+/// \endcode
+///
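+/// A call using the inclusive-scan functor above might look as follows
+/// (illustrative sketch; \c SpaceType and \c N are placeholders):
+/// \code
+/// Kokkos::View<int*, SpaceType> x ("x", N);
+/// Kokkos::View<int*, SpaceType> y ("y", N);
+/// // ... fill x ...
+/// Kokkos::parallel_scan (N, InclScanFunctor<SpaceType> (x, y));
+/// \endcode
+///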
+template< class ExecutionPolicy , class FunctorType >
+inline
+void parallel_scan( const ExecutionPolicy & policy
+                  , const FunctorType     & functor
+                  , const std::string& str = ""
+                  , typename Impl::enable_if< ! Impl::is_integral< ExecutionPolicy >::value >::type * = 0
+                  )
+{
+#if defined(KOKKOS_ENABLE_PROFILING)
+  uint64_t kpID = 0;
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecutionPolicy::work_tag> name(str);
+    Kokkos::Profiling::beginParallelScan(name.get(), 0, &kpID);
+  }
+#endif
+
+  Kokkos::Impl::shared_allocation_tracking_disable();
+  Impl::ParallelScan< FunctorType , ExecutionPolicy > closure( functor , policy );
+  Kokkos::Impl::shared_allocation_tracking_enable();
+
+  closure.execute();
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::endParallelScan(kpID);
+  }
+#endif
+
+}
+
+template< class FunctorType >
+inline
+void parallel_scan( const size_t        work_count
+                  , const FunctorType & functor
+                  , const std::string& str = "" )
+{
+  typedef typename
+    Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
+      execution_space ;
+
+  typedef Kokkos::RangePolicy< execution_space > policy ;
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+  uint64_t kpID = 0;
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Impl::ParallelConstructName<FunctorType, void> name(str);
+    Kokkos::Profiling::beginParallelScan(name.get(), 0, &kpID);
+  }
+#endif
+
+  Kokkos::Impl::shared_allocation_tracking_disable();
+  Impl::ParallelScan< FunctorType , policy > closure( functor , policy(0,work_count) );
+  Kokkos::Impl::shared_allocation_tracking_enable();
+
+  closure.execute();
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::endParallelScan(kpID);
+  }
+#endif
+
+}
+
+template< class ExecutionPolicy , class FunctorType >
+inline
+void parallel_scan( const std::string& str
+                  , const ExecutionPolicy & policy
+                  , const FunctorType     & functor)
+{
+  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG Start parallel_scan kernel: " << str << std::endl;
+  #endif
+
+  ::Kokkos::parallel_scan(policy,functor,str);
+
+  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG End   parallel_scan kernel: " << str << std::endl;
+  #endif
+  (void) str;
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class Enable = void >
+struct FunctorTeamShmemSize
+{
+  KOKKOS_INLINE_FUNCTION static size_t value( const FunctorType & , int ) { return 0 ; }
+};
+
+template< class FunctorType >
+struct FunctorTeamShmemSize< FunctorType , typename Impl::enable_if< 0 < sizeof( & FunctorType::team_shmem_size ) >::type >
+{
+  static inline size_t value( const FunctorType & f , int team_size ) { return f.team_shmem_size( team_size ) ; }
+};
+
+template< class FunctorType >
+struct FunctorTeamShmemSize< FunctorType , typename Impl::enable_if< 0 < sizeof( & FunctorType::shmem_size ) >::type >
+{
+  static inline size_t value( const FunctorType & f , int team_size ) { return f.shmem_size( team_size ) ; }
+};
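+
+// Illustrative sketch (not part of the original header): a functor that
+// requests per-team scratch memory is detected through either hook above,
+// for example
+//
+//   struct MyTeamFunctor {
+//     // Bytes of team shared memory requested for a given team size.
+//     size_t team_shmem_size( int team_size ) const
+//       { return team_size * sizeof(double); }
+//     // ... operator() etc. ...
+//   };
+//
+// FunctorTeamShmemSize< MyTeamFunctor >::value( f, team_size ) then returns
+// f.team_shmem_size( team_size ), and 0 for functors without such a hook.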
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_PARALLEL_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d122d5bdca2b154f0a3711f344ad421da2abf696
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
@@ -0,0 +1,1138 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_PARALLEL_REDUCE_HPP
+#define KOKKOS_PARALLEL_REDUCE_HPP
+
+#include <Kokkos_NumericTraits.hpp>
+
+namespace Kokkos {
+
+template<class T, class Enable = void>
+struct is_reducer_type {
+  enum { value = 0 };
+};
+
+
+template<class T>
+struct is_reducer_type<T,typename std::enable_if<
+                       std::is_same<typename std::remove_cv<T>::type,
+                                    typename std::remove_cv<typename T::reducer>::type>::value
+                      >::type> {
+  enum { value = 1 };
+};
+
+namespace Experimental {
+
+
+template<class Scalar, class Space>
+struct Sum {
+public:
+  //Required
+  typedef Sum reducer;
+  typedef typename std::remove_cv<Scalar>::type value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  value_type* value;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  Sum(value_type& value_): value(&value_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  Sum(const result_view_type& value_): value(value_.data()) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    dest += src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest += src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = reduction_identity<value_type>::sum();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type& reference() const {
+    return *value;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  result_view_type view() const {
+    return result_view_type(value);
+  }
+};
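+
+// Illustrative usage sketch (not part of the original header): a reducer is
+// passed as the result argument of parallel_reduce.  Here 'x' (a 1D view of
+// double) and 'N' are placeholders.
+//
+//   double total = 0.0;
+//   Kokkos::parallel_reduce( N,
+//     KOKKOS_LAMBDA( const int i, double & update ) { update += x(i); },
+//     Kokkos::Experimental::Sum<double, Kokkos::HostSpace>(total) );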
+
+template<class Scalar, class Space>
+struct Prod {
+public:
+  //Required
+  typedef Prod reducer;
+  typedef typename std::remove_cv<Scalar>::type value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  value_type* value;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  Prod(value_type& value_): value(&value_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  Prod(const result_view_type& value_): value(value_.data()) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    dest *= src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest *= src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = reduction_identity<value_type>::prod();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type& reference() const {
+    return *value;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  result_view_type view() const {
+    return result_view_type(value);
+  }
+};
+
+template<class Scalar, class Space>
+struct Min {
+public:
+  //Required
+  typedef Min reducer;
+  typedef typename std::remove_cv<Scalar>::type value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  value_type* value;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  Min(value_type& value_): value(&value_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  Min(const result_view_type& value_): value(value_.data()) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    if ( src < dest )
+      dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    if ( src < dest )
+      dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = reduction_identity<value_type>::min();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type& reference() const {
+    return *value;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  result_view_type view() const {
+    return result_view_type(value);
+  }
+};
+
+template<class Scalar, class Space>
+struct Max {
+public:
+  //Required
+  typedef Max reducer;
+  typedef typename std::remove_cv<Scalar>::type value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  value_type* value;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  Max(value_type& value_): value(&value_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  Max(const result_view_type& value_): value(value_.data()) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    if ( src > dest )
+      dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    if ( src > dest )
+      dest = src;
+  }
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = reduction_identity<value_type>::max();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type& reference() const {
+    return *value;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  result_view_type view() const {
+    return result_view_type(value);
+  }
+};
+
+template<class Scalar, class Space>
+struct LAnd {
+public:
+  //Required
+  typedef LAnd reducer;
+  typedef typename std::remove_cv<Scalar>::type value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  value_type* value;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  LAnd(value_type& value_): value(&value_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  LAnd(const result_view_type& value_): value(value_.data()) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    dest = dest && src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest = dest && src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = reduction_identity<value_type>::land();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type& reference() const {
+    return *value;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  result_view_type view() const {
+    return result_view_type(value);
+  }
+};
+
+template<class Scalar, class Space>
+struct LOr {
+public:
+  //Required
+  typedef LOr reducer;
+  typedef typename std::remove_cv<Scalar>::type value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  value_type* value;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  LOr(value_type& value_): value(&value_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  LOr(const result_view_type& value_): value(value_.data()) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    dest = dest || src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest = dest || src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = reduction_identity<value_type>::lor();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type& reference() const {
+    return *value;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  result_view_type view() const {
+    return result_view_type(value);
+  }
+};
+
+template<class Scalar, class Space>
+struct BAnd {
+public:
+  //Required
+  typedef BAnd reducer;
+  typedef typename std::remove_cv<Scalar>::type value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  value_type* value;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  BAnd(value_type& value_): value(&value_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  BAnd(const result_view_type& value_): value(value_.data()) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+      dest = dest & src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest = dest & src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = reduction_identity<value_type>::band();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type& reference() const {
+    return *value;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  result_view_type view() const {
+    return result_view_type(value);
+  }
+};
+
+template<class Scalar, class Space>
+struct BOr {
+public:
+  //Required
+  typedef BOr reducer;
+  typedef typename std::remove_cv<Scalar>::type value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  value_type* value;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  BOr(value_type& value_): value(&value_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  BOr(const result_view_type& value_): value(value_.data()) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+      dest = dest | src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest = dest | src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = reduction_identity<value_type>::bor();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type& reference() const {
+    return *value;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  result_view_type view() const {
+    return result_view_type(value);
+  }
+};
+
+template<class Scalar, class Index>
+struct ValLocScalar {
+  Scalar val;
+  Index loc;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator = (const ValLocScalar& rhs) {
+    val = rhs.val;
+    loc = rhs.loc;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator = (const volatile ValLocScalar& rhs) volatile {
+    val = rhs.val;
+    loc = rhs.loc;
+  }
+};
+
+template<class Scalar, class Index, class Space>
+struct MinLoc {
+private:
+  typedef typename std::remove_cv<Scalar>::type scalar_type;
+  typedef typename std::remove_cv<Index>::type index_type;
+
+public:
+  //Required
+  typedef MinLoc reducer;
+  typedef ValLocScalar<scalar_type,index_type> value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  value_type* value;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  MinLoc(value_type& value_): value(&value_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  MinLoc(const result_view_type& value_): value(value_.data()) {}
+
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    if ( src.val < dest.val )
+      dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    if ( src.val < dest.val )
+      dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val.val = reduction_identity<scalar_type>::min();
+    val.loc = reduction_identity<index_type>::min();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type& reference() {
+    return *value;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  result_view_type view() const {
+    return result_view_type(value);
+  }
+};
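+
+// Illustrative usage sketch (not part of the original header): find the
+// minimum value of a 1D view 'x' together with its index ('x' and 'N' are
+// placeholders).
+//
+//   typedef Kokkos::Experimental::MinLoc<double, int, Kokkos::HostSpace> reducer_type;
+//   reducer_type::value_type result;
+//   Kokkos::parallel_reduce( N,
+//     KOKKOS_LAMBDA( const int i, reducer_type::value_type & update ) {
+//       if ( x(i) < update.val ) { update.val = x(i); update.loc = i; }
+//     },
+//     reducer_type(result) );
+//   // result.val holds the minimum, result.loc its index.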
+
+template<class Scalar, class Index, class Space>
+struct MaxLoc {
+private:
+  typedef typename std::remove_cv<Scalar>::type scalar_type;
+  typedef typename std::remove_cv<Index>::type index_type;
+
+public:
+  //Required
+  typedef MaxLoc reducer;
+  typedef ValLocScalar<scalar_type,index_type> value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  value_type* value;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  MaxLoc(value_type& value_): value(&value_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  MaxLoc(const result_view_type& value_): value(value_.data()) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    if ( src.val > dest.val )
+      dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    if ( src.val > dest.val )
+      dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val.val = reduction_identity<scalar_type>::max();
+    val.loc = reduction_identity<index_type>::min();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type& reference() {
+    return *value;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  result_view_type view() const {
+    return result_view_type(value);
+  }
+};
+
+template<class Scalar>
+struct MinMaxScalar {
+  Scalar min_val,max_val;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator = (const MinMaxScalar& rhs) {
+    min_val = rhs.min_val;
+    max_val = rhs.max_val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator = (const volatile MinMaxScalar& rhs) volatile {
+    min_val = rhs.min_val;
+    max_val = rhs.max_val;
+  }
+};
+
+template<class Scalar, class Space>
+struct MinMax {
+private:
+  typedef typename std::remove_cv<Scalar>::type scalar_type;
+
+public:
+  //Required
+  typedef MinMax reducer;
+  typedef MinMaxScalar<scalar_type> value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  value_type* value;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  MinMax(value_type& value_): value(&value_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  MinMax(const result_view_type& value_): value(value_.data()) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    if ( src.min_val < dest.min_val ) {
+      dest.min_val = src.min_val;
+    }
+    if ( src.max_val > dest.max_val ) {
+      dest.max_val = src.max_val;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    if ( src.min_val < dest.min_val ) {
+      dest.min_val = src.min_val;
+    }
+    if ( src.max_val > dest.max_val ) {
+      dest.max_val = src.max_val;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val.max_val = reduction_identity<scalar_type>::max();
+    val.min_val = reduction_identity<scalar_type>::min();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type& reference() {
+    return *value;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  result_view_type view() const {
+    return result_view_type(value);
+  }
+};
+
+template<class Scalar, class Index>
+struct MinMaxLocScalar {
+  Scalar min_val,max_val;
+  Index min_loc,max_loc;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator = (const MinMaxLocScalar& rhs) {
+    min_val = rhs.min_val;
+    min_loc = rhs.min_loc;
+    max_val = rhs.max_val;
+    max_loc = rhs.max_loc;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator = (const volatile MinMaxLocScalar& rhs) volatile {
+    min_val = rhs.min_val;
+    min_loc = rhs.min_loc;
+    max_val = rhs.max_val;
+    max_loc = rhs.max_loc;
+  }
+};
+
+template<class Scalar, class Index, class Space>
+struct MinMaxLoc {
+private:
+  typedef typename std::remove_cv<Scalar>::type scalar_type;
+  typedef typename std::remove_cv<Index>::type index_type;
+
+public:
+  //Required
+  typedef MinMaxLoc reducer;
+  typedef MinMaxLocScalar<scalar_type,index_type> value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  value_type* value;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  MinMaxLoc(value_type& value_): value(&value_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  MinMaxLoc(const result_view_type& value_): value(value_.data()) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    if ( src.min_val < dest.min_val ) {
+      dest.min_val = src.min_val;
+      dest.min_loc = src.min_loc;
+    }
+    if ( src.max_val > dest.max_val ) {
+      dest.max_val = src.max_val;
+      dest.max_loc = src.max_loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    if ( src.min_val < dest.min_val ) {
+      dest.min_val = src.min_val;
+      dest.min_loc = src.min_loc;
+    }
+    if ( src.max_val > dest.max_val ) {
+      dest.max_val = src.max_val;
+      dest.max_loc = src.max_loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val.max_val = reduction_identity<scalar_type>::max();
+    val.min_val = reduction_identity<scalar_type>::min();
+    val.max_loc = reduction_identity<index_type>::min();
+    val.min_loc = reduction_identity<index_type>::min();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type& reference() {
+    return *value;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  result_view_type view() const {
+    return result_view_type(value);
+  }
+};
+} // namespace Experimental
+} // namespace Kokkos
+
+
+namespace Kokkos {
+namespace Impl {
+
+template< class T, class ReturnType , class ValueTraits>
+struct ParallelReduceReturnValue;
+
+template< class ReturnType , class FunctorType >
+struct ParallelReduceReturnValue<typename std::enable_if<Kokkos::is_view<ReturnType>::value>::type, ReturnType, FunctorType> {
+  typedef ReturnType return_type;
+  typedef InvalidType reducer_type;
+
+  typedef typename return_type::value_type value_type_scalar;
+  typedef typename return_type::value_type* const value_type_array;
+
+  typedef typename if_c<return_type::rank==0,value_type_scalar,value_type_array>::type value_type;
+
+  static return_type& return_value(ReturnType& return_val, const FunctorType&) {
+    return return_val;
+  }
+};
+
+template< class ReturnType , class FunctorType>
+struct ParallelReduceReturnValue<typename std::enable_if<
+                                   !Kokkos::is_view<ReturnType>::value &&
+                                  (!std::is_array<ReturnType>::value && !std::is_pointer<ReturnType>::value) &&
+                                   !Kokkos::is_reducer_type<ReturnType>::value
+                                 >::type, ReturnType, FunctorType> {
+  typedef Kokkos::View<  ReturnType
+                       , Kokkos::HostSpace
+                       , Kokkos::MemoryUnmanaged
+      > return_type;
+
+  typedef InvalidType reducer_type;
+
+  typedef typename return_type::value_type value_type;
+
+  static return_type return_value(ReturnType& return_val, const FunctorType&) {
+    return return_type(&return_val);
+  }
+};
+
+template< class ReturnType , class FunctorType>
+struct ParallelReduceReturnValue<typename std::enable_if<
+                                  (is_array<ReturnType>::value || std::is_pointer<ReturnType>::value)
+                                >::type, ReturnType, FunctorType> {
+  typedef Kokkos::View<  typename std::remove_const<ReturnType>::type
+                       , Kokkos::HostSpace
+                       , Kokkos::MemoryUnmanaged
+      > return_type;
+
+  typedef InvalidType reducer_type;
+
+  typedef typename return_type::value_type value_type[];
+
+  static return_type return_value(ReturnType& return_val,
+                                  const FunctorType& functor) {
+    return return_type(return_val,functor.value_count);
+  }
+};
+
+template< class ReturnType , class FunctorType>
+struct ParallelReduceReturnValue<typename std::enable_if<
+                                   Kokkos::is_reducer_type<ReturnType>::value
+                                >::type, ReturnType, FunctorType> {
+  typedef ReturnType return_type;
+  typedef ReturnType reducer_type;
+  typedef typename return_type::value_type value_type;
+
+  static return_type return_value(ReturnType& return_val,
+                                  const FunctorType& functor) {
+    return return_val;
+  }
+};
+}
+
+namespace Impl {
+template< class T, class ReturnType , class FunctorType>
+struct ParallelReducePolicyType;
+
+template< class PolicyType , class FunctorType >
+struct ParallelReducePolicyType<typename std::enable_if<Kokkos::Impl::is_execution_policy<PolicyType>::value>::type, PolicyType,FunctorType> {
+
+  typedef PolicyType policy_type;
+  static PolicyType policy(const PolicyType& policy_) {
+    return policy_;
+  }
+};
+
+template< class PolicyType , class FunctorType >
+struct ParallelReducePolicyType<typename std::enable_if<std::is_integral<PolicyType>::value>::type, PolicyType,FunctorType> {
+  typedef typename
+    Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
+      execution_space ;
+
+  typedef Kokkos::RangePolicy<execution_space> policy_type;
+
+  static policy_type policy(const PolicyType& policy_) {
+    return policy_type(0,policy_);
+  }
+};
+
+}
+
+namespace Impl {
+  template< class FunctorType, class ExecPolicy, class ValueType, class ExecutionSpace>
+  struct ParallelReduceFunctorType {
+    typedef FunctorType functor_type;
+    static const functor_type& functor(const functor_type& functor) {
+      return functor;
+    }
+  };
+}
+
+namespace Impl {
+
+  template< class PolicyType, class FunctorType, class ReturnType >
+  struct ParallelReduceAdaptor {
+    typedef Impl::ParallelReduceReturnValue<void,ReturnType,FunctorType> return_value_adapter;
+    #ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
+    typedef Impl::ParallelReduceFunctorType<FunctorType,PolicyType,
+                                            typename return_value_adapter::value_type,
+                                            typename PolicyType::execution_space> functor_adaptor;
+    #endif
+    static inline
+    void execute(const std::string& label,
+        const PolicyType& policy,
+        const FunctorType& functor,
+        ReturnType& return_value) {
+          #if defined(KOKKOS_ENABLE_PROFILING)
+          uint64_t kpID = 0;
+          if(Kokkos::Profiling::profileLibraryLoaded()) {
+            Kokkos::Impl::ParallelConstructName<FunctorType, typename PolicyType::work_tag> name(label);
+            Kokkos::Profiling::beginParallelReduce(name.get(), 0, &kpID);
+          }
+          #endif
+
+          Kokkos::Impl::shared_allocation_tracking_disable();
+          #ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
+          Impl::ParallelReduce<typename functor_adaptor::functor_type, PolicyType, typename return_value_adapter::reducer_type >
+             closure(functor_adaptor::functor(functor),
+                     policy,
+                     return_value_adapter::return_value(return_value,functor));
+          #else
+          Impl::ParallelReduce<FunctorType, PolicyType, typename return_value_adapter::reducer_type >
+             closure(functor,
+                     policy,
+                     return_value_adapter::return_value(return_value,functor));
+          #endif
+          Kokkos::Impl::shared_allocation_tracking_enable();
+          closure.execute();
+
+          #if defined(KOKKOS_ENABLE_PROFILING)
+          if(Kokkos::Profiling::profileLibraryLoaded()) {
+            Kokkos::Profiling::endParallelReduce(kpID);
+          }
+          #endif
+        }
+
+  };
+}
+
+//----------------------------------------------------------------------------
+
+/*! \fn void parallel_reduce(label,policy,functor,return_argument)
+    \brief Perform a parallel reduction.
+    \param label An optional label giving the kernel a name; any argument from which a std::string can be constructed is accepted.
+    \param policy A Kokkos Execution Policy, such as an integer, a RangePolicy or a TeamPolicy.
+    \param functor A functor with a reduction operator, and optional init, join and final functions.
+    \param return_argument A return argument which can be a scalar, a View, or a ReducerStruct. This argument can be left out if the functor has a final function.
+*/
+
+/** \brief  Parallel reduction
+ *
+ * parallel_reduce performs parallel reductions with arbitrary functions, i.e.
+ * it is not restricted to simple data-parallel operations.  The call expects up
+ * to four arguments: an optional label, an execution policy, the functor, and
+ * an optional return argument (see the parameter descriptions above).
+ *
+ * Example of a parallel_reduce functor for a POD (plain old data) value type:
+ * \code
+ *  class FunctorType { // For POD value type
+ *  public:
+ *    typedef    ...     execution_space ;
+ *    typedef <podType>  value_type ;
+ *    void operator()( <intType> iwork , <podType> & update ) const ;
+ *    void init( <podType> & update ) const ;
+ *    void join( volatile       <podType> & update ,
+ *               volatile const <podType> & input ) const ;
+ *
+ *    typedef true_type has_final ;
+ *    void final( <podType> & update ) const ;
+ *  };
+ * \endcode
+ *
+ * Example of a parallel_reduce functor for an array of POD (plain old data) values:
+ * \code
+ *  class FunctorType { // For array of POD value
+ *  public:
+ *    typedef    ...     execution_space ;
+ *    typedef <podType>  value_type[] ;
+ *    void operator()( <intType> , <podType> update[] ) const ;
+ *    void init( <podType> update[] ) const ;
+ *    void join( volatile       <podType> update[] ,
+ *               volatile const <podType> input[] ) const ;
+ *
+ *    typedef true_type has_final ;
+ *    void final( <podType> update[] ) const ;
+ *  };
+ * \endcode
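+ *
+ * A call with a scalar result might look as follows (illustrative sketch;
+ * FunctorType and N are placeholders as above):
+ * \code
+ *  <podType> result ;
+ *  Kokkos::parallel_reduce( "MyReduction" , N , FunctorType(...) , result );
+ * \endcode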
+ */
+
+// ReturnValue is scalar or array: take by reference
+
+template< class PolicyType, class FunctorType, class ReturnType >
+inline
+void parallel_reduce(const std::string& label,
+                     const PolicyType& policy,
+                     const FunctorType& functor,
+                     ReturnType& return_value,
+                     typename Impl::enable_if<
+                       Kokkos::Impl::is_execution_policy<PolicyType>::value
+                     >::type * = 0) {
+  Impl::ParallelReduceAdaptor<PolicyType,FunctorType,ReturnType>::execute(label,policy,functor,return_value);
+}
+
+template< class PolicyType, class FunctorType, class ReturnType >
+inline
+void parallel_reduce(const PolicyType& policy,
+                     const FunctorType& functor,
+                     ReturnType& return_value,
+                     typename Impl::enable_if<
+                       Kokkos::Impl::is_execution_policy<PolicyType>::value
+                     >::type * = 0) {
+  Impl::ParallelReduceAdaptor<PolicyType,FunctorType,ReturnType>::execute("",policy,functor,return_value);
+}
+
+template< class FunctorType, class ReturnType >
+inline
+void parallel_reduce(const size_t& policy,
+                     const FunctorType& functor,
+                     ReturnType& return_value) {
+  typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type;
+  Impl::ParallelReduceAdaptor<policy_type,FunctorType,ReturnType>::execute("",policy_type(0,policy),functor,return_value);
+}
+
+template< class FunctorType, class ReturnType >
+inline
+void parallel_reduce(const std::string& label,
+                     const size_t& policy,
+                     const FunctorType& functor,
+                     ReturnType& return_value) {
+  typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type;
+  Impl::ParallelReduceAdaptor<policy_type,FunctorType,ReturnType>::execute(label,policy_type(0,policy),functor,return_value);
+}
+
+// ReturnValue as View or Reducer: take by copy to allow for inline construction
+
+template< class PolicyType, class FunctorType, class ReturnType >
+inline
+void parallel_reduce(const std::string& label,
+                     const PolicyType& policy,
+                     const FunctorType& functor,
+                     const ReturnType& return_value,
+                     typename Impl::enable_if<
+                       Kokkos::Impl::is_execution_policy<PolicyType>::value
+                     >::type * = 0) {
+  Impl::ParallelReduceAdaptor<PolicyType,FunctorType,const ReturnType>::execute(label,policy,functor,return_value);
+}
+
+template< class PolicyType, class FunctorType, class ReturnType >
+inline
+void parallel_reduce(const PolicyType& policy,
+                     const FunctorType& functor,
+                     const ReturnType& return_value,
+                     typename Impl::enable_if<
+                       Kokkos::Impl::is_execution_policy<PolicyType>::value
+                     >::type * = 0) {
+  ReturnType return_value_impl = return_value;
+  Impl::ParallelReduceAdaptor<PolicyType,FunctorType,ReturnType>::execute("",policy,functor,return_value_impl);
+}
+
+template< class FunctorType, class ReturnType >
+inline
+void parallel_reduce(const size_t& policy,
+                     const FunctorType& functor,
+                     const ReturnType& return_value) {
+  typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type;
+  ReturnType return_value_impl = return_value;
+  Impl::ParallelReduceAdaptor<policy_type,FunctorType,ReturnType>::execute("",policy_type(0,policy),functor,return_value_impl);
+}
+
+template< class FunctorType, class ReturnType >
+inline
+void parallel_reduce(const std::string& label,
+                     const size_t& policy,
+                     const FunctorType& functor,
+                     const ReturnType& return_value) {
+  typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type;
+  ReturnType return_value_impl = return_value;
+  Impl::ParallelReduceAdaptor<policy_type,FunctorType,ReturnType>::execute(label,policy_type(0,policy),functor,return_value_impl);
+}
+
+// No Return Argument
+
+template< class PolicyType, class FunctorType>
+inline
+void parallel_reduce(const std::string& label,
+                     const PolicyType& policy,
+                     const FunctorType& functor,
+                     typename Impl::enable_if<
+                       Kokkos::Impl::is_execution_policy<PolicyType>::value
+                     >::type * = 0) {
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
+  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
+                                     , typename ValueTraits::value_type
+                                     , typename ValueTraits::pointer_type
+                                     >::type value_type ;
+
+  typedef Kokkos::View< value_type
+              , Kokkos::HostSpace
+              , Kokkos::MemoryUnmanaged
+              > result_view_type;
+  result_view_type result_view ;
+
+  Impl::ParallelReduceAdaptor<PolicyType,FunctorType,result_view_type>::execute(label,policy,functor,result_view);
+}
+
+template< class PolicyType, class FunctorType >
+inline
+void parallel_reduce(const PolicyType& policy,
+                     const FunctorType& functor,
+                     typename Impl::enable_if<
+                       Kokkos::Impl::is_execution_policy<PolicyType>::value
+                     >::type * = 0) {
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
+  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
+                                     , typename ValueTraits::value_type
+                                     , typename ValueTraits::pointer_type
+                                     >::type value_type ;
+
+  typedef Kokkos::View< value_type
+              , Kokkos::HostSpace
+              , Kokkos::MemoryUnmanaged
+              > result_view_type;
+  result_view_type result_view ;
+
+  Impl::ParallelReduceAdaptor<PolicyType,FunctorType,result_view_type>::execute("",policy,functor,result_view);
+}
+
+template< class FunctorType >
+inline
+void parallel_reduce(const size_t& policy,
+                     const FunctorType& functor) {
+  typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
+  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
+                                     , typename ValueTraits::value_type
+                                     , typename ValueTraits::pointer_type
+                                     >::type value_type ;
+
+  typedef Kokkos::View< value_type
+              , Kokkos::HostSpace
+              , Kokkos::MemoryUnmanaged
+              > result_view_type;
+  result_view_type result_view ;
+
+  Impl::ParallelReduceAdaptor<policy_type,FunctorType,result_view_type>::execute("",policy_type(0,policy),functor,result_view);
+}
+
+template< class FunctorType>
+inline
+void parallel_reduce(const std::string& label,
+                     const size_t& policy,
+                     const FunctorType& functor) {
+  typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
+  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
+                                     , typename ValueTraits::value_type
+                                     , typename ValueTraits::pointer_type
+                                     >::type value_type ;
+
+  typedef Kokkos::View< value_type
+              , Kokkos::HostSpace
+              , Kokkos::MemoryUnmanaged
+              > result_view_type;
+  result_view_type result_view ;
+
+  Impl::ParallelReduceAdaptor<policy_type,FunctorType,result_view_type>::execute(label,policy_type(0,policy),functor,result_view);
+}
+
+} //namespace Kokkos
+
+#endif // KOKKOS_PARALLEL_REDUCE_HPP
+
diff --git a/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp b/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b3fd3af70b25e6971aa0e1ea7d375c4822571cb7
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp
@@ -0,0 +1,111 @@
+/*
+ //@HEADER
+ // ************************************************************************
+ //
+ //                        Kokkos v. 2.0
+ //              Copyright (2014) Sandia Corporation
+ //
+ // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+ // the U.S. Government retains certain rights in this software.
+ //
+ // Redistribution and use in source and binary forms, with or without
+ // modification, are permitted provided that the following conditions are
+ // met:
+ //
+ // 1. Redistributions of source code must retain the above copyright
+ // notice, this list of conditions and the following disclaimer.
+ //
+ // 2. Redistributions in binary form must reproduce the above copyright
+ // notice, this list of conditions and the following disclaimer in the
+ // documentation and/or other materials provided with the distribution.
+ //
+ // 3. Neither the name of the Corporation nor the names of the
+ // contributors may be used to endorse or promote products derived from
+ // this software without specific prior written permission.
+ //
+ // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+ // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+ // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ //
+ // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+ //
+ // ************************************************************************
+ //@HEADER
+ */
+
+#ifndef KOKKOSP_PROFILE_SECTION_HPP
+#define KOKKOSP_PROFILE_SECTION_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+#include <string>
+
+namespace Kokkos {
+namespace Profiling {
+
+class ProfilingSection {
+
+public:
+	// secID is zero-initialized so it is well defined even when no profiling
+	// library is loaded.
+	ProfilingSection(const std::string& sectionName) :
+		secName(sectionName), secID(0) {
+
+		#if defined( KOKKOS_ENABLE_PROFILING )
+			if(Kokkos::Profiling::profileLibraryLoaded()) {
+				Kokkos::Profiling::createProfileSection(secName, &secID);
+			}
+		#else
+			secID = 0;
+		#endif
+	}
+	
+	void start() {
+		#if defined( KOKKOS_ENABLE_PROFILING )
+			if(Kokkos::Profiling::profileLibraryLoaded()) {
+				Kokkos::Profiling::startSection(secID);
+			}
+		#endif
+	}
+	
+	void stop() {
+		#if defined( KOKKOS_ENABLE_PROFILING )
+			if(Kokkos::Profiling::profileLibraryLoaded()) {
+				Kokkos::Profiling::stopSection(secID);
+			}
+		#endif
+	}
+	
+	~ProfilingSection() {
+		#if defined( KOKKOS_ENABLE_PROFILING )
+			if(Kokkos::Profiling::profileLibraryLoaded()) {
+				Kokkos::Profiling::destroyProfileSection(secID);
+			}
+		#endif
+	}
+	
+	std::string getName() {
+		return secName;
+	}
+	
+	uint32_t getSectionID() {
+		return secID;
+	}
+	
+protected:
+	const std::string secName;
+	uint32_t secID;
+
+};
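+// Minimal usage sketch (illustration only): a ProfilingSection is constructed
+// once and then started/stopped around the code region of interest, e.g.
+//
+//   Kokkos::Profiling::ProfilingSection section("assembly");
+//   section.start();
+//   /* ... work attributed to "assembly" ... */
+//   section.stop();
+//
+// When no profiling library is loaded (or KOKKOS_ENABLE_PROFILING is not
+// defined) these calls are effectively no-ops.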
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/packages/kokkos/core/src/Kokkos_Qthreads.hpp b/packages/kokkos/core/src/Kokkos_Qthreads.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..16702f864736a17bd9c77980d06de949d50d688b
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Qthreads.hpp
@@ -0,0 +1,201 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREADS_HPP
+#define KOKKOS_QTHREADS_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_QTHREADS )
+
+#include <Kokkos_Core_fwd.hpp>
+
+// Defines to enable experimental Qthreads functionality.
+#define QTHREAD_LOCAL_PRIORITY
+#define CLONED_TASKS
+
+#include <qthread.h>
+
+#include <cstddef>
+#include <iosfwd>
+
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_Parallel.hpp>
+//#include <Kokkos_MemoryTraits.hpp>
+//#include <Kokkos_ExecPolicy.hpp>
+//#include <Kokkos_TaskScheduler.hpp> // Uncomment when Tasking working.
+#include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_Tags.hpp>
+#include <KokkosExp_MDRangePolicy.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+namespace Impl {
+
+class QthreadsExec;
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Execution space supported by Qthreads */
+class Qthreads {
+public:
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+
+  //! Tag this class as an execution space
+  typedef Qthreads                 execution_space;
+  typedef Kokkos::HostSpace        memory_space;
+  //! This execution space preferred device_type
+  typedef Kokkos::Device< execution_space, memory_space > device_type;
+
+  typedef Kokkos::LayoutRight      array_layout;
+  typedef memory_space::size_type  size_type;
+
+  typedef ScratchMemorySpace< Qthreads > scratch_memory_space;
+
+  //@}
+  /*------------------------------------------------------------------------*/
+
+  /** \brief  Initialization will construct one or more instances */
+  static Qthreads & instance( int = 0 );
+
+  /** \brief  Set the execution space to a "sleep" state.
+   *
+   * This function puts the execution space into a "sleep" state in which it
+   * is not ready for work.  This may consume fewer resources than the
+   * "ready" state, but it may also take time to transition back to the
+   * "ready" state.
+   *
+   * \return True if it enters or is already in the "sleep" state.
+   *         False if functions are currently executing.
+   */
+  bool sleep();
+
+  /** \brief  Wake from the sleep state.
+   *
+   *  \return True if it enters or is already in the "ready" state.
+   *          False if functions are currently executing.
+   */
+  static bool wake();
+
+  /** \brief Wait until all dispatched functors complete.
+   *
+   *  The parallel_for or parallel_reduce dispatch of a functor may
+   *  return asynchronously, before the functor completes.  This
+   *  method does not return until all dispatched functors on this
+   *  device have completed.
+   */
+  static void fence();
+
+  /*------------------------------------------------------------------------*/
+
+  static int in_parallel();
+
+  static int is_initialized();
+
+  /** \brief  Return maximum amount of concurrency */
+  static int concurrency();
+
+  static void initialize( int thread_count );
+  static void finalize();
+
+  /** \brief Print configuration information to the given output stream. */
+  static void print_configuration( std::ostream &, const bool detail = false );
+
+  int shepherd_size() const;
+  int shepherd_worker_size() const;
+
+  static const char* name();
+};
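+// Minimal usage sketch (illustration only, assuming Kokkos was built with
+// KOKKOS_ENABLE_QTHREADS): the Qthreads execution space can be named
+// explicitly in an execution policy, e.g.
+//
+//   Kokkos::Qthreads::initialize( 4 /* thread_count */ );
+//   Kokkos::parallel_for(
+//     Kokkos::RangePolicy< Kokkos::Qthreads >( 0, n ),
+//     KOKKOS_LAMBDA( const int i ) { /* ... */ } );
+//   Kokkos::Qthreads::fence();
+//   Kokkos::Qthreads::finalize();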
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+namespace Impl {
+
+template<>
+struct MemorySpaceAccess
+  < Kokkos::Qthreads::memory_space
+  , Kokkos::Qthreads::scratch_memory_space
+  >
+{
+  enum { assignable = false };
+  enum { accessible = true };
+  enum { deepcopy   = false };
+};
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::Qthreads::memory_space
+  , Kokkos::Qthreads::scratch_memory_space
+  >
+{
+  enum { value = true };
+  inline static void verify( void ) {}
+  inline static void verify( const void * ) {}
+};
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+#include <Qthreads/Kokkos_QthreadsExec.hpp>
+#include <Qthreads/Kokkos_Qthreads_Parallel.hpp>
+//#include <Qthreads/Kokkos_Qthreads_Task.hpp> // Uncomment when Tasking working.
+//#include <Qthreads/Kokkos_Qthreads_TaskQueue.hpp> // Uncomment when Tasking working.
+
+#endif // #if defined( KOKKOS_ENABLE_QTHREADS )
+#endif // #ifndef KOKKOS_QTHREADS_HPP
+
diff --git a/packages/kokkos/core/src/Kokkos_ROCm.hpp b/packages/kokkos/core/src/Kokkos_ROCm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..469d6b27878cc5603313f11c835f765dfcda0b3b
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_ROCm.hpp
@@ -0,0 +1,248 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_ROCM_HPP
+#define KOKKOS_ROCM_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_ENABLE_ROCM )
+
+class dim3 {
+public:
+  int x, y, z;
+  dim3( int _x, int _y, int _z ) : x(_x), y(_y), z(_z) {}
+};
+
+#include <ROCm/hc_math_std.hpp>
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_ROCmSpace.hpp>
+#include <ROCm/Kokkos_ROCm_Exec.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+#include <hc.hpp>
+#include <hc_am.hpp>
+#include <amp_math.h>
+
+#if defined( __HCC_ACCELERATOR__ )
+
+using namespace ::Concurrency::precise_math ;
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+class ROCmExec ;
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Experimental {
+/// \class ROCm
+/// \brief Kokkos execution space for AMD GPUs using the ROCm (HC) backend.
+class ROCm {
+public:
+  //------------------------------------
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+
+  //! Tag this class as a kokkos execution space
+  typedef ROCm                  execution_space ;
+  typedef ROCmSpace             memory_space ;
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  typedef LayoutLeft            array_layout ;
+  typedef HostSpace::size_type  size_type ;
+
+  typedef ScratchMemorySpace< ROCm > scratch_memory_space ;
+
+  ~ROCm() {}
+  ROCm();
+//  explicit ROCm( const int instance_id );
+
+  ROCm( ROCm && ) = default ;
+  ROCm( const ROCm & ) = default ;
+  ROCm & operator = ( ROCm && ) = default ;
+  ROCm & operator = ( const ROCm & ) = default ;
+
+
+  //@}
+  //------------------------------------
+  //! \name Functions that all Kokkos devices must implement.
+  //@{
+
+  KOKKOS_INLINE_FUNCTION static int in_parallel() {
+#if defined( __HCC_ACCELERATOR__ )
+    return true;
+#else
+    return false;
+#endif
+  }
+
+  /** \brief  Set the device in a "sleep" state. */
+  static bool sleep() ;
+
+  /** \brief Wake the device from the 'sleep' state. */
+  static bool wake() ;
+
+  /** \brief Wait until all dispatched functors complete. */
+  static void fence() ;
+
+  /// \brief Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  /// \brief Free any resources being consumed by the device.
+  static void finalize() ;
+
+  /** \brief  Initialize the device.
+   *
+   */
+  struct SelectDevice {
+    int rocm_device_id ;
+    SelectDevice() : rocm_device_id(1) {}
+    explicit SelectDevice( int id ) : rocm_device_id( id+1 ) {}
+  };
+
+  int          rocm_device() const { return m_device ; }
+  bool         isAPU();
+  bool         isAPU(int device);
+
+  static void initialize( const SelectDevice = SelectDevice());
+
+  static int is_initialized();
+
+//  static size_type device_arch();
+
+//  static size_type detect_device_count();
+
+
+  static int concurrency() ;
+  static const char* name();
+private:
+  int          m_device ;
+
+};
+}
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct MemorySpaceAccess
+  < Kokkos::Experimental::ROCmSpace
+  , Kokkos::Experimental::ROCm::scratch_memory_space
+  >
+{
+  enum { assignable = false };
+  enum { accessible = true };
+  enum { deepcopy   = false };
+};
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::Experimental::ROCm::memory_space
+  , Kokkos::Experimental::ROCm::scratch_memory_space
+  >
+{
+  enum { value = true };
+  KOKKOS_INLINE_FUNCTION static void verify( void ) { }
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
+};
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::HostSpace
+  , Kokkos::Experimental::ROCm::scratch_memory_space
+  >
+{
+  enum { value = false };
+  inline static void verify( void ) { Kokkos::Experimental::ROCmSpace::access_error(); }
+  inline static void verify( const void * p ) { Kokkos::Experimental::ROCmSpace::access_error(p); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#define threadIdx_x (hc_get_workitem_id(0))
+#define threadIdx_y (hc_get_workitem_id(1))
+#define threadIdx_z (hc_get_workitem_id(2))
+
+#define blockIdx_x  (hc_get_group_id(0))
+#define blockIdx_y  (hc_get_group_id(1))
+#define blockIdx_z  (hc_get_group_id(2))
+
+#define blockDim_x  (hc_get_group_size(0))
+#define blockDim_y  (hc_get_group_size(1))
+#define blockDim_z  (hc_get_group_size(2))
+
+#define gridDim_x   (hc_get_num_groups(0))
+#define gridDim_y   (hc_get_num_groups(1))
+#define gridDim_z   (hc_get_num_groups(2))
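+
+// The macros above provide CUDA-style index names (threadIdx_*, blockIdx_*,
+// blockDim_*, gridDim_*) mapped onto the corresponding HC work-item, group-id,
+// group-size, and group-count intrinsics used by this backend.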
+
+
+#include <ROCm/Kokkos_ROCm_Parallel.hpp>
+#include <ROCm/Kokkos_ROCm_Task.hpp>
+
+#endif
+#endif
+
+
diff --git a/packages/kokkos/core/src/Kokkos_ROCmSpace.hpp b/packages/kokkos/core/src/Kokkos_ROCmSpace.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0b98ab1b74b29c1eb7644bce38c9a49742552303
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_ROCmSpace.hpp
@@ -0,0 +1,622 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_ROCMSPACE_HPP
+#define KOKKOS_ROCMSPACE_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_ENABLE_ROCM )
+
+#include <iosfwd>
+#include <typeinfo>
+#include <string>
+
+#include <Kokkos_HostSpace.hpp>
+
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Experimental {
+/** \brief  ROCm on-device memory management */
+
+class ROCmSpace {
+public:
+
+  //! Tag this class as a kokkos memory space
+  typedef ROCmSpace             memory_space ;
+  typedef Kokkos::Experimental::ROCm          execution_space ;
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  typedef unsigned int          size_type ;
+
+  /*--------------------------------*/
+
+  ROCmSpace();
+  ROCmSpace( ROCmSpace && rhs ) = default ;
+  ROCmSpace( const ROCmSpace & rhs ) = default ;
+  ROCmSpace & operator = ( ROCmSpace && rhs ) = default ;
+  ROCmSpace & operator = ( const ROCmSpace & rhs ) = default ;
+  ~ROCmSpace() = default ;
+
+  /**\brief  Allocate untracked memory in the rocm space */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate untracked memory in the rocm space */
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const ;
+
+  /**\brief Return Name of the MemorySpace */
+  static constexpr const char* name() { return m_name; };
+
+  /*--------------------------------*/
+  /** \brief  Error reporting for HostSpace attempt to access ROCmSpace */
+  static void access_error();
+  static void access_error( const void * const );
+
+private:
+
+  int  m_device ; ///< Which ROCm device
+
+  static constexpr const char* m_name = "ROCm";
+  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > ;
+};
+
+} // namespace Experimental
+
+namespace Impl {
+
+void * rocm_device_allocate(int);
+void * rocm_hostpinned_allocate(int);
+void rocm_device_free(void * );
+
+/// \brief Initialize lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function initializes the locks to zero (unset).
+void init_lock_arrays_rocm_space();
+
+/// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+int* atomic_lock_array_rocm_space_ptr(bool deallocate = false);
+
+/// \brief Retrieve the pointer to the scratch array for team and thread private global memory.
+///
+/// Team and Thread private scratch allocations in
+/// global memory are acquired via locks.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+int* scratch_lock_array_rocm_space_ptr(bool deallocate = false);
+
+/// \brief Retrieve the pointer to the scratch array for unique identifiers.
+///
+/// Unique identifiers in the range 0-ROCm::concurrency
+/// are provided via locks.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+int* threadid_lock_array_rocm_space_ptr(bool deallocate = false);
+}
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+
+namespace Kokkos {
+namespace Experimental {
+/** \brief  Host memory that is accessible to ROCm execution space
+ *          through ROCm's host-pinned memory allocation.
+ */
+class ROCmHostPinnedSpace {
+public:
+
+  //! Tag this class as a kokkos memory space
+  /** \brief  Memory is in HostSpace so use the HostSpace::execution_space */
+  typedef HostSpace::execution_space  execution_space ;
+  typedef ROCmHostPinnedSpace         memory_space ;
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+  typedef unsigned int                size_type ;
+
+  /*--------------------------------*/
+
+  ROCmHostPinnedSpace();
+  ROCmHostPinnedSpace( ROCmHostPinnedSpace && rhs ) = default ;
+  ROCmHostPinnedSpace( const ROCmHostPinnedSpace & rhs ) = default ;
+  ROCmHostPinnedSpace & operator = ( ROCmHostPinnedSpace && rhs ) = default ;
+  ROCmHostPinnedSpace & operator = ( const ROCmHostPinnedSpace & rhs ) = default ;
+  ~ROCmHostPinnedSpace() = default ;
+
+  /**\brief  Allocate untracked memory in the space */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate untracked memory in the space */
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const ;
+
+  /**\brief Return Name of the MemorySpace */
+  static constexpr const char* name() { return m_name; };
+
+private:
+
+  static constexpr const char* m_name = "ROCmHostPinned";
+
+  /*--------------------------------*/
+};
+} // namespace Experimental
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace >::assignable , "" );
+
+//----------------------------------------
+
+template<>
+struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::ROCmSpace > {
+  enum { assignable = false };
+  enum { accessible = false };
+  enum { deepcopy   = true };
+};
+
+template<>
+struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace > {
+  // HostSpace::execution_space == ROCmHostPinnedSpace::execution_space
+  enum { assignable = true };
+  enum { accessible = true };
+  enum { deepcopy   = true };
+};
+
+//----------------------------------------
+
+template<>
+struct MemorySpaceAccess< Kokkos::Experimental::ROCmSpace , Kokkos::HostSpace > {
+  enum { assignable = false };
+  enum { accessible = false };
+  enum { deepcopy   = true };
+};
+
+template<>
+struct MemorySpaceAccess< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmHostPinnedSpace > {
+  // ROCmSpace::execution_space != ROCmHostPinnedSpace::execution_space
+  enum { assignable = false };
+  enum { accessible = true }; // ROCmSpace::execution_space
+  enum { deepcopy   = true };
+};
+
+
+//----------------------------------------
+// ROCmHostPinnedSpace::execution_space == HostSpace::execution_space
+// ROCmHostPinnedSpace accessible to both ROCm and Host
+
+template<>
+struct MemorySpaceAccess< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::HostSpace > {
+  enum { assignable = false }; // Cannot access from ROCm
+  enum { accessible = true };  // ROCmHostPinnedSpace::execution_space
+  enum { deepcopy   = true };
+};
+
+template<>
+struct MemorySpaceAccess< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmSpace > {
+  enum { assignable = false }; // Cannot access from Host
+  enum { accessible = false };
+  enum { deepcopy   = true };
+};
+
+} // namespace Impl
+//----------------------------------------
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+hc::completion_future DeepCopyAsyncROCm( void * dst , const void * src , size_t n);
+
+template<> struct DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm>
+{
+  DeepCopy( void * dst , const void * src , size_t );
+  DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
+};
+
+template<> struct DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , Kokkos::Experimental::ROCm >
+{
+  DeepCopy( void * dst , const void * src , size_t );
+  DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
+};
+
+template<> struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >
+{
+  DeepCopy( void * dst , const void * src , size_t );
+  DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
+};
+
+template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
+    fut.wait();
+//    DeepCopy (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , Kokkos::Experimental::ROCm>( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopy (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace>
+struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopy (dst,src,n);
+  }
+};
+
+template<> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm>
+{
+  DeepCopy( void * dst , const void * src , size_t );
+  DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
+};
+
+template<> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , HostSpace , Kokkos::Experimental::ROCm >
+{
+  DeepCopy( void * dst , const void * src , size_t );
+  DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
+};
+
+template<> struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm >
+{
+  DeepCopy( void * dst , const void * src , size_t );
+  DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
+};
+
+template<class ExecutionSpace>
+struct DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmHostPinnedSpace , ExecutionSpace>
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
+    fut.wait();
+//    DeepCopyROCm (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
+    fut.wait();
+//    DeepCopyROCm (dst,src,n);
+  }
+};
+
+
+
+template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmHostPinnedSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+//    hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
+//    fut.wait();
+//    DeepCopyAsyncROCm (dst,src,n);
+    DeepCopy (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , HostSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , HostSpace , Kokkos::Experimental::ROCm>( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopy (dst,src,n);
+  }
+};
+
+template<class ExecutionSpace>
+struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopy (dst,src,n);
+  }
+};
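+
+// Minimal usage sketch (illustration only): these specializations back
+// Kokkos::deep_copy between host and ROCm memory, e.g.
+//
+//   Kokkos::View< double*, Kokkos::Experimental::ROCmSpace > dev( "dev", n );
+//   auto host = Kokkos::create_mirror_view( dev );
+//   /* ... fill host ... */
+//   Kokkos::deep_copy( dev, host );   // HostSpace -> ROCmSpace
+//   Kokkos::deep_copy( host, dev );   // ROCmSpace -> HostSpace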
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** Running in ROCmSpace attempting to access HostSpace: error */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::ROCmSpace , Kokkos::HostSpace >
+{
+  enum { value = false };
+  KOKKOS_INLINE_FUNCTION static void verify( void )
+    { Kokkos::abort("ROCm code attempted to access HostSpace memory"); }
+
+  KOKKOS_INLINE_FUNCTION static void verify( const void * )
+    { Kokkos::abort("ROCm code attempted to access HostSpace memory"); }
+};
+
+/** Running in ROCmSpace accessing ROCmHostPinnedSpace: ok */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmHostPinnedSpace >
+{
+  enum { value = true };
+  KOKKOS_INLINE_FUNCTION static void verify( void ) { }
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
+};
+
+/** Running in ROCmSpace attempting to access an unknown space: error */
+template< class OtherSpace >
+struct VerifyExecutionCanAccessMemorySpace<
+  typename enable_if< ! is_same<Kokkos::Experimental::ROCmSpace,OtherSpace>::value , Kokkos::Experimental::ROCmSpace >::type ,
+  OtherSpace >
+{
+  enum { value = false };
+  KOKKOS_INLINE_FUNCTION static void verify( void )
+    { Kokkos::abort("ROCm code attempted to access unknown Space memory"); }
+
+  KOKKOS_INLINE_FUNCTION static void verify( const void * )
+    { Kokkos::abort("ROCm code attempted to access unknown Space memory"); }
+};
+
+//----------------------------------------------------------------------------
+/** Running in HostSpace attempting to access ROCmSpace */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::ROCmSpace >
+{
+  enum { value = false };
+  inline static void verify( void ) { Kokkos::Experimental::ROCmSpace::access_error(); }
+  inline static void verify( const void * p ) { Kokkos::Experimental::ROCmSpace::access_error(p); }
+};
+
+/** Running in HostSpace accessing ROCmHostPinnedSpace is OK */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace >
+{
+  enum { value = true };
+  KOKKOS_INLINE_FUNCTION static void verify( void ) {}
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) {}
+};
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+class SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+
+
+  typedef SharedAllocationRecord< void , void >  RecordBase ;
+
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  static void deallocate( RecordBase * );
+
+  static RecordBase s_root_record ;
+
+  const Kokkos::Experimental::ROCmSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+
+  SharedAllocationRecord( const Kokkos::Experimental::ROCmSpace        & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+
+  std::string get_label() const ;
+
+  static SharedAllocationRecord * allocate( const Kokkos::Experimental::ROCmSpace &  arg_space
+                                          , const std::string       &  arg_label
+                                          , const size_t               arg_alloc_size );
+
+  /**\brief  Allocate tracked memory in the space */
+  static
+  void * allocate_tracked( const Kokkos::Experimental::ROCmSpace & arg_space
+                         , const std::string & arg_label
+                         , const size_t arg_alloc_size );
+
+  /**\brief  Reallocate tracked memory in the space */
+  static
+  void * reallocate_tracked( void * const arg_alloc_ptr
+                           , const size_t arg_alloc_size );
+
+  /**\brief  Deallocate tracked memory in the space */
+  static
+  void deallocate_tracked( void * const arg_alloc_ptr );
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  static void print_records( std::ostream & , const Kokkos::Experimental::ROCmSpace & , bool detail = false );
+};
+
+template<>
+class SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+
+  typedef SharedAllocationRecord< void , void >  RecordBase ;
+
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  static void deallocate( RecordBase * );
+
+  static RecordBase s_root_record ;
+
+  const Kokkos::Experimental::ROCmHostPinnedSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() : RecordBase(), m_space() {}
+
+  SharedAllocationRecord( const Kokkos::Experimental::ROCmHostPinnedSpace     & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+
+  std::string get_label() const ;
+
+  static SharedAllocationRecord * allocate( const Kokkos::Experimental::ROCmHostPinnedSpace &  arg_space
+                                          , const std::string          &  arg_label
+                                          , const size_t                  arg_alloc_size
+                                          );
+  /**\brief  Allocate tracked memory in the space */
+  static
+  void * allocate_tracked( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
+                         , const std::string & arg_label
+                         , const size_t arg_alloc_size );
+
+  /**\brief  Reallocate tracked memory in the space */
+  static
+  void * reallocate_tracked( void * const arg_alloc_ptr
+                           , const size_t arg_alloc_size );
+
+  /**\brief  Deallocate tracked memory in the space */
+  static
+  void deallocate_tracked( void * const arg_alloc_ptr );
+
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  static void print_records( std::ostream & , const Kokkos::Experimental::ROCmHostPinnedSpace & , bool detail = false );
+};
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_ROCM ) */
+#endif /* #define KOKKOS_ROCMSPACE_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4527dd4c14586e2c451d0658a5c774bf0da286a4
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
@@ -0,0 +1,163 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SCRATCHSPACE_HPP
+#define KOKKOS_SCRATCHSPACE_HPP
+
+#include <cstdio>
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Concepts.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Scratch memory space associated with an execution space.
+ *
+ */
+template< class ExecSpace >
+class ScratchMemorySpace {
+  static_assert (is_execution_space<ExecSpace>::value,"Instantiating ScratchMemorySpace on non-execution-space type.");
+public:
+
+  // Alignment of memory chunks returned by 'get'
+  // must be a power of two
+  enum { ALIGN = 8 };
+
+private:
+
+  mutable char * m_iter_L0 ;
+  char *         m_end_L0 ;
+  mutable char * m_iter_L1 ;
+  char *         m_end_L1 ;
+
+
+  mutable int m_multiplier;
+  mutable int m_offset;
+  mutable int m_default_level;
+
+  ScratchMemorySpace();
+  ScratchMemorySpace & operator = ( const ScratchMemorySpace & );
+
+  enum { MASK = ALIGN - 1 }; // Alignment used by View::shmem_size
+
+public:
+
+  //! Tag this class as a memory space
+  typedef ScratchMemorySpace                memory_space ;
+  typedef ExecSpace                         execution_space ;
+  //! This execution space preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  typedef typename ExecSpace::array_layout  array_layout ;
+  typedef typename ExecSpace::size_type     size_type ;
+
+  template< typename IntType >
+  KOKKOS_INLINE_FUNCTION static
+  IntType align( const IntType & size )
+    { return ( size + MASK ) & ~MASK ; }
+
+  template< typename IntType >
+  KOKKOS_INLINE_FUNCTION
+  void* get_shmem (const IntType& size, int level = -1) const {
+    if(level == -1)
+      level = m_default_level;
+    if(level == 0) {
+      void* tmp = m_iter_L0 + m_offset * align (size);
+      if (m_end_L0 < (m_iter_L0 += align (size) * m_multiplier)) {
+        m_iter_L0 -= align (size) * m_multiplier; // put it back like it was
+        #ifdef KOKKOS_DEBUG
+        // mfh 23 Jun 2015: printf call consumes 25 registers
+        // in a CUDA build, so only print in debug mode.  The
+        // function still returns NULL if not enough memory.
+        printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
+                "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
+                long(m_end_L0-m_iter_L0));
+        #endif // KOKKOS_DEBUG
+        tmp = 0;
+      }
+      return tmp;
+    } else {
+      void* tmp = m_iter_L1 + m_offset * align (size);
+      if (m_end_L1 < (m_iter_L1 += align (size) * m_multiplier)) {
+        m_iter_L1 -= align (size) * m_multiplier; // put it back like it was
+        #ifdef KOKKOS_DEBUG
+        // mfh 23 Jun 2015: printf call consumes 25 registers
+        // in a CUDA build, so only print in debug mode.  The
+        // function still returns NULL if not enough memory.
+        printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
+                "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
+                long(m_end_L1-m_iter_L1));
+        #endif // KOKKOS_DEBUG
+        tmp = 0;
+      }
+      return tmp;
+
+    }
+  }
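+
+  // Minimal usage sketch (illustration only): user code normally reaches this
+  // allocator through a team member inside a TeamPolicy kernel, e.g.
+  //
+  //   double * buf = static_cast< double * >(
+  //     team_member.team_shmem().get_shmem( len * sizeof(double) ) );
+  //   if ( buf == nullptr ) { /* requested more scratch than was reserved */ }
+  //
+  // with the scratch capacity reserved up front via policy.set_scratch_size().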
+
+  template< typename IntType >
+  KOKKOS_INLINE_FUNCTION
+  ScratchMemorySpace( void * ptr_L0 , const IntType & size_L0 , void * ptr_L1 = NULL , const IntType & size_L1 = 0)
+    : m_iter_L0( (char *) ptr_L0 )
+    , m_end_L0(  m_iter_L0 + size_L0 )
+    , m_iter_L1( (char *) ptr_L1 )
+    , m_end_L1(  m_iter_L1 + size_L1 )
+    , m_multiplier( 1 )
+    , m_offset( 0 )
+    , m_default_level( 0 )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  const ScratchMemorySpace& set_team_thread_mode(const int& level, const int& multiplier, const int& offset) const {
+    m_default_level = level;
+    m_multiplier = multiplier;
+    m_offset = offset;
+    return *this;
+  }
+};
+
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_SCRATCHSPACE_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_Serial.hpp b/packages/kokkos/core/src/Kokkos_Serial.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..065b5a0c5efa7112f064fa6092e5c9362051d173
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Serial.hpp
@@ -0,0 +1,1083 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Serial.hpp
+/// \brief Declaration and definition of Kokkos::Serial device.
+
+#ifndef KOKKOS_SERIAL_HPP
+#define KOKKOS_SERIAL_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_SERIAL )
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_TaskScheduler.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_HostThreadTeam.hpp>
+#include <impl/Kokkos_FunctorAnalysis.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+#include <Kokkos_UniqueToken.hpp>
+
+namespace Kokkos {
+
+/// \class Serial
+/// \brief Kokkos device for non-parallel execution
+///
+/// A "device" represents a parallel execution model.  It tells Kokkos
+/// how to parallelize the execution of kernels in a parallel_for or
+/// parallel_reduce.  For example, the Threads device uses Pthreads or
+/// C++11 threads on a CPU, the OpenMP device uses the OpenMP language
+/// extensions, and the Cuda device uses NVIDIA's CUDA programming
+/// model.  The Serial device executes "parallel" kernels
+/// sequentially.  This is useful if you really do not want to use
+/// threads, or if you want to explore different combinations of MPI
+/// and shared-memory parallel programming models.
+class Serial {
+public:
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+
+  //! Tag this class as an execution space:
+  typedef Serial                execution_space ;
+  //! The size_type typedef best suited for this device.
+  typedef HostSpace::size_type  size_type ;
+  //! This device's preferred memory space.
+  typedef HostSpace             memory_space ;
+  //! This execution space preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  //! This device's preferred array layout.
+  typedef LayoutRight           array_layout ;
+
+  /// \brief  Scratch memory space
+  typedef ScratchMemorySpace< Kokkos::Serial >  scratch_memory_space ;
+
+  //@}
+
+  /// \brief True if and only if this method is being called in a
+  ///   thread-parallel function.
+  ///
+  /// For the Serial device, this method <i>always</i> returns false,
+  /// because parallel_for or parallel_reduce with the Serial device
+  /// always execute sequentially.
+  inline static int in_parallel() { return false ; }
+
+  /** \brief  Set the device in a "sleep" state.
+   *
+   * This function sets the device in a "sleep" state in which it is
+   * not ready for work.  This may consume fewer resources than if the
+   * device were in an "awake" state, but it may also take time to
+   * bring the device from a sleep state to be ready for work.
+   *
+   * \return True if the device is in the "sleep" state, else false if
+   *   the device is actively working and could not enter the "sleep"
+   *   state.
+   */
+  static bool sleep();
+
+  /// \brief Wake the device from the 'sleep' state so it is ready for work.
+  ///
+  /// \return True if the device is in the "ready" state, else "false"
+  ///  if the device is actively working (which also means that it's
+  ///  awake).
+  static bool wake();
+
+  /// \brief Wait until all dispatched functors complete.
+  ///
+  /// The parallel_for or parallel_reduce dispatch of a functor may
+  /// return asynchronously, before the functor completes.  This
+  /// method does not return until all dispatched functors on this
+  /// device have completed.
+  static void fence() {}
+
+  static void initialize( unsigned threads_count = 1 ,
+                          unsigned use_numa_count = 0 ,
+                          unsigned use_cores_per_numa = 0 ,
+                          bool allow_asynchronous_threadpool = false);
+
+  static bool is_initialized();
+
+  /** \brief  Return the maximum amount of concurrency.  */
+  static int concurrency() { return 1; }
+
+  //! Free any resources being consumed by the device.
+  static void finalize();
+
+  //! Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool /* detail */ = false ) {}
+
+  //--------------------------------------------------------------------------
+
+  inline static int thread_pool_size( int = 0 ) { return 1 ; }
+  KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; }
+
+  //--------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
+  inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
+
+  static const char* name();
+  //--------------------------------------------------------------------------
+};
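+
+// Minimal usage sketch (illustration only): Serial can be requested explicitly
+// in an execution policy when sequential execution is desired, e.g.
+//
+//   Kokkos::parallel_for(
+//     Kokkos::RangePolicy< Kokkos::Serial >( 0, n ),
+//     KOKKOS_LAMBDA( const int i ) { /* ... */ } );
+//
+// which runs the loop body sequentially on the host regardless of the
+// default execution space.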
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct MemorySpaceAccess
+  < Kokkos::Serial::memory_space
+  , Kokkos::Serial::scratch_memory_space
+  >
+{
+  enum { assignable = false };
+  enum { accessible = true };
+  enum { deepcopy   = false };
+};
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::Serial::memory_space
+  , Kokkos::Serial::scratch_memory_space
+  >
+{
+  enum { value = true };
+  inline static void verify( void ) { }
+  inline static void verify( const void * ) { }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+// Resize thread team data scratch memory
+void serial_resize_thread_team_data( size_t pool_reduce_bytes
+                                   , size_t team_reduce_bytes
+                                   , size_t team_shared_bytes
+                                   , size_t thread_local_bytes );
+
+HostThreadTeamData * serial_get_thread_team_data();
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+
+namespace Kokkos {
+namespace Impl {
+
+/*
+ * < Kokkos::Serial , WorkArgTag >
+ * < WorkArgTag , Impl::enable_if< std::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value >::type >
+ *
+ */
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::Serial , Properties ... >:public PolicyTraits<Properties...>
+{
+private:
+
+  size_t m_team_scratch_size[2] ;
+  size_t m_thread_scratch_size[2] ;
+  int    m_league_size ;
+  int    m_chunk_size;
+
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicyInternal      execution_policy ;
+
+  typedef PolicyTraits<Properties ... > traits;
+
+  //! Execution space of this execution policy:
+  typedef Kokkos::Serial  execution_space ;
+
+  TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
+    m_league_size = p.m_league_size;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
+    m_chunk_size = p.m_chunk_size;
+    return *this;
+  }
+
+  //----------------------------------------
+
+  template< class FunctorType >
+  static
+  int team_size_max( const FunctorType & ) { return 1 ; }
+
+  template< class FunctorType >
+  static
+  int team_size_recommended( const FunctorType & ) { return 1 ; }
+
+  template< class FunctorType >
+  static
+  int team_size_recommended( const FunctorType & , const int& ) { return 1 ; }
+
+  //----------------------------------------
+
+  inline int team_size() const { return 1 ; }
+  inline int league_size() const { return m_league_size ; }
+  inline size_t scratch_size(const int& level, int = 0) const { return m_team_scratch_size[level] + m_thread_scratch_size[level]; }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal( execution_space &
+            , int league_size_request
+            , int /* team_size_request */
+            , int /* vector_length_request */ = 1 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
+    , m_league_size( league_size_request )
+    , m_chunk_size ( 32 )
+    {}
+
+  TeamPolicyInternal( execution_space &
+            , int league_size_request
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int /* vector_length_request */ = 1 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
+    , m_league_size( league_size_request )
+    , m_chunk_size ( 32 )
+    {}
+
+  TeamPolicyInternal( int league_size_request
+            , int /* team_size_request */
+            , int /* vector_length_request */ = 1 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
+    , m_league_size( league_size_request )
+    , m_chunk_size ( 32 )
+    {}
+
+  TeamPolicyInternal( int league_size_request
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int /* vector_length_request */ = 1 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
+    , m_league_size( league_size_request )
+    , m_chunk_size ( 32 )
+    {}
+
+  inline int chunk_size() const { return m_chunk_size ; }
+
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
+    TeamPolicyInternal p = *this;
+    p.m_chunk_size = chunk_size_;
+    return p;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
+    TeamPolicyInternal p = *this;
+    p.m_team_scratch_size[level] = per_team.value;
+    return p;
+  };
+
+  /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal p = *this;
+    p.m_thread_scratch_size[level] = per_thread.value;
+    return p;
+  };
+
+  /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal p = *this;
+    p.m_team_scratch_size[level] = per_team.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
+    return p;
+  };
+
+  typedef Impl::HostThreadTeamMember< Kokkos::Serial >  member_type ;
+
+protected:
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal internal_set_chunk_size(typename traits::index_type chunk_size_) {
+    m_chunk_size = chunk_size_;
+    return *this;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerTeamValue& per_team) {
+    m_team_scratch_size[level] = per_team.value;
+    return *this;
+  };
+
+  /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerThreadValue& per_thread) {
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  };
+
+  /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) {
+    m_team_scratch_size[level] = per_team.value;
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  };
+};
+} /* namespace Impl */
+} /* namespace Kokkos */
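+
+// Usage sketch for the TeamPolicyInternal specialization above (illustrative
+// only; `league_size` and the 1024-byte request are assumed caller choices).
+// On the Serial backend the team size is effectively 1 and scratch requests
+// map onto the m_team_scratch_size / m_thread_scratch_size arrays:
+//
+//   Kokkos::TeamPolicy< Kokkos::Serial > policy( league_size , 1 );
+//   policy = policy.set_scratch_size( 0 , Kokkos::PerTeam( 1024 ) );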
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+/* Parallel patterns for Kokkos::Serial with RangePolicy */
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType ,
+                   Kokkos::RangePolicy< Traits ... > ,
+                   Kokkos::Serial
+                 >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+
+  const FunctorType m_functor ;
+  const Policy      m_policy ;
+
+  template< class TagType >
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec() const
+    {
+      const typename Policy::member_type e = m_policy.end();
+      for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        m_functor( i );
+      }
+    }
+
+  template< class TagType >
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec() const
+    {
+      const TagType t{} ;
+      const typename Policy::member_type e = m_policy.end();
+      for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        m_functor( t , i );
+      }
+    }
+
+public:
+
+  inline
+  void execute() const
+    { this-> template exec< typename Policy::work_tag >(); }
+
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    {}
+};
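+
+// Usage sketch for the RangePolicy ParallelFor above (illustrative only;
+// `n`, `a`, `x`, and `y` are assumed caller-defined scalars and Views):
+//
+//   Kokkos::parallel_for( "axpy"
+//                       , Kokkos::RangePolicy< Kokkos::Serial >( 0 , n )
+//                       , KOKKOS_LAMBDA( const int i ) { y(i) += a * x(i); } );
+//
+// On Kokkos::Serial this dispatches to the sequential loop in exec() above.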
+
+/*--------------------------------------------------------------------------*/
+
+template< class FunctorType , class ReducerType , class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::RangePolicy< Traits ... >
+                    , ReducerType
+                    , Kokkos::Serial
+                    >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+  typedef typename Policy::work_tag                                  WorkTag ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, WorkTag, void>::type WorkTagFwd;
+
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
+
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTagFwd >  ValueInit ;
+
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ;
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+
+  template< class TagType >
+  inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec( reference_type update ) const
+    {
+      const typename Policy::member_type e = m_policy.end();
+      for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        m_functor( i , update );
+      }
+    }
+
+  template< class TagType >
+  inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec( reference_type update ) const
+    {
+      const TagType t{} ;
+
+      const typename Policy::member_type e = m_policy.end();
+      for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        m_functor( t , i , update );
+      }
+    }
+
+public:
+
+  inline
+  void execute() const
+    {
+      const size_t pool_reduce_size =
+        Analysis::value_size( ReducerConditional::select(m_functor , m_reducer) );
+      const size_t team_reduce_size  = 0 ; // Never shrinks
+      const size_t team_shared_size  = 0 ; // Never shrinks
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      serial_resize_thread_team_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
+      HostThreadTeamData & data = *serial_get_thread_team_data();
+
+      pointer_type ptr =
+        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+
+      reference_type update =
+        ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      this-> template exec< WorkTag >( update );
+
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::
+        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
+    }
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const Policy       & arg_policy ,
+                  const HostViewType & arg_result_view ,
+                  typename std::enable_if<
+                               Kokkos::is_view< HostViewType >::value &&
+                              !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result_view.data() )
+    {
+      static_assert( Kokkos::is_view< HostViewType >::value
+        , "Kokkos::Serial reduce result must be a View" );
+
+      static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
+        , "Kokkos::Serial reduce result must be a View in HostSpace" );
+    }
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
+};
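+
+// Usage sketch for the RangePolicy ParallelReduce above (illustrative only;
+// `n` and `x` are assumed caller-defined). Reducing into a host scalar uses
+// the HostSpace result pointer checked by the static_asserts above:
+//
+//   double sum = 0 ;
+//   Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Serial >( 0 , n )
+//     , KOKKOS_LAMBDA( const int i , double & update ) { update += x(i); }
+//     , sum );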
+
+
+/*--------------------------------------------------------------------------*/
+
+template< class FunctorType , class ... Traits >
+class ParallelScan< FunctorType
+                  , Kokkos::RangePolicy< Traits ... >
+                  , Kokkos::Serial
+                  >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+  typedef typename Policy::work_tag                                  WorkTag ;
+
+  typedef FunctorAnalysis< FunctorPatternInterface::SCAN , Policy , FunctorType > Analysis ;
+
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag >  ValueInit ;
+
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ;
+
+  template< class TagType >
+  inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec( reference_type update ) const
+    {
+      const typename Policy::member_type e = m_policy.end();
+      for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        m_functor( i , update , true );
+      }
+    }
+
+  template< class TagType >
+  inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec( reference_type update ) const
+    {
+      const TagType t{} ;
+      const typename Policy::member_type e = m_policy.end();
+      for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        m_functor( t , i , update , true );
+      }
+    }
+
+public:
+
+  inline
+  void execute() const
+    {
+      const size_t pool_reduce_size = Analysis::value_size( m_functor );
+      const size_t team_reduce_size  = 0 ; // Never shrinks
+      const size_t team_shared_size  = 0 ; // Never shrinks
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      serial_resize_thread_team_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
+      HostThreadTeamData & data = *serial_get_thread_team_data();
+
+      reference_type update =
+        ValueInit::init( m_functor , pointer_type(data.pool_reduce_local()) );
+
+      this-> template exec< WorkTag >( update );
+    }
+
+  inline
+  ParallelScan( const FunctorType & arg_functor
+              , const Policy      & arg_policy
+              )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    {}
+};
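+
+// Usage sketch for the ParallelScan above (illustrative only; `n`, `x`, and
+// `y` are assumed caller-defined). The serial backend performs a single pass
+// with the final-pass flag set to true:
+//
+//   Kokkos::parallel_scan( Kokkos::RangePolicy< Kokkos::Serial >( 0 , n )
+//     , KOKKOS_LAMBDA( const int i , int & update , const bool final_pass ) {
+//         if ( final_pass ) { y(i) = update; }  // exclusive prefix sum
+//         update += x(i);
+//       } );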
+
+} // namespace Impl
+} // namespace Kokkos
+
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+/* Parallel patterns for Kokkos::Serial with MDRangePolicy */
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType ,
+                   Kokkos::MDRangePolicy< Traits ... > ,
+                   Kokkos::Serial
+                 >
+{
+private:
+
+  typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
+  typedef typename MDRangePolicy::impl_range_policy Policy ;
+
+  typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
+
+  const FunctorType   m_functor ;
+  const MDRangePolicy m_mdr_policy ;
+  const Policy        m_policy ;
+
+  void
+  exec() const
+    {
+      const typename Policy::member_type e = m_policy.end();
+      for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        iterate_type( m_mdr_policy, m_functor )( i );
+      }
+    }
+
+public:
+
+  inline
+  void execute() const
+    { this->exec(); }
+
+  inline
+  ParallelFor( const FunctorType   & arg_functor
+             , const MDRangePolicy & arg_policy )
+    : m_functor( arg_functor )
+    , m_mdr_policy(  arg_policy )
+    , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
+    {}
+};
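+
+// Usage sketch for the MDRangePolicy ParallelFor above (illustrative only;
+// `m`, `n`, and `A` are assumed caller-defined). Each tile of the MDRange is
+// iterated by HostIterateTile inside the sequential loop over tiles:
+//
+//   Kokkos::parallel_for(
+//       Kokkos::MDRangePolicy< Kokkos::Serial , Kokkos::Rank<2> >( {0,0} , {m,n} )
+//     , KOKKOS_LAMBDA( const int i , const int j ) { A(i,j) = 0; } );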
+
+
+template< class FunctorType , class ReducerType , class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::MDRangePolicy< Traits ... >
+                    , ReducerType
+                    , Kokkos::Serial
+                    >
+{
+private:
+
+  typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
+  typedef typename MDRangePolicy::impl_range_policy Policy ;
+
+  typedef typename MDRangePolicy::work_tag                                  WorkTag ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, WorkTag, void>::type WorkTagFwd;
+
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , MDRangePolicy , FunctorType > Analysis ;
+
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTagFwd >  ValueInit ;
+
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::value_type      value_type ;
+  typedef typename Analysis::reference_type  reference_type ;
+
+
+  using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
+                                                             , FunctorType
+                                                             , WorkTag
+                                                             , reference_type
+                                                             >;
+
+
+  const FunctorType   m_functor ;
+  const MDRangePolicy m_mdr_policy ;
+  const Policy        m_policy ;
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+
+  inline
+  void
+  exec( reference_type update ) const
+    {
+      const typename Policy::member_type e = m_policy.end();
+      for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        iterate_type( m_mdr_policy, m_functor, update )( i );
+      }
+    }
+
+public:
+
+  inline
+  void execute() const
+    {
+      const size_t pool_reduce_size =
+        Analysis::value_size( ReducerConditional::select(m_functor , m_reducer) );
+      const size_t team_reduce_size  = 0 ; // Never shrinks
+      const size_t team_shared_size  = 0 ; // Never shrinks
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      serial_resize_thread_team_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
+      HostThreadTeamData & data = *serial_get_thread_team_data();
+
+      pointer_type ptr =
+        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+
+      reference_type update =
+        ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      this-> exec( update );
+
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::
+        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
+    }
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const MDRangePolicy       & arg_policy ,
+                  const HostViewType & arg_result_view ,
+                  typename std::enable_if<
+                               Kokkos::is_view< HostViewType >::value &&
+                              !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_mdr_policy( arg_policy )
+    , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result_view.data() )
+    {
+      static_assert( Kokkos::is_view< HostViewType >::value
+        , "Kokkos::Serial reduce result must be a View" );
+
+      static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
+        , "Kokkos::Serial reduce result must be a View in HostSpace" );
+    }
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , MDRangePolicy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_mdr_policy(  arg_policy )
+    , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
+};
+
+
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+/* Parallel patterns for Kokkos::Serial with TeamPolicy */
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Properties >
+class ParallelFor< FunctorType
+                 , Kokkos::TeamPolicy< Properties ... >
+                 , Kokkos::Serial
+                 >
+{
+private:
+
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  typedef TeamPolicyInternal< Kokkos::Serial , Properties ...> Policy ;
+  typedef typename Policy::member_type                       Member ;
+
+  const FunctorType  m_functor ;
+  const int          m_league ;
+  const int          m_shared ;
+
+  template< class TagType >
+  inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec( HostThreadTeamData & data ) const
+    {
+      for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
+        m_functor( Member(data,ileague,m_league) );
+      }
+    }
+
+  template< class TagType >
+  inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec( HostThreadTeamData & data ) const
+    {
+      const TagType t{} ;
+      for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
+        m_functor( t , Member(data,ileague,m_league) );
+      }
+    }
+
+public:
+
+  inline
+  void execute() const
+    {
+      const size_t pool_reduce_size  = 0 ; // Never shrinks
+      const size_t team_reduce_size  = TEAM_REDUCE_SIZE ;
+      const size_t team_shared_size  = m_shared ;
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      serial_resize_thread_team_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
+      HostThreadTeamData & data = *serial_get_thread_team_data();
+
+      this->template exec< typename Policy::work_tag >( data );
+    }
+
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_league(  arg_policy.league_size() )
+    , m_shared( arg_policy.scratch_size(0) +
+                arg_policy.scratch_size(1) +
+                FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
+    { }
+};
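+
+// Usage sketch for the TeamPolicy ParallelFor above (illustrative only;
+// `league_size` is assumed caller-defined). With Kokkos::Serial the team size
+// is always 1, so each league iteration invokes the functor exactly once:
+//
+//   typedef Kokkos::TeamPolicy< Kokkos::Serial >::member_type member_type ;
+//   Kokkos::parallel_for( Kokkos::TeamPolicy< Kokkos::Serial >( league_size , 1 )
+//     , KOKKOS_LAMBDA( const member_type & team ) {
+//         const int ileague = team.league_rank();
+//         /* ... per-team work ... */
+//       } );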
+
+/*--------------------------------------------------------------------------*/
+
+template< class FunctorType , class ReducerType , class ... Properties >
+class ParallelReduce< FunctorType
+                    , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
+                    , Kokkos::Serial
+                    >
+{
+private:
+
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  typedef TeamPolicyInternal< Kokkos::Serial, Properties ... > Policy ;
+
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
+
+  typedef typename Policy::member_type                       Member ;
+  typedef typename Policy::work_tag                          WorkTag ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, WorkTag, void>::type WorkTagFwd;
+
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTagFwd >  ValueInit ;
+
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
+
+  const FunctorType  m_functor ;
+  const int          m_league ;
+  const ReducerType  m_reducer ;
+        pointer_type m_result_ptr ;
+  const int          m_shared ;
+
+  template< class TagType >
+  inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec( HostThreadTeamData & data , reference_type update ) const
+    {
+      for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
+        m_functor( Member(data,ileague,m_league) , update );
+      }
+    }
+
+  template< class TagType >
+  inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec( HostThreadTeamData & data , reference_type update ) const
+    {
+      const TagType t{} ;
+
+      for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
+        m_functor( t , Member(data,ileague,m_league) , update );
+      }
+    }
+
+public:
+
+  inline
+  void execute() const
+    {
+      const size_t pool_reduce_size  =
+        Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
+
+      const size_t team_reduce_size  = TEAM_REDUCE_SIZE ;
+      const size_t team_shared_size  = m_shared ;
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      serial_resize_thread_team_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
+
+      HostThreadTeamData & data = *serial_get_thread_team_data();
+
+      pointer_type ptr =
+        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+
+      reference_type update =
+        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      this-> template exec< WorkTag >( data , update );
+
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::
+        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
+    }
+
+  template< class ViewType >
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const ViewType     & arg_result ,
+                typename std::enable_if<
+                  Kokkos::is_view< ViewType >::value &&
+                  !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_league( arg_policy.league_size() )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result.data() )
+    , m_shared( arg_policy.scratch_size(0) +
+                arg_policy.scratch_size(1) +
+                FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
+    {
+      static_assert( Kokkos::is_view< ViewType >::value
+        , "Reduction result on Kokkos::Serial must be a Kokkos::View" );
+
+      static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
+    }
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_league(  arg_policy.league_size() )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.view().data() )
+    , m_shared( arg_policy.scratch_size(0) +
+                arg_policy.scratch_size(1) +
+                FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
+  {
+  /*static_assert( std::is_same< typename ViewType::memory_space
+                          , Kokkos::HostSpace >::value
+  , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+  }
+
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos { namespace Experimental {
+
+template<>
+class UniqueToken< Serial, UniqueTokenScope::Instance>
+{
+public:
+  using execution_space = Serial;
+  using size_type       = int;
+
+  /// \brief Create a token object sized for the concurrency of the given instance.
+  ///
+  /// This object should not be shared between instances.
+  UniqueToken( execution_space const& = execution_space() ) noexcept {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  inline
+  int size() const noexcept { return 1; }
+
+  /// \brief acquire value such that 0 <= value < size()
+  inline
+  int acquire() const  noexcept { return 0; }
+
+  /// \brief Release a value previously obtained from acquire()
+  inline
+  void release( int ) const noexcept {}
+};
+
+template<>
+class UniqueToken< Serial, UniqueTokenScope::Global>
+{
+public:
+  using execution_space = Serial;
+  using size_type       = int;
+
+  /// \brief Create a token object sized for the concurrency of the given instance.
+  ///
+  /// This object should not be shared between instances.
+  UniqueToken( execution_space const& = execution_space() ) noexcept {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  inline
+  int size() const noexcept { return 1; }
+
+  /// \brief acquire value such that 0 <= value < size()
+  inline
+  int acquire() const  noexcept { return 0; }
+
+  /// \brief Release a value previously obtained from acquire()
+  inline
+  void release( int ) const noexcept {}
+};
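+
+// Usage sketch for the UniqueToken specializations above (illustrative only).
+// On Kokkos::Serial both scopes expose a single token, so acquire() is always 0:
+//
+//   Kokkos::Experimental::UniqueToken< Kokkos::Serial > token ;
+//   const int id = token.acquire();   // 0 on Kokkos::Serial
+//   /* ... index a per-thread resource with id ... */
+//   token.release( id );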
+
+}} // namespace Kokkos::Experimental
+
+#include <impl/Kokkos_Serial_Task.hpp>
+
+#endif // defined( KOKKOS_ENABLE_SERIAL )
+#endif /* #define KOKKOS_SERIAL_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_TaskPolicy.hpp b/packages/kokkos/core/src/Kokkos_TaskPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..00bceec2b5dba1f5d664831d3f7c86b625b6f4c3
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_TaskPolicy.hpp
@@ -0,0 +1,47 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// For backward compatibility:
+
+#include <Kokkos_TaskScheduler.hpp>
+
diff --git a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c3185853dc693a7263e1b9e649aad859f78e1121
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
@@ -0,0 +1,993 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TASKSCHEDULER_HPP
+#define KOKKOS_TASKSCHEDULER_HPP
+
+//----------------------------------------------------------------------------
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_TASKDAG )
+
+#include <Kokkos_Core_fwd.hpp>
+//----------------------------------------------------------------------------
+
+#include <Kokkos_MemoryPool.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+// Forward declarations used in Impl::TaskQueue
+
+template< typename Arg1 = void , typename Arg2 = void >
+class Future ;
+
+template< typename Space >
+class TaskScheduler ;
+
+template< typename Space >
+void wait( TaskScheduler< Space > const & );
+
+template< typename Space >
+struct is_scheduler : public std::false_type {};
+
+template< typename Space >
+struct is_scheduler< TaskScheduler< Space > > : public std::true_type {};
+
+} // namespace Kokkos
+
+#include <impl/Kokkos_TaskQueue.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/*\brief  Implementation data for task data management, access, and execution.
+ *
+ *  CRTP Inheritance structure to allow static_cast from the
+ *  task root type and a task's FunctorType.
+ *
+ *    TaskBase< Space , ResultType , FunctorType >
+ *      : TaskBase< Space , ResultType , void >
+ *      , FunctorType
+ *      { ... };
+ *
+ *    TaskBase< Space , ResultType , void >
+ *      : TaskBase< Space , void , void >
+ *      { ... };
+ */
+template< typename Space , typename ResultType , typename FunctorType >
+class TaskBase ;
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/**
+ *  Argument conventions for Future:
+ *
+ *  Future< space >  // value_type == void
+ *  Future< value >  // space == DefaultExecutionSpace
+ *  Future< value , space >
+ */
+template< typename Arg1 , typename Arg2 >
+class Future {
+private:
+
+  template< typename > friend class TaskScheduler ;
+  template< typename , typename > friend class Future ;
+  template< typename , typename , typename > friend class Impl::TaskBase ;
+
+  enum { Arg1_is_space  = Kokkos::is_space< Arg1 >::value };
+  enum { Arg2_is_space  = Kokkos::is_space< Arg2 >::value };
+  enum { Arg1_is_value  = ! Arg1_is_space &&
+                          ! std::is_same< Arg1 , void >::value };
+  enum { Arg2_is_value  = ! Arg2_is_space &&
+                          ! std::is_same< Arg2 , void >::value };
+
+  static_assert( ! ( Arg1_is_space && Arg2_is_space )
+               , "Future cannot be given two spaces" );
+
+  static_assert( ! ( Arg1_is_value && Arg2_is_value )
+               , "Future cannot be given two value types" );
+
+  using ValueType =
+    typename std::conditional< Arg1_is_value , Arg1 ,
+    typename std::conditional< Arg2_is_value , Arg2 , void
+    >::type >::type ;
+
+  using Space =
+    typename std::conditional< Arg1_is_space , Arg1 ,
+    typename std::conditional< Arg2_is_space , Arg2 , void
+    >::type >::type ;
+
+  using task_base  = Impl::TaskBase< void , void , void > ;
+  using queue_type = Impl::TaskQueue< Space > ;
+
+  task_base * m_task ;
+
+  KOKKOS_INLINE_FUNCTION explicit
+  Future( task_base * task ) : m_task(0)
+    { if ( task ) queue_type::assign( & m_task , task ); }
+
+  //----------------------------------------
+
+public:
+
+  using execution_space = typename Space::execution_space ;
+  using value_type      = ValueType ;
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  bool is_null() const { return 0 == m_task ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int reference_count() const
+    { return 0 != m_task ? m_task->reference_count() : 0 ; }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  void clear()
+    { if ( m_task ) queue_type::assign( & m_task , (task_base*)0 ); }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  ~Future() { clear(); }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr Future() noexcept : m_task(0) {}
+
+  KOKKOS_INLINE_FUNCTION
+  Future( Future && rhs )
+    : m_task( rhs.m_task ) { rhs.m_task = 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  Future( const Future & rhs )
+    : m_task(0)
+    { if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); }
+
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( Future && rhs )
+    {
+      clear();
+      m_task = rhs.m_task ;
+      rhs.m_task = 0 ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( const Future & rhs )
+    {
+      if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
+      return *this ;
+    }
+
+  //----------------------------------------
+
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future( Future<A1,A2> && rhs )
+    : m_task( rhs.m_task )
+    {
+      static_assert
+        ( std::is_same< Space , void >::value ||
+          std::is_same< Space , typename Future<A1,A2>::Space >::value
+        , "Assigned Futures must have the same space" );
+
+      static_assert
+        ( std::is_same< value_type , void >::value ||
+          std::is_same< value_type , typename Future<A1,A2>::value_type >::value
+        , "Assigned Futures must have the same value_type" );
+
+      rhs.m_task = 0 ;
+    }
+
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future( const Future<A1,A2> & rhs )
+    : m_task(0)
+    {
+      static_assert
+        ( std::is_same< Space , void >::value ||
+          std::is_same< Space , typename Future<A1,A2>::Space >::value
+        , "Assigned Futures must have the same space" );
+
+      static_assert
+        ( std::is_same< value_type , void >::value ||
+          std::is_same< value_type , typename Future<A1,A2>::value_type >::value
+        , "Assigned Futures must have the same value_type" );
+
+      if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
+    }
+
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( const Future<A1,A2> & rhs )
+    {
+      static_assert
+        ( std::is_same< Space , void >::value ||
+          std::is_same< Space , typename Future<A1,A2>::Space >::value
+        , "Assigned Futures must have the same space" );
+
+      static_assert
+        ( std::is_same< value_type , void >::value ||
+          std::is_same< value_type , typename Future<A1,A2>::value_type >::value
+        , "Assigned Futures must have the same value_type" );
+
+      if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
+      return *this ;
+    }
+
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( Future<A1,A2> && rhs )
+    {
+      static_assert
+        ( std::is_same< Space , void >::value ||
+          std::is_same< Space , typename Future<A1,A2>::Space >::value
+        , "Assigned Futures must have the same space" );
+
+      static_assert
+        ( std::is_same< value_type , void >::value ||
+          std::is_same< value_type , typename Future<A1,A2>::value_type >::value
+        , "Assigned Futures must have the same value_type" );
+
+      clear();
+      m_task = rhs.m_task ;
+      rhs.m_task = 0 ;
+      return *this ;
+    }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  int is_ready() const noexcept
+    { return ( 0 == m_task ) || ( ((task_base*) task_base::LockTag) == m_task->m_wait ); }
+
+  KOKKOS_INLINE_FUNCTION
+  const typename Impl::TaskResult< ValueType >::reference_type
+  get() const
+    {
+      if ( 0 == m_task ) {
+        Kokkos::abort( "Kokkos::Future::get ERROR: is_null()");
+      }
+      return Impl::TaskResult< ValueType >::get( m_task );
+    }
+};
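+
+// Usage sketch for Future (illustrative only; `scheduler` is an existing
+// TaskScheduler, `Space` stands for its execution space, and `MyTask` is an
+// assumed task functor whose value_type is long):
+//
+//   Kokkos::Future< long , Space > f =
+//     Kokkos::host_spawn( Kokkos::TaskSingle( scheduler ) , MyTask() );
+//   Kokkos::wait( scheduler );
+//   if ( ! f.is_null() && f.is_ready() ) { const long v = f.get(); }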
+
+// Is a Future with the given execution space
+template< typename , typename ExecSpace = void >
+struct is_future : public std::false_type {};
+
+template< typename Arg1 , typename Arg2 , typename ExecSpace >
+struct is_future< Future<Arg1,Arg2> , ExecSpace >
+  : public std::integral_constant
+      < bool ,
+      ( std::is_same< ExecSpace , void >::value ||
+        std::is_same< ExecSpace
+                    , typename Future<Arg1,Arg2>::execution_space >::value )
+      > {};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+enum class TaskPriority : int { High    = 0
+                              , Regular = 1
+                              , Low     = 2 };
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< int TaskEnum , typename DepFutureType >
+struct TaskPolicyData
+{
+  using execution_space = typename DepFutureType::execution_space ;
+  using scheduler_type  = TaskScheduler< execution_space > ;
+
+  enum : int { m_task_type = TaskEnum };
+
+  scheduler_type const * m_scheduler ;
+  DepFutureType  const   m_dependence ;
+  int                    m_priority ;
+
+  TaskPolicyData() = delete ;
+  TaskPolicyData( TaskPolicyData && ) = default ;
+  TaskPolicyData( TaskPolicyData const & ) = default ;
+  TaskPolicyData & operator = ( TaskPolicyData && ) = default ;
+  TaskPolicyData & operator = ( TaskPolicyData const & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicyData( DepFutureType        const & arg_future
+                , Kokkos::TaskPriority const & arg_priority )
+    : m_scheduler( 0 )
+    , m_dependence( arg_future )
+    , m_priority( static_cast<int>( arg_priority ) )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicyData( scheduler_type       const & arg_scheduler
+                , Kokkos::TaskPriority const & arg_priority )
+    : m_scheduler( & arg_scheduler )
+    , m_dependence()
+    , m_priority( static_cast<int>( arg_priority ) )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicyData( scheduler_type       const & arg_scheduler
+                , DepFutureType        const & arg_future
+                , Kokkos::TaskPriority const & arg_priority )
+    : m_scheduler( & arg_scheduler )
+    , m_dependence( arg_future )
+    , m_priority( static_cast<int>( arg_priority ) )
+    {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< typename ExecSpace >
+class TaskScheduler
+{
+private:
+
+  using track_type = Kokkos::Impl::SharedAllocationTracker ;
+  using queue_type = Kokkos::Impl::TaskQueue< ExecSpace > ;
+  using task_base  = Impl::TaskBase< void , void , void > ;
+
+  track_type   m_track ;
+  queue_type * m_queue ;
+
+  //----------------------------------------
+
+public:
+
+  using execution_space  = ExecSpace ;
+  using memory_space     = typename queue_type::memory_space ;
+  using memory_pool      = typename queue_type::memory_pool ;
+  using member_type      =
+    typename Kokkos::Impl::TaskQueueSpecialization< ExecSpace >::member_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskScheduler() : m_track(), m_queue(0) {}
+
+  KOKKOS_INLINE_FUNCTION
+  TaskScheduler( TaskScheduler && rhs )
+    : m_track( rhs.m_track ), m_queue( rhs.m_queue ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  TaskScheduler( TaskScheduler const & rhs )
+    : m_track( rhs.m_track ), m_queue( rhs.m_queue ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  TaskScheduler & operator = ( TaskScheduler && rhs )
+    { m_track = rhs.m_track ; m_queue =  rhs.m_queue ; return *this ; }
+
+  KOKKOS_INLINE_FUNCTION
+  TaskScheduler & operator = ( TaskScheduler const & rhs )
+    { m_track = rhs.m_track ; m_queue =  rhs.m_queue ; return *this ; }
+
+  TaskScheduler( memory_pool const & arg_memory_pool )
+    : m_track()
+    , m_queue(0)
+    {
+      typedef Kokkos::Impl::SharedAllocationRecord
+        < memory_space , typename queue_type::Destroy >
+          record_type ;
+
+      record_type * record =
+        record_type::allocate( memory_space()
+                             , "TaskQueue"
+                             , sizeof(queue_type)
+                             );
+
+      m_queue = new( record->data() ) queue_type( arg_memory_pool );
+
+      record->m_destroy.m_queue = m_queue ;
+
+      m_track.assign_allocated_record_to_uninitialized( record );
+    }
+
+  TaskScheduler( memory_space const & arg_memory_space
+               , size_t const mempool_capacity
+               , unsigned const mempool_min_block_size  // = 1u << 6
+               , unsigned const mempool_max_block_size  // = 1u << 10
+               , unsigned const mempool_superblock_size // = 1u << 12
+               )
+    : TaskScheduler( memory_pool( arg_memory_space
+                                , mempool_capacity
+                                , mempool_min_block_size
+                                , mempool_max_block_size
+                                , mempool_superblock_size ) )
+    {}
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  memory_pool * memory() const noexcept
+    { return m_queue ? &( m_queue->m_memory ) : (memory_pool*) 0 ; }
+
+  //----------------------------------------
+  /**\brief  Allocation size for a spawned task */
+  template< typename FunctorType >
+  KOKKOS_FUNCTION
+  size_t spawn_allocation_size() const
+    { return m_queue->template spawn_allocation_size< FunctorType >(); }
+
+  /**\brief  Allocation size for a when_all aggregate */
+  KOKKOS_FUNCTION
+  size_t when_all_allocation_size( int narg ) const
+    { return m_queue->when_all_allocation_size( narg ); }
+
+  //----------------------------------------
+
+  template< int TaskEnum , typename DepFutureType , typename FunctorType >
+  KOKKOS_FUNCTION static
+  Kokkos::Future< typename FunctorType::value_type , execution_space >
+  spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
+       , typename task_base::function_type                    arg_function
+       , FunctorType                                       && arg_functor
+       )
+    {
+      using value_type  = typename FunctorType::value_type ;
+      using future_type = Future< value_type , execution_space > ;
+      using task_type   = Impl::TaskBase< execution_space
+                                        , value_type
+                                        , FunctorType > ;
+
+      queue_type * const queue =
+        arg_policy.m_scheduler ? arg_policy.m_scheduler->m_queue : (
+        arg_policy.m_dependence.m_task
+          ? static_cast<queue_type*>(arg_policy.m_dependence.m_task->m_queue)
+          : (queue_type*) 0 );
+
+      if ( 0 == queue ) {
+        Kokkos::abort("Kokkos spawn requires scheduler or non-null Future");
+      }
+
+      if ( arg_policy.m_dependence.m_task != 0 &&
+           arg_policy.m_dependence.m_task->m_queue != queue ) {
+        Kokkos::abort("Kokkos spawn given incompatible scheduler and Future");
+      }
+
+      //----------------------------------------
+      // Give single-thread back-ends an opportunity to clear
+      // queue of ready tasks before allocating a new task
+
+      queue->iff_single_thread_recursive_execute();
+
+      //----------------------------------------
+
+      future_type f ;
+
+      // Allocate task from memory pool
+
+      const size_t alloc_size =
+        queue->template spawn_allocation_size< FunctorType >();
+
+      f.m_task =
+        reinterpret_cast< task_type * >(queue->allocate(alloc_size) );
+
+      if ( f.m_task ) {
+
+        // Placement new construction
+        // Reference count starts at two:
+        //   +1 for the matching decrement when task is complete
+        //   +1 for the future
+        new ( f.m_task ) task_type( std::move(arg_functor) );
+
+        f.m_task->m_apply      = arg_function ;
+        f.m_task->m_queue      = queue ;
+        f.m_task->m_next       = arg_policy.m_dependence.m_task ;
+        f.m_task->m_ref_count  = 2 ;
+        f.m_task->m_alloc_size = alloc_size ;
+        f.m_task->m_task_type  = arg_policy.m_task_type ;
+        f.m_task->m_priority   = arg_policy.m_priority ;
+
+        Kokkos::memory_fence();
+
+        // The dependence (if any) is processed immediately
+        // within the schedule function, as such the dependence's
+        // reference count does not need to be incremented for
+        // the assignment.
+
+        queue->schedule_runnable( f.m_task );
+        // This task may be updated or executed at any moment,
+        // even during the call to 'schedule'.
+      }
+
+      return f ;
+    }
+
+  template< typename FunctorType , typename A1 , typename A2 >
+  KOKKOS_FUNCTION static
+  void
+  respawn( FunctorType         * arg_self
+         , Future<A1,A2> const & arg_dependence
+         , TaskPriority  const & arg_priority
+         )
+    {
+      // Precondition: task is in Executing state
+
+      using value_type  = typename FunctorType::value_type ;
+      using task_type   = Impl::TaskBase< execution_space
+                                        , value_type
+                                        , FunctorType > ;
+
+      task_type * const task = static_cast< task_type * >( arg_self );
+
+      task->m_priority = static_cast<int>(arg_priority);
+
+      task->add_dependence( arg_dependence.m_task );
+
+      // Postcondition: task is in Executing-Respawn state
+    }
+
+  template< typename FunctorType >
+  KOKKOS_FUNCTION static
+  void
+  respawn( FunctorType         * arg_self
+         , TaskScheduler const &
+         , TaskPriority  const & arg_priority
+         )
+    {
+      // Precondition: task is in Executing state
+
+      using value_type  = typename FunctorType::value_type ;
+      using task_type   = Impl::TaskBase< execution_space
+                                        , value_type
+                                        , FunctorType > ;
+
+      task_type * const task = static_cast< task_type * >( arg_self );
+
+      task->m_priority = static_cast<int>(arg_priority);
+
+      task->add_dependence( (task_base*) 0 );
+
+      // Postcondition: task is in Executing-Respawn state
+    }
+
+  //----------------------------------------
+  /**\brief  Return a future that is complete
+   *         when all input futures are complete.
+   */
+  template< typename A1 , typename A2 >
+  KOKKOS_FUNCTION static
+  Future< execution_space >
+  when_all( Future< A1 , A2 > const arg[] , int narg )
+    {
+      using future_type = Future< execution_space > ;
+      using task_base   = Kokkos::Impl::TaskBase< void , void , void > ;
+
+      future_type f ;
+
+      if ( narg ) {
+
+        queue_type * queue = 0 ;
+
+        for ( int i = 0 ; i < narg ; ++i ) {
+          task_base * const t = arg[i].m_task ;
+          if ( 0 != t ) {
+            // Increment reference count to track subsequent assignment.
+            Kokkos::atomic_increment( &(t->m_ref_count) );
+            if ( queue == 0 ) {
+              queue = static_cast< queue_type * >( t->m_queue );
+            }
+            else if ( queue != static_cast< queue_type * >( t->m_queue ) ) {
+              Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" );
+            }
+          }
+        }
+
+        if ( queue != 0 ) {
+
+          size_t const alloc_size = queue->when_all_allocation_size( narg );
+
+          f.m_task =
+            reinterpret_cast< task_base * >( queue->allocate( alloc_size ) );
+
+          if ( f.m_task ) {
+
+            // Reference count starts at two:
+            // +1 to match decrement when task completes
+            // +1 for the future
+
+            new( f.m_task ) task_base();
+
+            f.m_task->m_queue      = queue ;
+            f.m_task->m_ref_count  = 2 ;
+            f.m_task->m_alloc_size = alloc_size ;
+            f.m_task->m_dep_count  = narg ;
+            f.m_task->m_task_type  = task_base::Aggregate ;
+
+            // Assign dependences, reference counts were already incremented
+
+            task_base * volatile * const dep =
+              f.m_task->aggregate_dependences();
+
+            for ( int i = 0 ; i < narg ; ++i ) { dep[i] = arg[i].m_task ; }
+
+            Kokkos::memory_fence();
+
+            queue->schedule_aggregate( f.m_task );
+            // this when_all may be processed at any moment
+          }
+        }
+      }
+
+      return f ;
+    }
+
+  template < class F >
+  KOKKOS_FUNCTION
+  Future< execution_space >
+  when_all( int narg , F const func )
+    {
+      using input_type  = decltype( func(0) );
+      using future_type = Future< execution_space > ;
+      using task_base   = Kokkos::Impl::TaskBase< void , void , void > ;
+
+      static_assert( is_future< input_type >::value
+                   , "Functor must return a Kokkos::Future" );
+
+      future_type f ;
+
+      if ( 0 == narg ) return f ;
+
+      size_t const alloc_size = m_queue->when_all_allocation_size( narg );
+
+      f.m_task =
+        reinterpret_cast< task_base * >( m_queue->allocate( alloc_size ) );
+
+      if ( f.m_task ) {
+
+        // Reference count starts at two:
+        // +1 to match decrement when task completes
+        // +1 for the future
+
+        new( f.m_task ) task_base();
+
+        f.m_task->m_queue      = m_queue ;
+        f.m_task->m_ref_count  = 2 ;
+        f.m_task->m_alloc_size = alloc_size ;
+        f.m_task->m_dep_count  = narg ;
+        f.m_task->m_task_type  = task_base::Aggregate ;
+
+        // Assign dependences, reference counts were already incremented
+
+        task_base * volatile * const dep =
+          f.m_task->aggregate_dependences();
+
+        for ( int i = 0 ; i < narg ; ++i ) {
+          const input_type arg_f = func(i);
+          if ( 0 != arg_f.m_task ) {
+
+            if ( m_queue != static_cast< queue_type * >( arg_f.m_task->m_queue ) ) {
+              Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" );
+            }
+            // Increment reference count to track subsequent assignment.
+            Kokkos::atomic_increment( &(arg_f.m_task->m_ref_count) );
+            dep[i] = arg_f.m_task ;
+          }
+        }
+
+        Kokkos::memory_fence();
+
+        m_queue->schedule_aggregate( f.m_task );
+        // this when_all may be processed at any moment
+      }
+      return f ;
+    }
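+
+  // Usage sketch for when_all (illustrative only; `f0` and `f1` are assumed
+  // Futures obtained from tasks spawned on this scheduler):
+  //
+  //   Future< execution_space > deps[2] = { f0 , f1 };
+  //   Future< execution_space > all = scheduler.when_all( deps , 2 );
+  //
+  // The aggregate completes once every non-null input future has completed.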
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  int allocation_capacity() const noexcept
+    { return m_queue->m_memory.capacity(); }
+
+  KOKKOS_INLINE_FUNCTION
+  int allocated_task_count() const noexcept
+    { return m_queue->m_count_alloc ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int allocated_task_count_max() const noexcept
+    { return m_queue->m_max_alloc ; }
+
+  KOKKOS_INLINE_FUNCTION
+  long allocated_task_count_accum() const noexcept
+    { return m_queue->m_accum_alloc ; }
+
+  //----------------------------------------
+
+  template< typename S >
+  friend
+  void Kokkos::wait( Kokkos::TaskScheduler< S > const & );
+
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+// Construct a TaskTeam execution policy
+
+template< typename T >
+Kokkos::Impl::TaskPolicyData
+  < Kokkos::Impl::TaskBase<void,void,void>::TaskTeam
+  , typename std::conditional< Kokkos::is_future< T >::value , T ,
+    typename Kokkos::Future< typename T::execution_space > >::type
+  >
+KOKKOS_INLINE_FUNCTION
+TaskTeam( T            const & arg
+        , TaskPriority const & arg_priority = TaskPriority::Regular
+        )
+{
+  static_assert( Kokkos::is_future<T>::value ||
+                 Kokkos::is_scheduler<T>::value
+               , "Kokkos TaskTeam argument must be Future or TaskScheduler" );
+
+  return
+    Kokkos::Impl::TaskPolicyData
+      < Kokkos::Impl::TaskBase<void,void,void>::TaskTeam
+      , typename std::conditional< Kokkos::is_future< T >::value , T ,
+        typename Kokkos::Future< typename T::execution_space > >::type
+      >( arg , arg_priority );
+}
+
+template< typename E , typename F >
+Kokkos::Impl::
+  TaskPolicyData< Kokkos::Impl::TaskBase<void,void,void>::TaskTeam , F >
+KOKKOS_INLINE_FUNCTION
+TaskTeam( TaskScheduler<E> const & arg_scheduler
+        , F                const & arg_future
+        , typename std::enable_if< Kokkos::is_future<F>::value ,
+            TaskPriority >::type const & arg_priority = TaskPriority::Regular
+        )
+{
+  return
+    Kokkos::Impl::TaskPolicyData
+      < Kokkos::Impl::TaskBase<void,void,void>::TaskTeam , F >
+        ( arg_scheduler , arg_future , arg_priority );
+}
+
+// Construct a TaskSingle execution policy
+
+template< typename T >
+Kokkos::Impl::TaskPolicyData
+  < Kokkos::Impl::TaskBase<void,void,void>::TaskSingle
+  , typename std::conditional< Kokkos::is_future< T >::value , T ,
+    typename Kokkos::Future< typename T::execution_space > >::type
+  >
+KOKKOS_INLINE_FUNCTION
+TaskSingle( T            const & arg
+          , TaskPriority const & arg_priority = TaskPriority::Regular
+          )
+{
+  static_assert( Kokkos::is_future<T>::value ||
+                 Kokkos::is_scheduler<T>::value
+               , "Kokkos TaskSingle argument must be Future or TaskScheduler" );
+
+  return
+    Kokkos::Impl::TaskPolicyData
+      < Kokkos::Impl::TaskBase<void,void,void>::TaskSingle
+      , typename std::conditional< Kokkos::is_future< T >::value , T ,
+        typename Kokkos::Future< typename T::execution_space > >::type
+      >( arg , arg_priority );
+}
+
+template< typename E , typename F >
+Kokkos::Impl::
+  TaskPolicyData< Kokkos::Impl::TaskBase<void,void,void>::TaskSingle , F >
+KOKKOS_INLINE_FUNCTION
+TaskSingle( TaskScheduler<E> const & arg_scheduler
+          , F                const & arg_future
+          , typename std::enable_if< Kokkos::is_future<F>::value ,
+              TaskPriority >::type const & arg_priority = TaskPriority::Regular
+          )
+{
+  return
+    Kokkos::Impl::TaskPolicyData
+      < Kokkos::Impl::TaskBase<void,void,void>::TaskSingle , F >
+        ( arg_scheduler , arg_future , arg_priority );
+}
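+
+// A minimal usage sketch (illustrative only; 'sched' is an already constructed
+// TaskScheduler and 'f' a Future obtained from a prior spawn, both assumed here):
+//
+//   // Policy anchored on the scheduler: no dependence, default priority.
+//   auto single_policy = Kokkos::TaskSingle( sched );
+//
+//   // Policy anchored on a future: the spawned task will wait on 'f'.
+//   auto team_policy = Kokkos::TaskTeam( f , Kokkos::TaskPriority::High );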
+
+//----------------------------------------------------------------------------
+
+/**\brief  A host control thread spawns a task with options
+ *
+ *  1) Team or Serial
+ *  2) With scheduler or dependence
+ *  3) High, Normal, or Low priority
+ */
+template< int TaskEnum
+        , typename DepFutureType
+        , typename FunctorType >
+Future< typename FunctorType::value_type
+      , typename DepFutureType::execution_space >
+host_spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
+          , FunctorType                                       && arg_functor
+          )
+{
+  using exec_space = typename DepFutureType::execution_space ;
+  using scheduler  = TaskScheduler< exec_space > ;
+
+  typedef Impl::TaskBase< exec_space
+                        , typename FunctorType::value_type
+                        , FunctorType
+                        > task_type ;
+
+  static_assert( TaskEnum == task_type::TaskTeam ||
+                 TaskEnum == task_type::TaskSingle
+               , "Kokkos host_spawn requires TaskTeam or TaskSingle" );
+
+  // May be spawning a Cuda task, must use the specialization
+  // to query on-device function pointer.
+  typename task_type::function_type const ptr =
+    Kokkos::Impl::TaskQueueSpecialization< exec_space >::
+      template get_function_pointer< task_type >();
+
+  return scheduler::spawn( arg_policy , ptr , std::move(arg_functor) );
+}
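+
+// Sketch of spawning from host code (illustrative only; the functor type
+// 'MyTask' and the scheduler 'sched' are assumptions, not part of this header):
+//
+//   // Spawn a single-thread task with no dependence ...
+//   auto fa = Kokkos::host_spawn( Kokkos::TaskSingle( sched ) , MyTask() );
+//
+//   // ... and a team-level task that becomes runnable only after 'fa' completes.
+//   auto fb = Kokkos::host_spawn( Kokkos::TaskTeam( fa ) , MyTask() );
+//
+//   Kokkos::wait( sched );  // execute all runnable tasks to completion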
+
+/**\brief  A task spawns a task with options
+ *
+ *  1) Team or Serial
+ *  2) With scheduler or dependence
+ *  3) High, Normal, or Low priority
+ */
+template< int TaskEnum
+        , typename DepFutureType
+        , typename FunctorType >
+Future< typename FunctorType::value_type
+      , typename DepFutureType::execution_space >
+KOKKOS_INLINE_FUNCTION
+task_spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
+          , FunctorType                                       && arg_functor
+          )
+{
+  using exec_space = typename DepFutureType::execution_space ;
+  using scheduler  = TaskScheduler< exec_space > ;
+
+  typedef Impl::TaskBase< exec_space
+                        , typename FunctorType::value_type
+                        , FunctorType
+                        > task_type ;
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) && \
+    defined( KOKKOS_ENABLE_CUDA )
+
+  static_assert( ! std::is_same< Kokkos::Cuda , exec_space >::value
+               , "Error calling Kokkos::task_spawn for Cuda space within Host code" );
+
+#endif
+
+  static_assert( TaskEnum == task_type::TaskTeam ||
+                 TaskEnum == task_type::TaskSingle
+               , "Kokkos task_spawn requires TaskTeam or TaskSingle" );
+
+  typename task_type::function_type const ptr = task_type::apply ;
+
+  return scheduler::spawn( arg_policy , ptr , std::move(arg_functor) );
+}
+
+/**\brief  A task respawns itself with options
+ *
+ *  1) With scheduler or dependence
+ *  2) High, Normal, or Low priority
+ */
+template< typename FunctorType , typename T >
+void
+KOKKOS_INLINE_FUNCTION
+respawn( FunctorType         * arg_self
+       , T             const & arg
+       , TaskPriority  const & arg_priority = TaskPriority::Regular
+       )
+{
+  static_assert( Kokkos::is_future<T>::value ||
+                 Kokkos::is_scheduler<T>::value
+               , "Kokkos respawn argument must be Future or TaskScheduler" );
+
+  TaskScheduler< typename T::execution_space >::
+    respawn( arg_self , arg , arg_priority );
+}
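+
+// Sketch of use inside a running task's operator() (illustrative only; the
+// enclosing functor, its scheduler member 'm_sched', and the child functor
+// 'ChildTask' are assumptions):
+//
+//   KOKKOS_INLINE_FUNCTION
+//   void operator()( member_type & member , double & result )
+//   {
+//     // Spawn a child task from device/task code ...
+//     auto child = Kokkos::task_spawn( Kokkos::TaskSingle( m_sched ) , ChildTask() );
+//
+//     // ... then respawn this task so it runs again after the child completes.
+//     Kokkos::respawn( this , child );
+//   }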
+
+//----------------------------------------------------------------------------
+
+template< typename A1 , typename A2 >
+KOKKOS_INLINE_FUNCTION
+Future< typename Future< A1 , A2 >::execution_space >
+when_all( Future< A1 , A2 > const arg[]
+        , int                     narg
+        )
+{
+  return TaskScheduler< typename Future<A1,A2>::execution_space >::
+    when_all( arg , narg );
+}
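+
+// Sketch: aggregate several futures into a single dependence (illustrative
+// only; 'fa' and 'fb' are futures from earlier spawns and 'sched' their
+// scheduler, all assumed):
+//
+//   Kokkos::Future< Kokkos::DefaultExecutionSpace > deps[2] = { fa , fb };
+//   auto all  = Kokkos::when_all( deps , 2 );
+//   auto next = Kokkos::host_spawn( Kokkos::TaskSingle( all ) , MyTask() );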
+
+//----------------------------------------------------------------------------
+// Wait for all runnable tasks to complete
+
+template< typename ExecSpace >
+inline
+void wait( TaskScheduler< ExecSpace > const & scheduler )
+{ scheduler.m_queue->execute(); }
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
+#endif /* #ifndef KOKKOS_TASKSCHEDULER_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_Threads.hpp b/packages/kokkos/core/src/Kokkos_Threads.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b135022deb71783e378540129ebf894e27880a5a
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Threads.hpp
@@ -0,0 +1,234 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADS_HPP
+#define KOKKOS_THREADS_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_THREADS )
+
+#include <Kokkos_Core_fwd.hpp>
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+class ThreadsExec ;
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Execution space for a pool of Pthreads or C11 threads on a CPU. */
+class Threads {
+public:
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+  //! Tag this class as a Kokkos execution space
+  typedef Threads                  execution_space ;
+  typedef Kokkos::HostSpace        memory_space ;
+
+  //! This execution space preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  typedef Kokkos::LayoutRight      array_layout ;
+  typedef memory_space::size_type  size_type ;
+
+  typedef ScratchMemorySpace< Threads >  scratch_memory_space ;
+
+
+  //@}
+  /*------------------------------------------------------------------------*/
+  //! \name Static functions that all Kokkos devices must implement.
+  //@{
+
+  /// \brief True if and only if this method is being called in a
+  ///   thread-parallel function.
+  static int in_parallel();
+
+  /** \brief  Set the device in a "sleep" state.
+   *
+   * This function sets the device in a "sleep" state in which it is
+   * not ready for work.  This may consume less resources than if the
+   * device were in an "awake" state, but it may also take time to
+   * bring the device from a sleep state to be ready for work.
+   *
+   * \return True if the device is in the "sleep" state, else false if
+   *   the device is actively working and could not enter the "sleep"
+   *   state.
+   */
+  static bool sleep();
+
+  /// \brief Wake the device from the 'sleep' state so it is ready for work.
+  ///
+  /// \return True if the device is in the "ready" state, else false
+  ///  if the device is actively working (which also means that it is
+  ///  awake).
+  static bool wake();
+
+  /// \brief Wait until all dispatched functors complete.
+  ///
+  /// The parallel_for or parallel_reduce dispatch of a functor may
+  /// return asynchronously, before the functor completes.  This
+  /// method does not return until all dispatched functors on this
+  /// device have completed.
+  static void fence();
+
+  /// \brief Free any resources being consumed by the device.
+  ///
+  /// For the Threads device, this terminates spawned worker threads.
+  static void finalize();
+
+  /// \brief Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  //@}
+  /*------------------------------------------------------------------------*/
+  /*------------------------------------------------------------------------*/
+  //! \name Space-specific functions
+  //@{
+
+  /** \brief Initialize the device in the "ready to work" state.
+   *
+   *  The device is initialized in a "ready to work" or "awake" state.
+   *  This state reduces latency and thus improves performance when
+   *  dispatching work.  However, the "awake" state consumes resources
+   *  even when no work is being done.  You may call sleep() to put
+   *  the device in a "sleeping" state that does not consume as many
+   *  resources, but it will take time (latency) to awaken the device
+   *  again (via the wake() method) so that it is ready for work.
+   *
+   *  Teams of threads are distributed as evenly as possible across
+   *  the requested number of NUMA regions and cores per NUMA region.
+   *  A team will not be split across NUMA regions.
+   *
+   *  If the 'use_' arguments are not supplied, hwloc is queried and
+   *  all available cores are used.
+   */
+  static void initialize( unsigned threads_count = 0 ,
+                          unsigned use_numa_count = 0 ,
+                          unsigned use_cores_per_numa = 0 ,
+                          bool allow_asynchronous_threadpool = false );
+
+  static int is_initialized();
+
+  /** \brief  Return the maximum amount of concurrency.  */
+  static int concurrency();
+
+  static Threads & instance( int = 0 );
+
+  //----------------------------------------
+
+  static int thread_pool_size( int depth = 0 );
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  static int thread_pool_rank();
+#else
+  KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; }
+#endif
+
+  inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
+  KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
+
+  static const char* name();
+  //@}
+  //----------------------------------------
+};
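+
+// A minimal usage sketch (illustrative only; the thread count and the loop
+// bound 'N' are assumptions):
+//
+//   Kokkos::Threads::initialize( 8 /* threads_count */ );
+//
+//   Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Threads >( 0 , N )
+//                       , KOKKOS_LAMBDA( const int i ) { /* ... */ } );
+//
+//   Kokkos::Threads::fence();
+//   Kokkos::Threads::finalize();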
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct MemorySpaceAccess
+  < Kokkos::Threads::memory_space
+  , Kokkos::Threads::scratch_memory_space
+  >
+{
+  enum { assignable = false };
+  enum { accessible = true };
+  enum { deepcopy   = false };
+};
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::Threads::memory_space
+  , Kokkos::Threads::scratch_memory_space
+  >
+{
+  enum { value = true };
+  inline static void verify( void ) { }
+  inline static void verify( const void * ) { }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+#include <Kokkos_ExecPolicy.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Threads/Kokkos_ThreadsExec.hpp>
+#include <Threads/Kokkos_ThreadsTeam.hpp>
+#include <Threads/Kokkos_Threads_Parallel.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_THREADS ) */
+#endif /* #define KOKKOS_THREADS_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_Timer.hpp b/packages/kokkos/core/src/Kokkos_Timer.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f6ec4a079a6433a1bbf06daea5c457e3b70dd9b3
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Timer.hpp
@@ -0,0 +1,113 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TIMER_HPP
+#define KOKKOS_TIMER_HPP
+
+#include <cstddef>
+
+#ifdef _MSC_VER
+#undef KOKKOS_ENABLE_LIBRT
+#include <gettimeofday.c>
+#else
+#ifdef KOKKOS_ENABLE_LIBRT
+#include <ctime>
+#else
+#include <sys/time.h>
+#endif
+#endif
+
+namespace Kokkos {
+
+/** \brief  Time since construction */
+
+class Timer {
+private:
+  #ifdef KOKKOS_ENABLE_LIBRT
+	struct timespec m_old;
+  #else
+	struct timeval m_old ;
+  #endif
+  Timer( const Timer & );
+  Timer & operator = ( const Timer & );
+public:
+
+  inline
+  void reset() {
+    #ifdef KOKKOS_ENABLE_LIBRT
+	  clock_gettime(CLOCK_REALTIME, &m_old);
+    #else
+	  gettimeofday( & m_old , ((struct timezone *) NULL ) );
+    #endif
+  }
+
+  inline
+  ~Timer() {}
+
+  inline
+  Timer() { reset(); }
+
+  inline
+  double seconds() const
+  {
+    #ifdef KOKKOS_ENABLE_LIBRT
+      struct timespec m_new;
+      clock_gettime(CLOCK_REALTIME, &m_new);
+
+      return ( (double) ( m_new.tv_sec  - m_old.tv_sec ) ) +
+             ( (double) ( m_new.tv_nsec - m_old.tv_nsec ) * 1.0e-9 );
+    #else
+      struct timeval m_new ;
+
+      gettimeofday( & m_new , ((struct timezone *) NULL ) );
+
+      return ( (double) ( m_new.tv_sec  - m_old.tv_sec ) ) +
+             ( (double) ( m_new.tv_usec - m_old.tv_usec ) * 1.0e-6 );
+    #endif
+  }
+};
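+
+// A minimal usage sketch (illustrative only; 'do_some_work' is an assumed
+// user function):
+//
+//   Kokkos::Timer timer;              // clock starts at construction
+//   do_some_work();
+//   double elapsed = timer.seconds(); // seconds since construction / last reset
+//   timer.reset();                    // restart the clock for the next phase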
+
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_TIMER_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_UniqueToken.hpp b/packages/kokkos/core/src/Kokkos_UniqueToken.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d89542631d658af1ede69cab5956af9b7ac6e448
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_UniqueToken.hpp
@@ -0,0 +1,88 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_UNIQUE_TOKEN_HPP
+#define KOKKOS_UNIQUE_TOKEN_HPP
+
+#include <Kokkos_Macros.hpp>
+
+namespace Kokkos { namespace Experimental {
+
+enum class UniqueTokenScope : int
+{
+  Instance,
+  Global
+};
+
+/// \brief Class to generate unique ids based on the required amount of concurrency.
+///
+/// This object should behave like a ref-counted object, so that when the last
+/// instance is destroyed, resources are freed if needed.
+template <typename ExecutionSpace, UniqueTokenScope = UniqueTokenScope::Instance >
+class UniqueToken
+{
+public:
+  using execution_space = ExecutionSpace;
+  using size_type       = typename execution_space::size_type;
+
+  /// \brief Create an object sized for the concurrency of the given instance.
+  ///
+  /// This object should not be shared between instances.
+  UniqueToken( execution_space const& = execution_space() );
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  size_type size() const ;
+
+  /// \brief acquire value such that 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  size_type acquire() const ;
+
+  /// \brief release a value acquired by acquire()
+  KOKKOS_INLINE_FUNCTION
+  void release( size_type ) const ;
+};
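+
+// A minimal usage sketch (illustrative only; 'N' and the per-id scratch view
+// 'scratch' are assumptions):
+//
+//   Kokkos::Experimental::UniqueToken< Kokkos::DefaultExecutionSpace > token;
+//
+//   Kokkos::parallel_for( N , KOKKOS_LAMBDA( const int i ) {
+//     const int id = token.acquire();   // 0 <= id < token.size()
+//     // ... use 'id' to index thread-private storage, e.g. scratch( id ) ...
+//     token.release( id );              // make the id available to others
+//   });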
+
+}} // namespace Kokkos::Experimental
+
+#endif //KOKKOS_UNIQUE_TOKEN_HPP
diff --git a/packages/kokkos/core/src/Kokkos_Vectorization.hpp b/packages/kokkos/core/src/Kokkos_Vectorization.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f5b6cad4b7f4285d6cddac640e4e8f68eecf65f9
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Vectorization.hpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Vectorization.hpp
+/// \brief Declaration and definition of Kokkos::Vectorization interface.
+#ifndef KOKKOS_VECTORIZATION_HPP
+#define KOKKOS_VECTORIZATION_HPP
+
+#if defined( KOKKOS_ENABLE_CUDA )
+#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
+#endif
+
+#endif
+
diff --git a/packages/kokkos/core/src/Kokkos_View.hpp b/packages/kokkos/core/src/Kokkos_View.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d8e6321f34ac6011479f792ac365acf1be272bbb
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_View.hpp
@@ -0,0 +1,2658 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VIEW_HPP
+#define KOKKOS_VIEW_HPP
+
+#include <type_traits>
+#include <string>
+#include <algorithm>
+#include <initializer_list>
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <Kokkos_ExecPolicy.hpp>
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class DataType >
+struct ViewArrayAnalysis ;
+
+template< class DataType , class ArrayLayout
+        , typename ValueType =
+          typename ViewArrayAnalysis< DataType >::non_const_value_type
+        >
+struct ViewDataAnalysis ;
+
+template< class , class ... >
+class ViewMapping { public: enum { is_assignable = false }; };
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \class ViewTraits
+ *  \brief Traits class for accessing attributes of a View.
+ *
+ * This is an implementation detail of View.  It is only of interest
+ * to developers implementing a new specialization of View.
+ *
+ * Template argument options:
+ *   - View< DataType >
+ *   - View< DataType , Space >
+ *   - View< DataType , Space , MemoryTraits >
+ *   - View< DataType , ArrayLayout >
+ *   - View< DataType , ArrayLayout , Space >
+ *   - View< DataType , ArrayLayout , MemoryTraits >
+ *   - View< DataType , ArrayLayout , Space , MemoryTraits >
+ *   - View< DataType , MemoryTraits >
+ */
+
+template< class DataType , class ... Properties >
+struct ViewTraits ;
+
+template<>
+struct ViewTraits< void >
+{
+  typedef void  execution_space ;
+  typedef void  memory_space ;
+  typedef void  HostMirrorSpace ;
+  typedef void  array_layout ;
+  typedef void  memory_traits ;
+};
+
+template< class ... Prop >
+struct ViewTraits< void , void , Prop ... >
+{
+  // Ignore an extraneous 'void'
+  typedef typename ViewTraits<void,Prop...>::execution_space  execution_space ;
+  typedef typename ViewTraits<void,Prop...>::memory_space     memory_space ;
+  typedef typename ViewTraits<void,Prop...>::HostMirrorSpace  HostMirrorSpace ;
+  typedef typename ViewTraits<void,Prop...>::array_layout     array_layout ;
+  typedef typename ViewTraits<void,Prop...>::memory_traits    memory_traits ;
+};
+
+template< class ArrayLayout , class ... Prop >
+struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_array_layout<ArrayLayout>::value >::type , ArrayLayout , Prop ... >
+{
+  // Specify layout, keep subsequent space and memory traits arguments
+
+  typedef typename ViewTraits<void,Prop...>::execution_space  execution_space ;
+  typedef typename ViewTraits<void,Prop...>::memory_space     memory_space ;
+  typedef typename ViewTraits<void,Prop...>::HostMirrorSpace  HostMirrorSpace ;
+  typedef          ArrayLayout                                array_layout ;
+  typedef typename ViewTraits<void,Prop...>::memory_traits    memory_traits ;
+};
+
+template< class Space , class ... Prop >
+struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_space<Space>::value >::type , Space , Prop ... >
+{
+  // Specify Space, memory traits should be the only subsequent argument.
+
+  static_assert( std::is_same< typename ViewTraits<void,Prop...>::execution_space , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::memory_space    , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::HostMirrorSpace , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::array_layout    , void >::value
+               , "Only one View Execution or Memory Space template argument" );
+
+  typedef typename Space::execution_space                   execution_space ;
+  typedef typename Space::memory_space                      memory_space ;
+  typedef typename Kokkos::Impl::HostMirror< Space >::Space HostMirrorSpace ;
+  typedef typename execution_space::array_layout            array_layout ;
+  typedef typename ViewTraits<void,Prop...>::memory_traits  memory_traits ;
+};
+
+template< class MemoryTraits , class ... Prop >
+struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_memory_traits<MemoryTraits>::value >::type , MemoryTraits , Prop ... >
+{
+  // Specify memory trait, should not be any subsequent arguments
+
+  static_assert( std::is_same< typename ViewTraits<void,Prop...>::execution_space , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::memory_space    , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::array_layout    , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::memory_traits   , void >::value
+               , "MemoryTrait is the final optional template argument for a View" );
+
+  typedef void          execution_space ;
+  typedef void          memory_space ;
+  typedef void          HostMirrorSpace ;
+  typedef void          array_layout ;
+  typedef MemoryTraits  memory_traits ;
+};
+
+
+template< class DataType , class ... Properties >
+struct ViewTraits {
+private:
+
+  // Unpack the properties arguments
+  typedef ViewTraits< void , Properties ... >  prop ;
+
+  typedef typename
+    std::conditional< ! std::is_same< typename prop::execution_space , void >::value
+                    , typename prop::execution_space
+                    , Kokkos::DefaultExecutionSpace
+                    >::type
+      ExecutionSpace ;
+
+  typedef typename
+    std::conditional< ! std::is_same< typename prop::memory_space , void >::value
+                    , typename prop::memory_space
+                    , typename ExecutionSpace::memory_space
+                    >::type
+      MemorySpace ;
+
+  typedef typename
+    std::conditional< ! std::is_same< typename prop::array_layout , void >::value
+                    , typename prop::array_layout
+                    , typename ExecutionSpace::array_layout
+                    >::type
+      ArrayLayout ;
+
+  typedef typename
+    std::conditional
+      < ! std::is_same< typename prop::HostMirrorSpace , void >::value
+      , typename prop::HostMirrorSpace
+      , typename Kokkos::Impl::HostMirror< ExecutionSpace >::Space
+      >::type
+      HostMirrorSpace ;
+
+  typedef typename
+    std::conditional< ! std::is_same< typename prop::memory_traits , void >::value
+                    , typename prop::memory_traits
+                    , typename Kokkos::MemoryManaged
+                    >::type
+      MemoryTraits ;
+
+  // Analyze data type's properties,
+  // May be specialized based upon the layout and value type
+  typedef Kokkos::Impl::ViewDataAnalysis< DataType , ArrayLayout > data_analysis ;
+
+public:
+
+  //------------------------------------
+  // Data type traits:
+
+  typedef typename data_analysis::type            data_type ;
+  typedef typename data_analysis::const_type      const_data_type ;
+  typedef typename data_analysis::non_const_type  non_const_data_type ;
+
+  //------------------------------------
+  // Compatible array of trivial type traits:
+
+  typedef typename data_analysis::scalar_array_type            scalar_array_type ;
+  typedef typename data_analysis::const_scalar_array_type      const_scalar_array_type ;
+  typedef typename data_analysis::non_const_scalar_array_type  non_const_scalar_array_type ;
+
+  //------------------------------------
+  // Value type traits:
+
+  typedef typename data_analysis::value_type            value_type ;
+  typedef typename data_analysis::const_value_type      const_value_type ;
+  typedef typename data_analysis::non_const_value_type  non_const_value_type ;
+
+  //------------------------------------
+  // Mapping traits:
+
+  typedef ArrayLayout                         array_layout ;
+  typedef typename data_analysis::dimension   dimension ;
+  typedef typename data_analysis::specialize  specialize /* mapping specialization tag */ ;
+
+  enum { rank         = dimension::rank };
+  enum { rank_dynamic = dimension::rank_dynamic };
+
+  //------------------------------------
+  // Execution space, memory space, memory access traits, and host mirror space.
+
+  typedef ExecutionSpace                              execution_space ;
+  typedef MemorySpace                                 memory_space ;
+  typedef Kokkos::Device<ExecutionSpace,MemorySpace>  device_type ;
+  typedef MemoryTraits                                memory_traits ;
+  typedef HostMirrorSpace                             host_mirror_space ;
+
+  typedef typename MemorySpace::size_type  size_type ;
+
+  enum { is_hostspace      = std::is_same< MemorySpace , HostSpace >::value };
+  enum { is_managed        = MemoryTraits::Unmanaged    == 0 };
+  enum { is_random_access  = MemoryTraits::RandomAccess == 1 };
+
+  //------------------------------------
+};
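+
+// A small sketch of querying ViewTraits directly (illustrative only):
+//
+//   using traits = Kokkos::ViewTraits< double** , Kokkos::LayoutLeft >;
+//
+//   static_assert( traits::rank == 2 , "" );
+//   static_assert( std::is_same< traits::value_type , double >::value , "" );
+//   static_assert( std::is_same< traits::array_layout , Kokkos::LayoutLeft >::value , "" );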
+
+/** \class View
+ *  \brief View to an array of data.
+ *
+ * A View represents an array of one or more dimensions.
+ * For details, please refer to Kokkos' tutorial materials.
+ *
+ * \section Kokkos_View_TemplateParameters Template parameters
+ *
+ * This class has both required and optional template parameters.  The
+ * \c DataType parameter must always be provided, and must always be
+ * first.  The remaining \c Properties parameters are optional and fall
+ * into the categories described below (array layout, space, and memory
+ * traits).  When explaining the template parameters, we refer to these
+ * valid categories rather than to their positions, in whatever order
+ * they may occur.
+ *
+ * Valid ways in which template arguments may be specified:
+ *   - View< DataType >
+ *   - View< DataType , Layout >
+ *   - View< DataType , Layout , Space >
+ *   - View< DataType , Layout , Space , MemoryTraits >
+ *   - View< DataType , Space >
+ *   - View< DataType , Space , MemoryTraits >
+ *   - View< DataType , MemoryTraits >
+ *
+ * \tparam DataType (required) This indicates both the type of each
+ *   entry of the array, and the combination of compile-time and
+ *   run-time array dimension(s).  For example, <tt>double*</tt>
+ *   indicates a one-dimensional array of \c double with run-time
+ *   dimension, and <tt>int*[3]</tt> a two-dimensional array of \c int
+ *   with run-time first dimension and compile-time second dimension
+ *   (of 3).  In general, the run-time dimensions (if any) must go
+ *   first, followed by zero or more compile-time dimensions.  For
+ *   more examples, please refer to the tutorial materials.
+ *
+ * \tparam Space (optional) The execution or memory space.  If not
+ *   specified, this defaults to the default execution space's
+ *   preferred memory space.
+ *
+ * \tparam Layout (optional) The array's layout in memory.  For
+ *   example, LayoutLeft indicates a column-major (Fortran style)
+ *   layout, and LayoutRight a row-major (C style) layout.  If not
+ *   specified, this defaults to the preferred layout for the
+ *   <tt>Space</tt>.
+ *
+ * \tparam MemoryTraits (optional) Assertion of the user's intended
+ *   access behavior.  For example, RandomAccess indicates read-only
+ *   access with limited spatial locality, and Unmanaged lets users
+ *   wrap externally allocated memory in a View without automatic
+ *   deallocation.
+ *
+ * \section Kokkos_View_MT MemoryTraits discussion
+ *
+ * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on Space
+ *
+ * Some \c MemoryTraits options may have different interpretations for
+ * different \c Space types.  For example, with the Cuda device,
+ * \c RandomAccess tells Kokkos to fetch the data through the texture
+ * cache, whereas the non-GPU devices have no such hardware construct.
+ *
+ * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits
+ *
+ * Users should defer applying the optional \c MemoryTraits parameter
+ * until the point at which they actually plan to rely on it in a
+ * computational kernel.  This minimizes the number of template
+ * parameters exposed in their code, which reduces the cost of
+ * compilation.  Users may always assign a View without specified
+ * \c MemoryTraits to a compatible View with that specification.
+ * For example:
+ * \code
+ * // Pass in the simplest types of View possible.
+ * void
+ * doSomething (View<double*, Cuda> out,
+ *              View<const double*, Cuda> in)
+ * {
+ *   // Assign the "generic" View in to a RandomAccess View in_rr.
+ *   // Note that RandomAccess View objects must have const data.
+ *   View<const double*, Cuda, RandomAccess> in_rr = in;
+ *   // ... do something with in_rr and out ...
+ * }
+ * \endcode
+ */
+template< class DataType , class ... Properties >
+class View ;
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#include <impl/Kokkos_ViewMapping.hpp>
+#include <impl/Kokkos_ViewArray.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace {
+
+constexpr Kokkos::Impl::ALL_t
+  ALL = Kokkos::Impl::ALL_t();
+
+constexpr Kokkos::Impl::WithoutInitializing_t
+  WithoutInitializing = Kokkos::Impl::WithoutInitializing_t();
+
+constexpr Kokkos::Impl::AllowPadding_t
+  AllowPadding        = Kokkos::Impl::AllowPadding_t();
+
+}
+
+/** \brief  Create View allocation parameter bundle from argument list.
+ *
+ *  Valid argument list members are:
+ *    1) label as a "string" or std::string
+ *    2) memory space instance of the View::memory_space type
+ *    3) execution space instance compatible with the View::memory_space
+ *    4) Kokkos::WithoutInitializing to bypass initialization
+ *    5) Kokkos::AllowPadding to allow allocation to pad dimensions for memory alignment
+ */
+template< class ... Args >
+inline
+Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
+view_alloc( Args const & ... args )
+{
+  typedef
+    Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
+      return_type ;
+
+  static_assert( ! return_type::has_pointer
+               , "Cannot give pointer-to-memory for view allocation" );
+
+  return return_type( args... );
+}
+
+template< class ... Args >
+KOKKOS_INLINE_FUNCTION
+Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
+view_wrap( Args const & ... args )
+{
+  typedef
+    Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
+      return_type ;
+
+  static_assert( ! return_type::has_memory_space &&
+                 ! return_type::has_execution_space &&
+                 ! return_type::has_label &&
+                 return_type::has_pointer
+               , "Must only give pointer-to-memory for view wrapping" );
+
+  return return_type( args... );
+}
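+
+// A minimal usage sketch (illustrative only; 'raw_ptr' is an assumed pointer
+// to sufficiently large preallocated memory, and N0/N1 are assumed extents):
+//
+//   // Allocate a managed View, skipping initialization and allowing padding.
+//   Kokkos::View<double**> a(
+//     Kokkos::view_alloc( "A" , Kokkos::WithoutInitializing , Kokkos::AllowPadding )
+//     , N0 , N1 );
+//
+//   // Wrap user-provided memory in an unmanaged View.
+//   Kokkos::View<double**, Kokkos::MemoryUnmanaged> b( Kokkos::view_wrap( raw_ptr ) , N0 , N1 );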
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< class DataType , class ... Properties >
+class View ;
+
+template< class > struct is_view : public std::false_type {};
+
+template< class D, class ... P >
+struct is_view< View<D,P...> > : public std::true_type {};
+
+template< class D, class ... P >
+struct is_view< const View<D,P...> > : public std::true_type {};
+
+template< class DataType , class ... Properties >
+class View : public ViewTraits< DataType , Properties ... > {
+private:
+
+  template< class , class ... > friend class View ;
+  template< class , class ... > friend class Kokkos::Impl::ViewMapping ;
+
+public:
+
+  typedef ViewTraits< DataType , Properties ... > traits ;
+
+private:
+
+  typedef Kokkos::Impl::ViewMapping< traits , void > map_type ;
+  typedef Kokkos::Impl::SharedAllocationTracker      track_type ;
+
+  track_type  m_track ;
+  map_type    m_map ;
+
+public:
+
+  //----------------------------------------
+  /** \brief  Compatible view of array of scalar types */
+  typedef View< typename traits::scalar_array_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits >
+    array_type ;
+
+  /** \brief  Compatible view of const data type */
+  typedef View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits >
+    const_type ;
+
+  /** \brief  Compatible view of non-const data type */
+  typedef View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits >
+    non_const_type ;
+
+  /** \brief  Compatible HostMirror view */
+  typedef View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                typename traits::host_mirror_space >
+    HostMirror ;
+
+  //----------------------------------------
+  // Domain rank and extents
+
+  enum { Rank = map_type::Rank };
+
+ /** \brief rank() to be implemented
+  */
+  //KOKKOS_INLINE_FUNCTION
+  //static
+  //constexpr unsigned rank() { return map_type::Rank; }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , size_t >::type
+  extent( const iType & r ) const
+    { return m_map.extent(r); }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , int >::type
+  extent_int( const iType & r ) const
+    { return static_cast<int>(m_map.extent(r)); }
+
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename traits::array_layout layout() const
+    { return m_map.layout(); }
+
+  //----------------------------------------
+  /*  Deprecate all 'dimension' functions in favor of
+   *  ISO/C++ vocabulary 'extent'.
+   */
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , size_t >::type
+  dimension( const iType & r ) const { return extent( r ); }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_map.dimension_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_map.dimension_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_map.dimension_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_map.dimension_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_map.dimension_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_map.dimension_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_map.dimension_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_map.dimension_7(); }
+
+#endif
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t size() const { return m_map.dimension_0() *
+                                                                m_map.dimension_1() *
+                                                                m_map.dimension_2() *
+                                                                m_map.dimension_3() *
+                                                                m_map.dimension_4() *
+                                                                m_map.dimension_5() *
+                                                                m_map.dimension_6() *
+                                                                m_map.dimension_7(); }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , size_t >::type
+  stride(iType r) const {
+    return (r == 0 ? m_map.stride_0() :
+           (r == 1 ? m_map.stride_1() :
+           (r == 2 ? m_map.stride_2() :
+           (r == 3 ? m_map.stride_3() :
+           (r == 4 ? m_map.stride_4() :
+           (r == 5 ? m_map.stride_5() :
+           (r == 6 ? m_map.stride_6() :
+                     m_map.stride_7())))))));
+  }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION void stride( iType * const s ) const { m_map.stride(s); }
+
+  //----------------------------------------
+  // Range span is the span which contains all members.
+
+  typedef typename map_type::reference_type  reference_type ;
+  typedef typename map_type::pointer_type    pointer_type ;
+
+  enum { reference_type_is_lvalue_reference = std::is_lvalue_reference< reference_type >::value };
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); }
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+  // Deprecated, use 'span()' instead
+  KOKKOS_INLINE_FUNCTION constexpr size_t capacity() const { return m_map.span(); }
+#endif
+  KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const { return m_map.span_is_contiguous(); }
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { return m_map.data(); }
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+  // Deprecated, use 'span_is_contiguous()' instead
+  KOKKOS_INLINE_FUNCTION constexpr bool   is_contiguous() const { return m_map.span_is_contiguous(); }
+  // Deprecated, use 'data()' instead
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type ptr_on_device() const { return m_map.data(); }
+#endif
+
+  //----------------------------------------
+  // Allow specializations to query their specialized map
+
+  KOKKOS_INLINE_FUNCTION
+  const Kokkos::Impl::ViewMapping< traits , void > &
+  implementation_map() const { return m_map ; }
+
+  //----------------------------------------
+
+private:
+
+  enum {
+    is_layout_left = std::is_same< typename traits::array_layout
+                                  , Kokkos::LayoutLeft >::value ,
+
+    is_layout_right = std::is_same< typename traits::array_layout
+                                  , Kokkos::LayoutRight >::value ,
+
+    is_layout_stride = std::is_same< typename traits::array_layout
+                                   , Kokkos::LayoutStride >::value ,
+
+    is_default_map =
+      std::is_same< typename traits::specialize , void >::value &&
+      ( is_layout_left || is_layout_right || is_layout_stride )
+  };
+
+  template< class Space , bool = Kokkos::Impl::MemorySpaceAccess< Space , typename traits::memory_space >::accessible > struct verify_space
+    { KOKKOS_FORCEINLINE_FUNCTION static void check() {} };
+
+  template< class Space > struct verify_space<Space,false>
+    { KOKKOS_FORCEINLINE_FUNCTION static void check()
+        { Kokkos::abort("Kokkos::View ERROR: attempt to access inaccessible memory space"); };
+    };
+
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+
+#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( ARG ) \
+  View::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); \
+  Kokkos::Impl::view_verify_operator_bounds< typename traits::memory_space > ARG ;
+
+#else
+
+#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( ARG ) \
+  View::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
+
+#endif
+
+public:
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+  template< class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( Kokkos::Impl::are_integral<Args...>::value
+                            && ( 0 == Rank )
+                          ), reference_type >::type
+  operator()( Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,args...) )
+      return m_map.reference();
+    }
+
+  template< typename I0
+             , class ... Args>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,Args...>::value
+      && ( 1 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0,
+              Args ... args) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
+      return m_map.reference(i0);
+    }
+
+  template< typename I0
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,Args...>::value
+      && ( 1 == Rank )
+      && is_default_map
+      && ! is_layout_stride
+    ), reference_type >::type
+  operator()( const I0 & i0
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
+      return m_map.m_handle[ i0 ];
+    }
+
+  template< typename I0
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,Args...>::value
+      && ( 1 == Rank )
+      && is_default_map
+      && is_layout_stride
+    ), reference_type >::type
+  operator()( const I0 & i0
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
+      return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
+    }
+
+  //------------------------------
+    // Rank 1 operator[]
+
+    template< typename I0 >
+    KOKKOS_FORCEINLINE_FUNCTION
+    typename std::enable_if<
+      ( Kokkos::Impl::are_integral<I0>::value
+        && ( 1 == Rank )
+        && ! is_default_map
+      ), reference_type >::type
+    operator[]( const I0 & i0 ) const
+      {
+        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
+        return m_map.reference(i0);
+      }
+
+    template< typename I0 >
+    KOKKOS_FORCEINLINE_FUNCTION
+    typename std::enable_if<
+      ( Kokkos::Impl::are_integral<I0>::value
+        && ( 1 == Rank )
+        && is_default_map
+        && ! is_layout_stride
+      ), reference_type >::type
+    operator[]( const I0 & i0 ) const
+      {
+        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
+        return m_map.m_handle[ i0 ];
+      }
+
+    template< typename I0 >
+    KOKKOS_FORCEINLINE_FUNCTION
+    typename std::enable_if<
+      ( Kokkos::Impl::are_integral<I0>::value
+        && ( 1 == Rank )
+        && is_default_map
+        && is_layout_stride
+      ), reference_type >::type
+    operator[]( const I0 & i0 ) const
+      {
+        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
+        return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
+      }
+
+
+  template< typename I0 , typename I1
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
+      return m_map.reference(i0,i1);
+    }
+
+  template< typename I0 , typename I1
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_left && ( traits::rank_dynamic == 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
+      return m_map.m_handle[ i0 + m_map.m_offset.m_dim.N0 * i1 ];
+    }
+
+  template< typename I0 , typename I1
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_left && ( traits::rank_dynamic != 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
+      return m_map.m_handle[ i0 + m_map.m_offset.m_stride * i1 ];
+    }
+
+  template< typename I0 , typename I1
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_right && ( traits::rank_dynamic == 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
+      return m_map.m_handle[ i1 + m_map.m_offset.m_dim.N1 * i0 ];
+    }
+
+  template< typename I0 , typename I1
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_right && ( traits::rank_dynamic != 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
+      return m_map.m_handle[ i1 + m_map.m_offset.m_stride * i0 ];
+    }
+
+  template< typename I0 , typename I1
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_stride
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
+      return m_map.m_handle[ i0 * m_map.m_offset.m_stride.S0 +
+                             i1 * m_map.m_offset.m_stride.S1 ];
+    }
+
+  //------------------------------
+  // Rank 3
+
+  template< typename I0 , typename I1 , typename I2
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,Args...>::value
+      && ( 3 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,args...) )
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,Args...>::value
+      && ( 3 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,args...) )
+      return m_map.reference(i0,i1,i2);
+    }
+
+  //------------------------------
+  // Rank 4
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,Args...>::value
+      && ( 4 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,args...) )
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,Args...>::value
+      && ( 4 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,args...) )
+      return m_map.reference(i0,i1,i2,i3);
+    }
+
+  //------------------------------
+  // Rank 5
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,Args...>::value
+      && ( 5 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,args...) )
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,Args...>::value
+      && ( 5 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,args...) )
+      return m_map.reference(i0,i1,i2,i3,i4);
+    }
+
+  //------------------------------
+  // Rank 6
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,Args...>::value
+      && ( 6 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,args...) )
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,Args...>::value
+      && ( 6 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,args...) )
+      return m_map.reference(i0,i1,i2,i3,i4,i5);
+    }
+
+  //------------------------------
+  // Rank 7
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,Args...>::value
+      && ( 7 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,Args...>::value
+      && ( 7 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
+      return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
+    }
+
+  //------------------------------
+  // Rank 8
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 , typename I7
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7,Args...>::value
+      && ( 8 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 , typename I7
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7,Args...>::value
+      && ( 8 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
+            , Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
+      return m_map.reference(i0,i1,i2,i3,i4,i5,i6,i7);
+    }
+
+
+#else
+  //------------------------------
+  // Rank 0 operator()
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  operator()() const
+    {
+      return m_map.reference();
+    }
+  //------------------------------
+  // Rank 1 operator()
+
+
+  template< typename I0>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0>::value
+      && ( 1 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
+      return m_map.reference(i0);
+    }
+
+  template< typename I0>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0>::value
+      && ( 1 == Rank )
+      && is_default_map
+      && ! is_layout_stride
+    ), reference_type >::type
+  operator()( const I0 & i0 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
+      return m_map.m_handle[ i0 ];
+    }
+
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0>::value
+      && ( 1 == Rank )
+      && is_default_map
+      && is_layout_stride
+    ), reference_type >::type
+  operator()( const I0 & i0) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
+      return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
+    }
+  //------------------------------
+    // Rank 1 operator[]
+
+    template< typename I0 >
+    KOKKOS_FORCEINLINE_FUNCTION
+    typename std::enable_if<
+      ( Kokkos::Impl::are_integral<I0>::value
+        && ( 1 == Rank )
+        && ! is_default_map
+      ), reference_type >::type
+    operator[]( const I0 & i0 ) const
+      {
+        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
+        return m_map.reference(i0);
+      }
+
+    template< typename I0 >
+    KOKKOS_FORCEINLINE_FUNCTION
+    typename std::enable_if<
+      ( Kokkos::Impl::are_integral<I0>::value
+        && ( 1 == Rank )
+        && is_default_map
+        && ! is_layout_stride
+      ), reference_type >::type
+    operator[]( const I0 & i0 ) const
+      {
+        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
+        return m_map.m_handle[ i0 ];
+      }
+
+    template< typename I0 >
+    KOKKOS_FORCEINLINE_FUNCTION
+    typename std::enable_if<
+      ( Kokkos::Impl::are_integral<I0>::value
+        && ( 1 == Rank )
+        && is_default_map
+        && is_layout_stride
+      ), reference_type >::type
+    operator[]( const I0 & i0 ) const
+      {
+        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
+        return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
+      }
+
+
+    //------------------------------
+  // Rank 2
+
+  template< typename I0 , typename I1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1>::value
+      && ( 2 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1) )
+      return m_map.reference(i0,i1);
+    }
+
+  template< typename I0 , typename I1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_left && ( traits::rank_dynamic == 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1) )
+      return m_map.m_handle[ i0 + m_map.m_offset.m_dim.N0 * i1 ];
+    }
+
+  template< typename I0 , typename I1>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_left && ( traits::rank_dynamic != 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1) )
+      return m_map.m_handle[ i0 + m_map.m_offset.m_stride * i1 ];
+    }
+
+  template< typename I0 , typename I1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_right && ( traits::rank_dynamic == 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1) )
+      return m_map.m_handle[ i1 + m_map.m_offset.m_dim.N1 * i0 ];
+    }
+
+  template< typename I0 , typename I1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_right && ( traits::rank_dynamic != 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1) )
+      return m_map.m_handle[ i1 + m_map.m_offset.m_stride * i0 ];
+    }
+
+  template< typename I0 , typename I1>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_stride
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1) )
+      return m_map.m_handle[ i0 * m_map.m_offset.m_stride.S0 +
+                             i1 * m_map.m_offset.m_stride.S1 ];
+    }
+
+  //------------------------------
+  // Rank 3
+
+  template< typename I0 , typename I1 , typename I2 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2>::value
+      && ( 3 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2) )
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2>::value
+      && ( 3 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2) )
+      return m_map.reference(i0,i1,i2);
+    }
+
+  //------------------------------
+  // Rank 4
+
+  template< typename I0 , typename I1 , typename I2 , typename I3>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3>::value
+      && ( 4 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3) )
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3>::value
+      && ( 4 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3) )
+      return m_map.reference(i0,i1,i2,i3);
+    }
+
+  //------------------------------
+  // Rank 5
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4>::value
+      && ( 5 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4) )
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4>::value
+      && ( 5 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4) )
+      return m_map.reference(i0,i1,i2,i3,i4);
+    }
+
+  //------------------------------
+  // Rank 6
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5>::value
+      && ( 6 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5) )
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5>::value
+      && ( 6 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5) )
+      return m_map.reference(i0,i1,i2,i3,i4,i5);
+    }
+
+  //------------------------------
+  // Rank 7
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6>::value
+      && ( 7 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6) )
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6>::value
+      && ( 7 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6) )
+      return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
+    }
+
+  //------------------------------
+  // Rank 8
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 , typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7>::value
+      && ( 8 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,i7) )
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 , typename I7>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7>::value
+      && ( 8 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7 ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,i7) )
+      return m_map.reference(i0,i1,i2,i3,i4,i5,i6,i7);
+    }
+
+#endif
+
+  template< class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( Kokkos::Impl::are_integral<Args...>::value
+                            && ( 0 == Rank )
+                          ), reference_type >::type
+  access( Args ... args ) const
+    {
+      KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,args...) )
+      return m_map.reference();
+    }
+
+   template< typename I0
+               , class ... Args>
+    KOKKOS_FORCEINLINE_FUNCTION
+    typename std::enable_if<
+      ( Kokkos::Impl::are_integral<I0,Args...>::value
+        && ( 1 == Rank )
+        && ! is_default_map
+      ), reference_type >::type
+    access( const I0 & i0,
+                Args ... args) const
+      {
+        KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
+        return m_map.reference(i0);
+      }
+
+   template< typename I0
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,Args...>::value
+       && ( 1 == Rank )
+       && is_default_map
+       && ! is_layout_stride
+     ), reference_type >::type
+   access( const I0 & i0
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
+       return m_map.m_handle[ i0 ];
+     }
+
+   template< typename I0
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,Args...>::value
+       && ( 1 == Rank )
+       && is_default_map
+       && is_layout_stride
+     ), reference_type >::type
+   access( const I0 & i0
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
+       return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
+     }
+
+   template< typename I0 , typename I1
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+       && ( 2 == Rank )
+       && ! is_default_map
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
+       return m_map.reference(i0,i1);
+     }
+
+   template< typename I0 , typename I1
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+       && ( 2 == Rank )
+       && is_default_map
+       && is_layout_left && ( traits::rank_dynamic == 0 )
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
+       return m_map.m_handle[ i0 + m_map.m_offset.m_dim.N0 * i1 ];
+     }
+
+   template< typename I0 , typename I1
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+       && ( 2 == Rank )
+       && is_default_map
+       && is_layout_left && ( traits::rank_dynamic != 0 )
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
+       return m_map.m_handle[ i0 + m_map.m_offset.m_stride * i1 ];
+     }
+
+   template< typename I0 , typename I1
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+       && ( 2 == Rank )
+       && is_default_map
+       && is_layout_right && ( traits::rank_dynamic == 0 )
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
+       return m_map.m_handle[ i1 + m_map.m_offset.m_dim.N1 * i0 ];
+     }
+
+   template< typename I0 , typename I1
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+       && ( 2 == Rank )
+       && is_default_map
+       && is_layout_right && ( traits::rank_dynamic != 0 )
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
+       return m_map.m_handle[ i1 + m_map.m_offset.m_stride * i0 ];
+     }
+
+   template< typename I0 , typename I1
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+       && ( 2 == Rank )
+       && is_default_map
+       && is_layout_stride
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
+       return m_map.m_handle[ i0 * m_map.m_offset.m_stride.S0 +
+                              i1 * m_map.m_offset.m_stride.S1 ];
+     }
+
+   //------------------------------
+   // Rank 3
+
+   template< typename I0 , typename I1 , typename I2
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,I2,Args...>::value
+       && ( 3 == Rank )
+       && is_default_map
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1 , const I2 & i2
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,args...) )
+       return m_map.m_handle[ m_map.m_offset(i0,i1,i2) ];
+     }
+
+   template< typename I0 , typename I1 , typename I2
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,I2,Args...>::value
+       && ( 3 == Rank )
+       && ! is_default_map
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1 , const I2 & i2
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,args...) )
+       return m_map.reference(i0,i1,i2);
+     }
+
+   //------------------------------
+   // Rank 4
+
+   template< typename I0 , typename I1 , typename I2 , typename I3
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,I2,I3,Args...>::value
+       && ( 4 == Rank )
+       && is_default_map
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,args...) )
+       return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3) ];
+     }
+
+   template< typename I0 , typename I1 , typename I2 , typename I3
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,I2,I3,Args...>::value
+       && ( 4 == Rank )
+       && ! is_default_map
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,args...) )
+       return m_map.reference(i0,i1,i2,i3);
+     }
+
+   //------------------------------
+   // Rank 5
+
+   template< typename I0 , typename I1 , typename I2 , typename I3
+           , typename I4
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,Args...>::value
+       && ( 5 == Rank )
+       && is_default_map
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+             , const I4 & i4
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,args...) )
+       return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4) ];
+     }
+
+   template< typename I0 , typename I1 , typename I2 , typename I3
+           , typename I4
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,Args...>::value
+       && ( 5 == Rank )
+       && ! is_default_map
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+             , const I4 & i4
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,args...) )
+       return m_map.reference(i0,i1,i2,i3,i4);
+     }
+
+   //------------------------------
+   // Rank 6
+
+   template< typename I0 , typename I1 , typename I2 , typename I3
+           , typename I4 , typename I5
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,Args...>::value
+       && ( 6 == Rank )
+       && is_default_map
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+             , const I4 & i4 , const I5 & i5
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,args...) )
+       return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5) ];
+     }
+
+   template< typename I0 , typename I1 , typename I2 , typename I3
+           , typename I4 , typename I5
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,Args...>::value
+       && ( 6 == Rank )
+       && ! is_default_map
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+             , const I4 & i4 , const I5 & i5
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,args...) )
+       return m_map.reference(i0,i1,i2,i3,i4,i5);
+     }
+
+   //------------------------------
+   // Rank 7
+
+   template< typename I0 , typename I1 , typename I2 , typename I3
+           , typename I4 , typename I5 , typename I6
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,Args...>::value
+       && ( 7 == Rank )
+       && is_default_map
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+             , const I4 & i4 , const I5 & i5 , const I6 & i6
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
+       return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6) ];
+     }
+
+   template< typename I0 , typename I1 , typename I2 , typename I3
+           , typename I4 , typename I5 , typename I6
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,Args...>::value
+       && ( 7 == Rank )
+       && ! is_default_map
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+             , const I4 & i4 , const I5 & i5 , const I6 & i6
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
+       return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
+     }
+
+   //------------------------------
+   // Rank 8
+
+   template< typename I0 , typename I1 , typename I2 , typename I3
+           , typename I4 , typename I5 , typename I6 , typename I7
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7,Args...>::value
+       && ( 8 == Rank )
+       && is_default_map
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+             , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
+       return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ];
+     }
+
+   template< typename I0 , typename I1 , typename I2 , typename I3
+           , typename I4 , typename I5 , typename I6 , typename I7
+           , class ... Args >
+   KOKKOS_FORCEINLINE_FUNCTION
+   typename std::enable_if<
+     ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7,Args...>::value
+       && ( 8 == Rank )
+       && ! is_default_map
+     ), reference_type >::type
+   access( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+             , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
+             , Args ... args ) const
+     {
+       KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
+       return m_map.reference(i0,i1,i2,i3,i4,i5,i6,i7);
+     }
+
+
+#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY
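+
+  // Editorial sketch (not part of the original header): minimal usage of the
+  // element-access members defined above, assuming a host-accessible view; the
+  // name `a` and the indices are hypothetical. Depending on the configuration
+  // branch selected above, operator() takes exactly Rank indices, while access()
+  // also tolerates trailing indices beyond the rank (expected to be zero), which
+  // is convenient in rank-generic code:
+  //
+  //   Kokkos::View<double**> a( "A" , 10 , 10 );
+  //   a(3,4)            = 1.0;   // rank-exact access
+  //   a.access(3,4,0,0) = 2.0;   // same element; extra trailing zeros allowed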
+
+  //----------------------------------------
+  // Standard destructor, constructors, and assignment operators
+
+  KOKKOS_INLINE_FUNCTION
+  ~View() {}
+
+  KOKKOS_INLINE_FUNCTION
+  View() : m_track(), m_map() {}
+
+  KOKKOS_INLINE_FUNCTION
+  View( const View & rhs ) : m_track( rhs.m_track, traits::is_managed ), m_map( rhs.m_map ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  View( View && rhs ) : m_track( std::move(rhs.m_track) ), m_map( std::move(rhs.m_map) ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( const View & rhs ) { m_track = rhs.m_track ; m_map = rhs.m_map ; return *this ; }
+
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( View && rhs ) { m_track = std::move(rhs.m_track) ; m_map = std::move(rhs.m_map) ; return *this ; }
+
+  //----------------------------------------
+  // Compatible view copy constructor and assignment;
+  // an unmanaged view may be assigned from a managed one.
+
+  template< class RT , class ... RP >
+  KOKKOS_INLINE_FUNCTION
+  View( const View<RT,RP...> & rhs )
+    : m_track( rhs.m_track , traits::is_managed )
+    , m_map()
+    {
+      typedef typename View<RT,RP...>::traits  SrcTraits ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void >  Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible View copy construction" );
+      Mapping::assign( m_map , rhs.m_map , rhs.m_track );
+    }
+
+  template< class RT , class ... RP >
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( const View<RT,RP...> & rhs )
+    {
+      typedef typename View<RT,RP...>::traits  SrcTraits ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void >  Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible View copy assignment" );
+      Mapping::assign( m_map , rhs.m_map , rhs.m_track );
+      m_track.assign( rhs.m_track , traits::is_managed );
+      return *this ;
+    }
+
+  //----------------------------------------
+  // Compatible subview constructor;
+  // an unmanaged view may be assigned from a managed one.
+
+  template< class RT , class ... RP , class Arg0 , class ... Args >
+  KOKKOS_INLINE_FUNCTION
+  View( const View< RT , RP... > & src_view
+      , const Arg0 & arg0 , Args ... args )
+    : m_track( src_view.m_track , traits::is_managed )
+    , m_map()
+    {
+      typedef View< RT , RP... > SrcType ;
+
+      typedef Kokkos::Impl::ViewMapping
+        < void /* deduce destination view type from source view traits */
+        , typename SrcType::traits
+        , Arg0 , Args... > Mapping ;
+
+      typedef typename Mapping::type DstType ;
+
+      static_assert( Kokkos::Impl::ViewMapping< traits , typename DstType::traits , void >::is_assignable
+        , "Subview construction requires compatible view and subview arguments" );
+
+      Mapping::assign( m_map, src_view.m_map, arg0 , args... );
+    }
+
+  //----------------------------------------
+  // Allocation tracking properties
+
+  KOKKOS_INLINE_FUNCTION
+  int use_count() const
+    { return m_track.use_count(); }
+
+  inline
+  const std::string label() const
+    { return m_track.template get_label< typename traits::memory_space >(); }
+
+  //----------------------------------------
+  // Allocation according to allocation properties and array layout
+
+  template< class ... P >
+  explicit inline
+  View( const Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< ! Impl::ViewCtorProp< P... >::has_pointer
+                               , typename traits::array_layout
+                               >::type const & arg_layout
+      )
+    : m_track()
+    , m_map()
+    {
+      // Append default label, memory space, and execution space
+      // to the allocation properties if they were not provided as input.
+      typedef Impl::ViewCtorProp< P ... > alloc_prop_input ;
+
+      // Use 'std::integral_constant<unsigned,I>' placeholders for properties
+      // that are already present, to avoid a duplicate-class error.
+      typedef Impl::ViewCtorProp
+        < P ...
+        , typename std::conditional
+            < alloc_prop_input::has_label
+            , std::integral_constant<unsigned,0>
+            , typename std::string
+            >::type
+        , typename std::conditional
+            < alloc_prop_input::has_memory_space
+            , std::integral_constant<unsigned,1>
+            , typename traits::device_type::memory_space
+            >::type
+        , typename std::conditional
+            < alloc_prop_input::has_execution_space
+            , std::integral_constant<unsigned,2>
+            , typename traits::device_type::execution_space
+            >::type
+        > alloc_prop ;
+
+      static_assert( traits::is_managed
+                   , "View allocation constructor requires managed memory" );
+
+      if ( alloc_prop::initialize &&
+           ! alloc_prop::execution_space::is_initialized() ) {
+        // If initializing view data then
+        // the execution space must be initialized.
+        Kokkos::Impl::throw_runtime_exception("Constructing View and initializing data with uninitialized execution space");
+      }
+
+      // Copy the input allocation properties, adding defaults for any that were not supplied
+      alloc_prop prop( arg_prop );
+
+//------------------------------------------------------------
+#if defined( KOKKOS_ENABLE_CUDA )
+      // If allocating in CudaUVMSpace, fence before and after
+      // the allocation to protect against possible concurrent access
+      // on the CPU and the GPU.
+      // Fence using the trait's execution space (which will be Kokkos::Cuda)
+      // to avoid incomplete type errors from using Kokkos::Cuda directly.
+      if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
+        traits::device_type::memory_space::execution_space::fence();
+      }
+#endif
+//------------------------------------------------------------
+
+      Kokkos::Impl::SharedAllocationRecord<> *
+        record = m_map.allocate_shared( prop , arg_layout );
+
+//------------------------------------------------------------
+#if defined( KOKKOS_ENABLE_CUDA )
+      if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
+        traits::device_type::memory_space::execution_space::fence();
+      }
+#endif
+//------------------------------------------------------------
+
+      // Setup and initialization complete, start tracking
+      m_track.assign_allocated_record_to_uninitialized( record );
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void assign_data( pointer_type arg_data )
+    {
+      m_track.clear();
+      m_map.assign_data( arg_data );
+    }
+
+  // Wrap memory according to properties and array layout
+  template< class ... P >
+  explicit KOKKOS_INLINE_FUNCTION
+  View( const Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer
+                               , typename traits::array_layout
+                               >::type const & arg_layout
+      )
+    : m_track() // No memory tracking
+    , m_map( arg_prop , arg_layout )
+    {
+      static_assert(
+        std::is_same< pointer_type
+                    , typename Impl::ViewCtorProp< P... >::pointer_type
+                    >::value ,
+        "Constructing View to wrap user memory must supply matching pointer type" );
+    }
+
+  // Simple dimension-only layout
+  template< class ... P >
+  explicit inline
+  View( const Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< ! Impl::ViewCtorProp< P... >::has_pointer
+                               , size_t
+                               >::type const arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : View( arg_prop
+          , typename traits::array_layout
+              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  template< class ... P >
+  explicit KOKKOS_INLINE_FUNCTION
+  View( const Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer
+                               , size_t
+                               >::type const arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : View( arg_prop
+          , typename traits::array_layout
+              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  // Allocate with label and layout
+  template< typename Label >
+  explicit inline
+  View( const Label & arg_label
+      , typename std::enable_if<
+          Kokkos::Impl::is_view_label<Label>::value ,
+          typename traits::array_layout >::type const & arg_layout
+      )
+    : View( Impl::ViewCtorProp< std::string >( arg_label ) , arg_layout )
+    {}
+
+  // Allocate with label and dimensions; must be disambiguated from the subview constructor.
+  template< typename Label >
+  explicit inline
+  View( const Label & arg_label
+      , typename std::enable_if<
+          Kokkos::Impl::is_view_label<Label>::value ,
+        const size_t >::type arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : View( Impl::ViewCtorProp< std::string >( arg_label )
+          , typename traits::array_layout
+              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  // For backward compatibility
+  explicit inline
+  View( const ViewAllocateWithoutInitializing & arg_prop
+      , const typename traits::array_layout & arg_layout
+      )
+    : View( Impl::ViewCtorProp< std::string , Kokkos::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::WithoutInitializing )
+          , arg_layout
+          )
+    {}
+
+  explicit inline
+  View( const ViewAllocateWithoutInitializing & arg_prop
+      , const size_t arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : View( Impl::ViewCtorProp< std::string , Kokkos::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::WithoutInitializing )
+          , typename traits::array_layout
+              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  //----------------------------------------
+  // Memory span required to wrap these dimensions.
+  static constexpr size_t required_allocation_size(
+                                       const size_t arg_N0 = 0
+                                     , const size_t arg_N1 = 0
+                                     , const size_t arg_N2 = 0
+                                     , const size_t arg_N3 = 0
+                                     , const size_t arg_N4 = 0
+                                     , const size_t arg_N5 = 0
+                                     , const size_t arg_N6 = 0
+                                     , const size_t arg_N7 = 0
+                                     )
+    {
+      return map_type::memory_span(
+        typename traits::array_layout
+          ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+          , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) );
+    }
+
+  explicit KOKKOS_INLINE_FUNCTION
+  View( pointer_type arg_ptr
+      , const size_t arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : View( Impl::ViewCtorProp<pointer_type>(arg_ptr)
+          , typename traits::array_layout
+             ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+             , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  explicit KOKKOS_INLINE_FUNCTION
+  View( pointer_type arg_ptr
+      , const typename traits::array_layout & arg_layout
+      )
+    : View( Impl::ViewCtorProp<pointer_type>(arg_ptr) , arg_layout )
+    {}
+
+  //----------------------------------------
+  // Shared scratch memory constructor
+
+  static inline
+  size_t
+  shmem_size( const size_t arg_N0 = ~size_t(0) ,
+              const size_t arg_N1 = ~size_t(0) ,
+              const size_t arg_N2 = ~size_t(0) ,
+              const size_t arg_N3 = ~size_t(0) ,
+              const size_t arg_N4 = ~size_t(0) ,
+              const size_t arg_N5 = ~size_t(0) ,
+              const size_t arg_N6 = ~size_t(0) ,
+              const size_t arg_N7 = ~size_t(0) )
+  {
+    if ( is_layout_stride ) {
+      Kokkos::abort( "Kokkos::View::shmem_size(extents...) doesn't work with LayoutStride. Pass a LayoutStride object instead" );
+    }
+    const size_t num_passed_args =
+      ( arg_N0 != ~size_t(0) ) + ( arg_N1 != ~size_t(0) ) + ( arg_N2 != ~size_t(0) ) +
+      ( arg_N3 != ~size_t(0) ) + ( arg_N4 != ~size_t(0) ) + ( arg_N5 != ~size_t(0) ) +
+      ( arg_N6 != ~size_t(0) ) + ( arg_N7 != ~size_t(0) );
+
+    if ( std::is_same<typename traits::specialize,void>::value && num_passed_args != traits::rank_dynamic ) {
+      Kokkos::abort( "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n" );
+    }
+
+    return View::shmem_size(
+           typename traits::array_layout
+            ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+            , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) );
+  }
+
+  static inline
+  size_t shmem_size( typename traits::array_layout const& arg_layout )
+  {
+    return map_type::memory_span( arg_layout );
+  }
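+
+  // Editorial sketch (not part of the original header): intended use of
+  // shmem_size() together with the scratch-memory constructors below; the
+  // alias `shared_2d` and the extents are hypothetical. shmem_size() reports
+  // the bytes a View of the given extents needs, which is typically requested
+  // as per-team scratch from the execution policy:
+  //
+  //   typedef Kokkos::View< double** ,
+  //                         Kokkos::DefaultExecutionSpace::scratch_memory_space ,
+  //                         Kokkos::MemoryUnmanaged > shared_2d ;
+  //   const size_t bytes = shared_2d::shmem_size( 16 , 16 );
+  //   // ...request `bytes` of per-team scratch, then inside the kernel:
+  //   //   shared_2d tile( team.team_shmem() , 16 , 16 );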
+
+  explicit KOKKOS_INLINE_FUNCTION
+  View( const typename traits::execution_space::scratch_memory_space & arg_space
+      , const typename traits::array_layout & arg_layout )
+    : View( Impl::ViewCtorProp<pointer_type>(
+              reinterpret_cast<pointer_type>(
+                arg_space.get_shmem( map_type::memory_span( arg_layout ) ) ) )
+         , arg_layout )
+    {}
+
+  explicit KOKKOS_INLINE_FUNCTION
+  View( const typename traits::execution_space::scratch_memory_space & arg_space
+      , const size_t arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0 )
+    : View( Impl::ViewCtorProp<pointer_type>(
+              reinterpret_cast<pointer_type>(
+                arg_space.get_shmem(
+                  map_type::memory_span(
+                    typename traits::array_layout
+                     ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+                     , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ) ) ) )
+          , typename traits::array_layout
+             ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+             , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+       )
+    {}
+};
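+
+// Editorial sketch (not part of the original header): minimal use of the
+// constructors defined above, assuming Kokkos has been initialized; the names
+// `a`, `b`, `u`, and `raw` are hypothetical.
+//
+//   Kokkos::View<double**> a( "A" , 100 , 50 );                       // allocate with label
+//   Kokkos::View<double**> b( Kokkos::ViewAllocateWithoutInitializing("B") , 100 , 50 );
+//   double * raw = a.data();
+//   Kokkos::View<double**, Kokkos::MemoryUnmanaged> u( raw , 100 , 50 ); // wrap user memory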
+
+
+/** \brief Temporary free function rank()
+ *         until rank() is implemented
+ *         in the View
+ */
+template < typename D , class ... P >
+KOKKOS_INLINE_FUNCTION
+constexpr unsigned rank( const View<D , P...> & V ) { return V.Rank; } // Temporary until added to View
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template< class V , class ... Args >
+using Subview =
+  typename Kokkos::Impl::ViewMapping
+    < void /* deduce subview type from source view traits */
+    , typename V::traits
+    , Args ...
+    >::type ;
+
+template< class D, class ... P , class ... Args >
+KOKKOS_INLINE_FUNCTION
+typename Kokkos::Impl::ViewMapping
+  < void /* deduce subview type from source view traits */
+  , ViewTraits< D , P... >
+  , Args ...
+  >::type
+subview( const View< D, P... > & src , Args ... args )
+{
+  static_assert( View< D , P... >::Rank == sizeof...(Args) ,
+    "subview requires one argument for each source View rank" );
+
+  return typename
+    Kokkos::Impl::ViewMapping
+      < void /* deduce subview type from source view traits */
+      , ViewTraits< D , P ... >
+      , Args ... >::type( src , args ... );
+}
+
+template< class MemoryTraits , class D, class ... P , class ... Args >
+KOKKOS_INLINE_FUNCTION
+typename Kokkos::Impl::ViewMapping
+  < void /* deduce subview type from source view traits */
+  , ViewTraits< D , P... >
+  , Args ...
+  >::template apply< MemoryTraits >::type
+subview( const View< D, P... > & src , Args ... args )
+{
+  static_assert( View< D , P... >::Rank == sizeof...(Args) ,
+    "subview requires one argument for each source View rank" );
+
+  return typename
+    Kokkos::Impl::ViewMapping
+      < void /* deduce subview type from source view traits */
+      , ViewTraits< D , P ... >
+      , Args ... >
+      ::template apply< MemoryTraits >
+      ::type( src , args ... );
+}
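+
+// Editorial sketch (not part of the original header): minimal subview usage;
+// the names `a`, `row`, and `tile` are hypothetical. One slicing argument is
+// required per source rank: an integral index removes that dimension, while a
+// range (std::pair) or Kokkos::ALL() keeps it:
+//
+//   Kokkos::View<double**> a( "A" , 100 , 50 );
+//   auto row  = Kokkos::subview( a , 7 , Kokkos::ALL() );                    // rank-1, extent 50
+//   auto tile = Kokkos::subview( a , std::make_pair(0,10) , Kokkos::ALL() ); // rank-2, 10 x 50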
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< class LT , class ... LP , class RT , class ... RP >
+KOKKOS_INLINE_FUNCTION
+bool operator == ( const View<LT,LP...> & lhs ,
+                   const View<RT,RP...> & rhs )
+{
+  // Same data, layout, dimensions
+  typedef ViewTraits<LT,LP...>  lhs_traits ;
+  typedef ViewTraits<RT,RP...>  rhs_traits ;
+
+  return
+    std::is_same< typename lhs_traits::const_value_type ,
+                  typename rhs_traits::const_value_type >::value &&
+    std::is_same< typename lhs_traits::array_layout ,
+                  typename rhs_traits::array_layout >::value &&
+    std::is_same< typename lhs_traits::memory_space ,
+                  typename rhs_traits::memory_space >::value &&
+    unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) &&
+    lhs.data()        == rhs.data() &&
+    lhs.span()        == rhs.span() &&
+    lhs.extent(0) == rhs.extent(0) &&
+    lhs.extent(1) == rhs.extent(1) &&
+    lhs.extent(2) == rhs.extent(2) &&
+    lhs.extent(3) == rhs.extent(3) &&
+    lhs.extent(4) == rhs.extent(4) &&
+    lhs.extent(5) == rhs.extent(5) &&
+    lhs.extent(6) == rhs.extent(6) &&
+    lhs.extent(7) == rhs.extent(7);
+}
+
+template< class LT , class ... LP , class RT , class ... RP >
+KOKKOS_INLINE_FUNCTION
+bool operator != ( const View<LT,LP...> & lhs ,
+                   const View<RT,RP...> & rhs )
+{
+  return ! ( operator==(lhs,rhs) );
+}
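+
+// Editorial note (not part of the original header): the comparison above tests
+// view identity (element type, layout, memory space, rank, data pointer, span,
+// and extents), not element-wise contents. For example (names hypothetical):
+//
+//   Kokkos::View<int*> a( "A" , 10 ) , b( "B" , 10 );
+//   a == b;        // false: distinct allocations
+//   auto c = a;
+//   a == c;        // true: same data and extents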
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+inline
+void shared_allocation_tracking_disable()
+{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_disable(); }
+
+inline
+void shared_allocation_tracking_enable()
+{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_enable(); }
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Deduce Mirror Types
+template<class Space, class T, class ... P>
+struct MirrorViewType {
+  // The incoming view_type
+  typedef typename Kokkos::View<T,P...> src_view_type;
+  // The memory space for the mirror view
+  typedef typename Space::memory_space memory_space;
+  // Check whether it is the same memory space
+  enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
+  // The array_layout
+  typedef typename src_view_type::array_layout array_layout;
+  // The data type (non-const, since otherwise we could not even deep_copy into it).
+  typedef typename src_view_type::non_const_data_type data_type;
+  // The destination view type if it is not the same memory space
+  typedef Kokkos::View<data_type,array_layout,Space> dest_view_type;
+  // If it is the same memory_space, return the existing view_type.
+  // This will also keep the unmanaged trait if necessary.
+  typedef typename std::conditional<is_same_memspace,src_view_type,dest_view_type>::type view_type;
+};
+
+template<class Space, class T, class ... P>
+struct MirrorType {
+  // The incoming view_type
+  typedef typename Kokkos::View<T,P...> src_view_type;
+  // The memory space for the mirror view
+  typedef typename Space::memory_space memory_space;
+  // Check whether it is the same memory space
+  enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
+  // The array_layout
+  typedef typename src_view_type::array_layout array_layout;
+  // The data type (non-const, since otherwise we could not even deep_copy into it).
+  typedef typename src_view_type::non_const_data_type data_type;
+  // The destination view type if it is not the same memory space
+  typedef Kokkos::View<data_type,array_layout,Space> view_type;
+};
+
+}
+
+template< class T , class ... P >
+inline
+typename Kokkos::View<T,P...>::HostMirror
+create_mirror( const Kokkos::View<T,P...> & src
+             , typename std::enable_if<
+                 ! std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
+                               , Kokkos::LayoutStride >::value
+               >::type * = 0
+             )
+{
+  typedef View<T,P...>                   src_type ;
+  typedef typename src_type::HostMirror  dst_type ;
+
+  return dst_type( std::string( src.label() ).append("_mirror")
+                 , src.extent(0)
+                 , src.extent(1)
+                 , src.extent(2)
+                 , src.extent(3)
+                 , src.extent(4)
+                 , src.extent(5)
+                 , src.extent(6)
+                 , src.extent(7) );
+}
+
+template< class T , class ... P >
+inline
+typename Kokkos::View<T,P...>::HostMirror
+create_mirror( const Kokkos::View<T,P...> & src
+             , typename std::enable_if<
+                 std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
+                             , Kokkos::LayoutStride >::value
+               >::type * = 0
+             )
+{
+  typedef View<T,P...>                   src_type ;
+  typedef typename src_type::HostMirror  dst_type ;
+
+  Kokkos::LayoutStride layout ;
+
+  layout.dimension[0] = src.extent(0);
+  layout.dimension[1] = src.extent(1);
+  layout.dimension[2] = src.extent(2);
+  layout.dimension[3] = src.extent(3);
+  layout.dimension[4] = src.extent(4);
+  layout.dimension[5] = src.extent(5);
+  layout.dimension[6] = src.extent(6);
+  layout.dimension[7] = src.extent(7);
+
+  layout.stride[0] = src.stride_0();
+  layout.stride[1] = src.stride_1();
+  layout.stride[2] = src.stride_2();
+  layout.stride[3] = src.stride_3();
+  layout.stride[4] = src.stride_4();
+  layout.stride[5] = src.stride_5();
+  layout.stride[6] = src.stride_6();
+  layout.stride[7] = src.stride_7();
+
+  return dst_type( std::string( src.label() ).append("_mirror") , layout );
+}
+
+
+// Create a mirror in a new space (specialization for different space)
+template<class Space, class T, class ... P>
+typename Impl::MirrorType<Space,T,P ...>::view_type create_mirror(const Space& , const Kokkos::View<T,P...> & src) {
+  return typename Impl::MirrorType<Space,T,P ...>::view_type(src.label(),src.layout());
+}
+
+template< class T , class ... P >
+inline
+typename Kokkos::View<T,P...>::HostMirror
+create_mirror_view( const Kokkos::View<T,P...> & src
+                  , typename std::enable_if<(
+                      std::is_same< typename Kokkos::View<T,P...>::memory_space
+                                  , typename Kokkos::View<T,P...>::HostMirror::memory_space
+                                  >::value
+                      &&
+                      std::is_same< typename Kokkos::View<T,P...>::data_type
+                                  , typename Kokkos::View<T,P...>::HostMirror::data_type
+                                  >::value
+                    )>::type * = 0
+                  )
+{
+  return src ;
+}
+
+template< class T , class ... P >
+inline
+typename Kokkos::View<T,P...>::HostMirror
+create_mirror_view( const Kokkos::View<T,P...> & src
+                  , typename std::enable_if< ! (
+                      std::is_same< typename Kokkos::View<T,P...>::memory_space
+                                  , typename Kokkos::View<T,P...>::HostMirror::memory_space
+                                  >::value
+                      &&
+                      std::is_same< typename Kokkos::View<T,P...>::data_type
+                                  , typename Kokkos::View<T,P...>::HostMirror::data_type
+                                  >::value
+                    )>::type * = 0
+                  )
+{
+  return Kokkos::create_mirror( src );
+}
+
+// Create a mirror view in a new space (specialization for same space)
+template<class Space, class T, class ... P>
+typename Impl::MirrorViewType<Space,T,P ...>::view_type
+create_mirror_view(const Space& , const Kokkos::View<T,P...> & src
+  , typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
+  return src;
+}
+
+// Create a mirror view in a new space (specialization for different space)
+template<class Space, class T, class ... P>
+typename Impl::MirrorViewType<Space,T,P ...>::view_type
+create_mirror_view(const Space& , const Kokkos::View<T,P...> & src
+  , typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
+  return typename Impl::MirrorViewType<Space,T,P ...>::view_type(src.label(),src.layout());
+}
+
+// Create a mirror view and deep_copy in a new space (specialization for same space)
+template<class Space, class T, class ... P>
+typename Impl::MirrorViewType<Space,T,P ...>::view_type
+create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
+  , std::string const& name = ""
+  , typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
+  (void)name;
+  return src;
+}
+
+// Create a mirror view and deep_copy in a new space (specialization for different space)
+template<class Space, class T, class ... P>
+typename Impl::MirrorViewType<Space,T,P ...>::view_type
+create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
+  , std::string const& name = ""
+  , typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
+  using Mirror = typename Impl::MirrorViewType<Space,T,P ...>::view_type;
+  std::string label = name.empty() ? src.label() : name;
+  auto mirror = Mirror(ViewAllocateWithoutInitializing(label), src.layout());
+  deep_copy(mirror, src);
+  return mirror;
+}
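+
+// A minimal usage sketch for the mirror functions above (view names and the
+// device memory space are hypothetical; behavior follows the overloads above):
+//
+//   Kokkos::View<double*, Kokkos::CudaSpace> d_data("d_data", n);
+//   auto h_data = Kokkos::create_mirror_view(d_data); // host alias or new view
+//   Kokkos::deep_copy(h_data, d_data);                // copy device -> host
+//   // or allocate and copy in one step:
+//   auto h_copy = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d_data);
+//
+// When the source already lives in the requested memory space,
+// create_mirror_view and create_mirror_view_and_copy simply return the source.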
+
+} /* namespace Kokkos */
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos { namespace Impl {
+
+template < class Specialize, typename A, typename B >
+struct CommonViewValueType;
+
+template < typename A, typename B >
+struct CommonViewValueType< void, A, B >
+{
+  using value_type = typename std::common_type< A , B >::type;
+};
+
+
+template < class Specialize, class ValueType >
+struct CommonViewAllocProp;
+
+template < class ValueType >
+struct CommonViewAllocProp< void, ValueType >
+{
+  using value_type = ValueType;
+  using scalar_array_type = ValueType;
+
+  template < class ... Views >
+  KOKKOS_INLINE_FUNCTION
+  CommonViewAllocProp( const Views & ... ) {}
+};
+
+
+template < class ... Views >
+struct DeduceCommonViewAllocProp;
+
+// Base case must provide types for:
+// 1. specialize  2. value_type  3. is_view  4. prop_type
+template < class FirstView >
+struct DeduceCommonViewAllocProp< FirstView >
+{
+  using specialize = typename FirstView::traits::specialize;
+
+  using value_type = typename FirstView::traits::value_type;
+
+  enum : bool { is_view = is_view< FirstView >::value };
+
+  using prop_type = CommonViewAllocProp< specialize, value_type >;
+};
+
+
+template < class FirstView, class ... NextViews >
+struct DeduceCommonViewAllocProp< FirstView, NextViews... >
+{
+  using NextTraits = DeduceCommonViewAllocProp< NextViews... >;
+
+  using first_specialize = typename FirstView::traits::specialize;
+  using first_value_type = typename FirstView::traits::value_type;
+
+  enum : bool { first_is_view = is_view< FirstView >::value };
+
+  using next_specialize = typename NextTraits::specialize;
+  using next_value_type = typename NextTraits::value_type;
+
+  enum : bool { next_is_view = NextTraits::is_view };
+
+  // common types
+
+  // determine specialize type
+  // if first and next specialize differ and neither is void, error out
+  static_assert( !(!std::is_same< first_specialize, next_specialize >::value && !std::is_same< first_specialize, void>::value && !std::is_same< void, next_specialize >::value)  , "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void specialize trait allowed" );
+
+  // otherwise choose non-void specialize if either/both are non-void
+  using specialize = typename std::conditional< std::is_same< first_specialize, next_specialize >::value
+                                              , first_specialize
+                                              , typename std::conditional< ( std::is_same< first_specialize, void >::value
+                                                                             && !std::is_same< next_specialize, void >::value)
+                                                                           , next_specialize
+                                                                           , first_specialize
+                                                                         >::type
+                                               >::type;
+
+  using value_type = typename CommonViewValueType< specialize, first_value_type, next_value_type >::value_type;
+
+  enum : bool { is_view = (first_is_view && next_is_view) };
+
+  using prop_type = CommonViewAllocProp< specialize, value_type >;
+};
+
+} // end namespace Impl
+
+template < class ... Views >
+using DeducedCommonPropsType = typename Impl::DeduceCommonViewAllocProp<Views...>::prop_type ;
+
+// User function
+template < class ... Views >
+KOKKOS_INLINE_FUNCTION
+DeducedCommonPropsType<Views...> 
+common_view_alloc_prop( Views const & ... views )
+{
+  return DeducedCommonPropsType<Views...>( views... );
+}
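+
+// Illustrative sketch (hypothetical views): for views whose value types are
+// float and double, common_view_alloc_prop deduces the std::common_type of
+// the value types, i.e. double here:
+//
+//   Kokkos::View<float*>  a("a", n);
+//   Kokkos::View<double*> b("b", n);
+//   auto prop = Kokkos::common_view_alloc_prop(a, b);
+//   // decltype(prop)::value_type is double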
+
+} // namespace Kokkos
+
+
+namespace Kokkos {
+namespace Impl {
+
+using Kokkos::is_view ;
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#include <impl/Kokkos_Atomic_View.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_VIEW_HPP */
+
diff --git a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..352d6316b4a6e062a17f1ed09503dfab3b21b015
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
@@ -0,0 +1,258 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_WORKGRAPHPOLICY_HPP
+#define KOKKOS_WORKGRAPHPOLICY_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+template< class functor_type , class execution_space, class ... policy_args >
+class WorkGraphExec;
+
+}} // namespace Kokkos::Impl
+
+namespace Kokkos {
+
+template< class ... Properties >
+class WorkGraphPolicy
+{
+public:
+
+  using self_type       = WorkGraphPolicy<Properties ... >;
+  using traits          = Kokkos::Impl::PolicyTraits<Properties ... >;
+  using index_type      = typename traits::index_type;
+  using member_type     = index_type;
+  using work_tag        = typename traits::work_tag;
+  using execution_space = typename traits::execution_space;
+  using memory_space    = typename execution_space::memory_space;
+  using graph_type      = Kokkos::Crs<index_type,execution_space,void,index_type>;
+
+  enum : std::int32_t {
+    END_TOKEN       = -1 ,
+    BEGIN_TOKEN     = -2 ,
+    COMPLETED_TOKEN = -3 };
+
+private:
+
+  using ints_type = Kokkos::View<std::int32_t*, memory_space>;
+
+  // Let N = m_graph.numRows(), the total work
+  // m_queue[  0 ..   N-1] = the ready queue
+  // m_queue[  N .. 2*N-1] = the waiting queue counts
+  // m_queue[2*N .. 2*N+1] = the ready queue begin/end hints
+
+  graph_type const m_graph;
+  ints_type        m_queue ;
+
+  KOKKOS_INLINE_FUNCTION
+  void push_work( const std::int32_t w ) const noexcept
+    {
+      const std::int32_t N = m_graph.numRows();
+
+      std::int32_t volatile * const ready_queue = & m_queue[0] ;
+      std::int32_t volatile * const end_hint    = & m_queue[2*N+1] ;
+
+      // Push work to end of queue
+      const std::int32_t j = atomic_fetch_add( end_hint , 1 );
+
+      if ( ( N <= j ) ||
+           ( END_TOKEN != atomic_exchange(ready_queue+j,w) ) ) {
+        // ERROR: past the end of queue or did not replace END_TOKEN
+        Kokkos::abort("WorkGraphPolicy push_work error");
+      }
+
+      memory_fence();
+    }
+
+public:
+
+  /**\brief  Attempt to pop the work item at the head of the queue.
+   *
+   *  Find entry 'i' such that
+   *    ( m_queue[i] != BEGIN_TOKEN ) AND
+   *    ( i == 0 OR m_queue[i-1] == BEGIN_TOKEN )
+   *  if found then
+   *    increment begin hint
+   *    return atomic_exchange( m_queue[i] , BEGIN_TOKEN )
+   *  else if i < total work
+   *    return END_TOKEN
+   *  else
+   *    return COMPLETED_TOKEN
+   *  
+   */
+  KOKKOS_INLINE_FUNCTION
+  std::int32_t pop_work() const noexcept
+    {
+      const std::int32_t N = m_graph.numRows();
+
+      std::int32_t volatile * const ready_queue = & m_queue[0] ;
+      std::int32_t volatile * const begin_hint  = & m_queue[2*N] ;
+
+      // The begin hint is guaranteed to be less than or equal to
+      // the actual begin location in the queue.
+
+      for ( std::int32_t i = *begin_hint ; i < N ; ++i ) {
+
+        const std::int32_t w = ready_queue[i] ;
+
+        if ( w == END_TOKEN ) { return END_TOKEN ; }
+
+        if ( ( w != BEGIN_TOKEN ) &&
+             ( w == atomic_compare_exchange(ready_queue+i,w,(std::int32_t)BEGIN_TOKEN) ) ) {
+          // Attempt to claim ready work index succeeded,
+          // update the hint and return work index
+          atomic_increment( begin_hint );
+          return w ;
+        }
+        // arrive here when ready_queue[i] == BEGIN_TOKEN or the claim attempt lost a race
+      }
+
+      return COMPLETED_TOKEN ;
+    }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void completed_work( std::int32_t w ) const noexcept
+    {
+      Kokkos::memory_fence();
+
+      // Make sure the completed work function's memory accesses are flushed.
+
+      const std::int32_t N = m_graph.numRows();
+
+      std::int32_t volatile * const count_queue = & m_queue[N] ;
+
+      const std::int32_t B = m_graph.row_map(w);
+      const std::int32_t E = m_graph.row_map(w+1);
+
+      for ( std::int32_t i = B ; i < E ; ++i ) {
+        const std::int32_t j = m_graph.entries(i);
+        if ( 1 == atomic_fetch_add(count_queue+j,-1) ) {
+          push_work(j);
+        }
+      }
+    }
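+
+  // Work item lifecycle (per push_work, pop_work, and completed_work above):
+  // an item enters the ready queue via push_work() once its waiting count
+  // reaches zero; a worker claims it through pop_work(), which returns
+  // END_TOKEN when nothing is ready yet and COMPLETED_TOKEN once all N items
+  // are done; completed_work(w) then decrements the waiting counts of w's
+  // dependents and pushes any that reach zero.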
+
+  struct TagInit {};
+  struct TagCount {};
+  struct TagReady {};
+
+  /**\brief  Initialize queue
+   *
+   *  m_queue[0..N-1] = END_TOKEN, the ready queue
+   *  m_queue[N..2*N-1] = 0, the waiting count queue
+   *  m_queue[2*N..2*N+1] = 0, begin/end hints for ready queue
+   */
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagInit , int i ) const noexcept
+    { m_queue[i] = i < m_graph.numRows() ? END_TOKEN : 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagCount , int i ) const noexcept
+    {
+      std::int32_t volatile * const count_queue =
+        & m_queue[ m_graph.numRows() ] ;
+
+      atomic_increment( count_queue + m_graph.entries[i] );
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagReady , int w ) const noexcept
+    {
+      std::int32_t const * const count_queue =
+        & m_queue[ m_graph.numRows() ] ;
+
+      if ( 0 == count_queue[w] ) push_work(w);
+    }
+
+  WorkGraphPolicy( const graph_type & arg_graph )
+    : m_graph(arg_graph)
+    , m_queue( view_alloc( "queue" , WithoutInitializing )
+             , arg_graph.numRows() * 2 + 2 )
+  {
+    { // Initialize
+      using policy_type = RangePolicy<std::int32_t, execution_space, TagInit>;
+      using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
+      const closure_type closure(*this, policy_type(0, m_queue.size()));
+      closure.execute();
+      execution_space::fence();
+    }
+
+    { // execute-after counts
+      using policy_type = RangePolicy<std::int32_t, execution_space, TagCount>;
+      using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
+      const closure_type closure(*this,policy_type(0,m_graph.entries.size()));
+      closure.execute();
+      execution_space::fence();
+    }
+
+    { // Scheduling ready tasks
+      using policy_type = RangePolicy<std::int32_t, execution_space, TagReady>;
+      using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
+      const closure_type closure(*this,policy_type(0,m_graph.numRows()));
+      closure.execute();
+      execution_space::fence();
+    }
+  }
+};
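+
+// A minimal usage sketch (the functor is hypothetical): per the scheduling
+// code above, row w of the graph lists the items whose waiting counts drop
+// when w completes, i.e. the items that depend on w. Each item runs exactly
+// once, after everything it depends on has finished.
+//
+//   using policy_type = Kokkos::WorkGraphPolicy< Kokkos::DefaultExecutionSpace >;
+//   policy_type::graph_type graph = /* ... build row_map / entries ... */;
+//   Kokkos::parallel_for( "work_graph" , policy_type( graph ) , functor );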
+
+} // namespace Kokkos
+
+#ifdef KOKKOS_ENABLE_SERIAL
+#include "impl/Kokkos_Serial_WorkGraphPolicy.hpp"
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+#include "OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp"
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+#include "Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp"
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+#include "Threads/Kokkos_Threads_WorkGraphPolicy.hpp"
+#endif
+
+#endif /* #define KOKKOS_WORKGRAPHPOLICY_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_hwloc.hpp b/packages/kokkos/core/src/Kokkos_hwloc.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..064e5816a5c5e4fbdad6c0f8cdd42033f0f11dbf
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_hwloc.hpp
@@ -0,0 +1,144 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HWLOC_HPP
+#define KOKKOS_HWLOC_HPP
+
+#include <utility>
+
+namespace Kokkos {
+
+/** \brief  Minimal subset of logical 'hwloc' functionality available
+ *          from http://www.open-mpi.org/projects/hwloc/.
+ *
+ *  The calls are NOT thread safe in order to avoid mutexes,
+ *  memory allocations, or other actions which could give the
+ *  runtime system an opportunity to migrate the threads or
+ *  touch allocated memory during the function calls.
+ *
+ *  All calls to these functions should be performed by a thread
+ *  when it has guaranteed exclusive access; e.g., for OpenMP
+ *  within a 'critical' region.
+ */
+namespace hwloc {
+
+/** \brief  Query if hwloc is available */
+bool available();
+
+/** \brief  Query number of available NUMA regions.
+ *          This will be less than the hardware capacity
+ *          if the MPI process is pinned to a NUMA region.
+ */
+unsigned get_available_numa_count();
+
+/** \brief  Query number of available cores per NUMA region.
+ *          This will be less than the hardware capacity
+ *          if the MPI process is pinned to a set of cores.
+ */
+unsigned get_available_cores_per_numa();
+
+/** \brief  Query number of available "hard" threads per core; i.e., hyperthreads */
+unsigned get_available_threads_per_core();
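+
+// Illustrative sketch (values are machine dependent): the three counts above
+// are typically multiplied to obtain the usable hardware thread capacity of
+// the process, e.g.
+//
+//   unsigned capacity = 0;
+//   if ( Kokkos::hwloc::available() ) {
+//     capacity = Kokkos::hwloc::get_available_numa_count()
+//              * Kokkos::hwloc::get_available_cores_per_numa()
+//              * Kokkos::hwloc::get_available_threads_per_core();
+//   }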
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Internal functions for binding persistent spawned threads.
+
+namespace Kokkos {
+namespace hwloc {
+
+/** \brief  Recommend mapping of threads onto cores.
+ *
+ * If thread_count == 0 then choose and set a value.
+ * If use_numa_count == 0 then choose and set a value.
+ * If use_cores_per_numa == 0 then choose and set a value.
+ *
+ * Return 0 if asynchronous.
+ * Return 1 if synchronous and threads_coord[0] is the process core.
+ */
+unsigned thread_mapping( const char * const label ,
+                         const bool allow_async ,
+                         unsigned & thread_count ,
+                         unsigned & use_numa_count ,
+                         unsigned & use_cores_per_numa ,
+                         std::pair<unsigned,unsigned> threads_coord[] );
+
+/** \brief  Query core-coordinate of the current thread
+ *          with respect to the core_topology.
+ *
+ *  As long as the thread is running within the
+ *  process binding, the following conditions hold:
+ *
+ *  core_coordinate.first  < core_topology.first
+ *  core_coordinate.second < core_topology.second
+ */
+std::pair<unsigned,unsigned> get_this_thread_coordinate();
+
+/** \brief  Bind the current thread to a core. */
+bool bind_this_thread( const std::pair<unsigned,unsigned> );
+
+
+/** \brief Can hwloc bind threads? */
+bool can_bind_threads();
+
+/** \brief  Bind the current thread to one of the cores in the list.
+ *          Set that entry to (~0,~0) and return the index.
+ *          If binding fails return ~0.
+ */
+unsigned bind_this_thread( const unsigned               coordinate_count ,
+                           std::pair<unsigned,unsigned> coordinate[] );
+
+/** \brief  Unbind the current thread back to the original process binding */
+bool unbind_this_thread();
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_HWLOC_HPP */
+
diff --git a/packages/kokkos/core/src/Makefile b/packages/kokkos/core/src/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..6ee5fec7165a16af13f3ab687a271d0e92609734
--- /dev/null
+++ b/packages/kokkos/core/src/Makefile
@@ -0,0 +1,101 @@
+ifndef KOKKOS_PATH
+  MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+  KOKKOS_PATH = $(subst Makefile,,$(MAKEFILE_PATH))../..
+endif
+
+PREFIX ?= /usr/local/lib/kokkos
+
+default: build-lib
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+  CXX ?= $(KOKKOS_PATH)/bin/nvcc_wrapper
+else
+  CXX ?= g++
+endif
+
+CXXFLAGS ?= -O3
+LINK ?= $(CXX)
+LDFLAGS ?=
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+include $(KOKKOS_PATH)/core/src/Makefile.generate_header_lists
+include $(KOKKOS_PATH)/core/src/Makefile.generate_build_files
+
+CONDITIONAL_COPIES =
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+  CONDITIONAL_COPIES += copy-cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+  CONDITIONAL_COPIES += copy-threads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+  CONDITIONAL_COPIES += copy-qthreads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+  CONDITIONAL_COPIES += copy-openmp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+  CONDITIONAL_COPIES += copy-rocm
+endif
+
+ifeq ($(KOKKOS_OS),CYGWIN)
+  COPY_FLAG = -u
+endif
+ifeq ($(KOKKOS_OS),Linux)
+  COPY_FLAG = -u
+endif
+ifeq ($(KOKKOS_OS),Darwin)
+  COPY_FLAG =
+endif
+
+ifeq ($(KOKKOS_DEBUG),"no")
+  KOKKOS_DEBUG_CMAKE = OFF
+else
+  KOKKOS_DEBUG_CMAKE = ON
+endif
+
+build-lib: $(KOKKOS_LINK_DEPENDS)
+
+mkdir:
+	mkdir -p $(PREFIX)
+	mkdir -p $(PREFIX)/bin
+	mkdir -p $(PREFIX)/include
+	mkdir -p $(PREFIX)/lib
+	mkdir -p $(PREFIX)/include/impl
+
+copy-cuda: mkdir
+	mkdir -p $(PREFIX)/include/Cuda
+	cp $(COPY_FLAG) $(KOKKOS_HEADERS_CUDA) $(PREFIX)/include/Cuda
+
+copy-threads: mkdir
+	mkdir -p $(PREFIX)/include/Threads
+	cp $(COPY_FLAG) $(KOKKOS_HEADERS_THREADS) $(PREFIX)/include/Threads
+
+copy-qthreads: mkdir
+	mkdir -p $(PREFIX)/include/Qthreads
+	cp $(COPY_FLAG) $(KOKKOS_HEADERS_QTHREADS) $(PREFIX)/include/Qthreads
+
+copy-openmp: mkdir
+	mkdir -p $(PREFIX)/include/OpenMP
+	cp $(COPY_FLAG) $(KOKKOS_HEADERS_OPENMP) $(PREFIX)/include/OpenMP
+
+copy-rocm: mkdir
+	mkdir -p $(PREFIX)/include/ROCm
+	cp $(COPY_FLAG) $(KOKKOS_HEADERS_ROCM) $(PREFIX)/include/ROCm
+
+install: mkdir $(CONDITIONAL_COPIES) build-lib generate_build_settings
+	cp $(COPY_FLAG) $(NVCC_WRAPPER) $(PREFIX)/bin
+	cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
+	cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
+	cp $(COPY_FLAG) $(KOKKOS_MAKEFILE)  $(PREFIX)
+	cp $(COPY_FLAG) $(KOKKOS_CMAKEFILE)  $(PREFIX)
+	cp $(COPY_FLAG) libkokkos.a $(PREFIX)/lib
+	cp $(COPY_FLAG) $(KOKKOS_CONFIG_HEADER) $(PREFIX)/include
+
+clean: kokkos-clean
+	rm -f $(KOKKOS_MAKEFILE) $(KOKKOS_CMAKEFILE) 
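+
+# Illustrative invocation (paths, architecture, and device list are examples,
+# not defaults):
+#
+#   make -f $(KOKKOS_PATH)/core/src/Makefile install \
+#        KOKKOS_DEVICES=OpenMP KOKKOS_ARCH=HSW PREFIX=$(HOME)/kokkos-install
+#
+# 'install' builds libkokkos.a, copies the headers for the enabled backends,
+# and writes the generated Makefile.kokkos / kokkos_generated_settings.cmake
+# into PREFIX.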
diff --git a/packages/kokkos/core/src/Makefile.generate_build_files b/packages/kokkos/core/src/Makefile.generate_build_files
new file mode 100644
index 0000000000000000000000000000000000000000..56d9596b04ba82bbe60fe771577bb3d8d7613927
--- /dev/null
+++ b/packages/kokkos/core/src/Makefile.generate_build_files
@@ -0,0 +1,111 @@
+# This file generates the files used by the build systems (make and cmake)
+# in scenarios where the Kokkos library is installed before building the
+# application.
+
+# These files are generated by this makefile
+KOKKOS_MAKEFILE=Makefile.kokkos
+KOKKOS_CMAKEFILE=kokkos_generated_settings.cmake
+
+ifeq ($(KOKKOS_DEBUG),"no")
+  KOKKOS_DEBUG_CMAKE = OFF
+else
+  KOKKOS_DEBUG_CMAKE = ON
+endif
+
+# Functions for generating makefile and cmake file
+# When calling these routines, do not put a space after the comma;
+# e.g., $(call kokkos_append_var,KOKKOS_PATH,$(PREFIX))
+kokkos_append_makefile = echo $1 >> $(KOKKOS_MAKEFILE)
+kokkos_append_cmakefile = echo $1 >> $(KOKKOS_CMAKEFILE)
+
+kokkos_setvar_cmakefile = echo set\($1 $2\) >> $(KOKKOS_CMAKEFILE)
+kokkos_setlist_cmakefile = echo set\($1 \"$2\"\) >> $(KOKKOS_CMAKEFILE)
+
+kokkos_appendvar_makefile = echo $1 = $($(1)) >> $(KOKKOS_MAKEFILE)
+kokkos_appendvar2_makefile = echo $1 ?= $($(1)) >> $(KOKKOS_MAKEFILE)
+kokkos_appendvar_cmakefile = echo set\($1 $($(1)) CACHE $2 FORCE\) >> $(KOKKOS_CMAKEFILE)
+kokkos_appendval_makefile = echo $1 = $2 >> $(KOKKOS_MAKEFILE)
+kokkos_appendval_cmakefile = echo set\($1 $2 CACHE $3 FORCE\) >> $(KOKKOS_CMAKEFILE)
+kokkos_append_gmakevar_cmakefile = echo set\(KOKKOS_GMAKE_$(1:KOKKOS_%=%) \"$($(1))\" CACHE $2 FORCE\) >> $(KOKKOS_CMAKEFILE)
+
+kokkos_append_string = $(call kokkos_append_makefile,$1); $(call kokkos_append_cmakefile,$1)
+kokkos_append_var = $(call kokkos_appendvar_makefile,$1); $(call kokkos_appendvar_cmakefile,$1,$2)
+kokkos_append_var2 = $(call kokkos_appendvar2_makefile,$1); $(call kokkos_appendvar_cmakefile,$1,$2)
+kokkos_append_varval = $(call kokkos_appendval_makefile,$1,$2); $(call kokkos_appendval_cmakefile,$1,$2,$3)
+
+# This function should be used for variables whose values are different in GNU Make versus CMake,
+# especially lists which are delimited by commas in one case and semicolons in the other.
+kokkos_append_gmakevar = $(call kokkos_appendvar_makefile,$1); $(call kokkos_append_gmakevar_cmakefile,$1,$2)
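+
+# Illustrative expansion (assuming KOKKOS_DEVICES = OpenMP,Serial):
+#   $(call kokkos_append_gmakevar,KOKKOS_DEVICES,'STRING "Kokkos devices list"')
+# appends the line
+#   KOKKOS_DEVICES = OpenMP,Serial
+# to Makefile.kokkos and the line
+#   set(KOKKOS_GMAKE_DEVICES "OpenMP,Serial" CACHE STRING "Kokkos devices list" FORCE)
+# to kokkos_generated_settings.cmake.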
+
+generate_build_settings: $(KOKKOS_CONFIG_HEADER)
+	@rm -f $(KOKKOS_MAKEFILE)
+	@rm -f $(KOKKOS_CMAKEFILE)
+	@$(call kokkos_append_string, "#Global Settings used to generate this library")
+	@$(call kokkos_append_varval,KOKKOS_PATH,$(KOKKOS_INSTALL_PATH),'FILEPATH "Kokkos installation path"')
+	@$(call kokkos_append_gmakevar,KOKKOS_DEVICES,'STRING "Kokkos devices list"')
+	@$(call kokkos_append_gmakevar,KOKKOS_ARCH,'STRING "Kokkos architecture flags"')
+	@$(call kokkos_appendvar_makefile,KOKKOS_DEBUG)
+	@$(call kokkos_appendvar_cmakefile,KOKKOS_DEBUG_CMAKE,'BOOL "Kokkos debug enabled ?"')
+	@$(call kokkos_append_gmakevar,KOKKOS_USE_TPLS,'STRING "Kokkos templates list"')
+	@$(call kokkos_append_var,KOKKOS_CXX_STANDARD,'STRING "Kokkos C++ standard"')
+	@$(call kokkos_append_gmakevar,KOKKOS_OPTIONS,'STRING "Kokkos options"')
+	@$(call kokkos_append_gmakevar,KOKKOS_CUDA_OPTIONS,'STRING "Kokkos Cuda options"')
+	@$(call kokkos_append_gmakevar,KOKKOS_TPL_INCLUDE_DIRS,'STRING "Kokkos TPL include directories"')
+	@$(call kokkos_append_gmakevar,KOKKOS_TPL_LIBRARY_DIRS,'STRING "Kokkos TPL library directories"')
+	@$(call kokkos_append_gmakevar,KOKKOS_TPL_LIBRARY_NAMES,'STRING "Kokkos TPL library names"')
+	@$(call kokkos_appendvar2,CXX,'KOKKOS C++ Compiler')
+	@$(call kokkos_append_cmakefile,"if(NOT DEFINED ENV{NVCC_WRAPPER})")
+	@$(call kokkos_append_var2,NVCC_WRAPPER,'FILEPATH "Path to command nvcc_wrapper"')
+	@$(call kokkos_append_cmakefile,"else()")
+	@$(call kokkos_append_cmakefile,'  set(NVCC_WRAPPER $$ENV{NVCC_WRAPPER} CACHE FILEPATH "Path to command nvcc_wrapper")')
+	@$(call kokkos_append_cmakefile,"endif()")
+	@$(call kokkos_append_string,"")
+	@$(call kokkos_append_string,"#Source and Header files of Kokkos relative to KOKKOS_PATH")
+	@$(call kokkos_append_var,KOKKOS_HEADERS,'STRING "Kokkos headers list"')
+	@$(call kokkos_append_var,KOKKOS_HEADERS_IMPL,'STRING "Kokkos headers impl list"')
+	@$(call kokkos_append_var,KOKKOS_HEADERS_CUDA,'STRING "Kokkos headers Cuda list"')
+	@$(call kokkos_append_var,KOKKOS_HEADERS_OPENMP,'STRING "Kokkos headers OpenMP list"')
+	@$(call kokkos_append_var,KOKKOS_HEADERS_ROCM,'STRING "Kokkos headers ROCm list"')
+	@$(call kokkos_append_var,KOKKOS_HEADERS_THREADS,'STRING "Kokkos headers Threads list"')
+	@$(call kokkos_append_var,KOKKOS_HEADERS_QTHREADS,'STRING "Kokkos headers QThreads list"')
+	@$(call kokkos_append_var,KOKKOS_SRC,'STRING "Kokkos source list"')
+	@$(call kokkos_append_string,"")
+	@$(call kokkos_append_string,"#Variables used in application Makefiles")
+	@$(call kokkos_append_var,KOKKOS_OS,'STRING ""')  # This was not in original cmake gen
+	@$(call kokkos_append_var,KOKKOS_CPP_DEPENDS,'STRING ""')
+	@$(call kokkos_append_var,KOKKOS_LINK_DEPENDS,'STRING ""')
+	@$(call kokkos_append_var,KOKKOS_CXXFLAGS,'STRING ""')
+	@$(call kokkos_append_var,KOKKOS_CPPFLAGS,'STRING ""')
+	@$(call kokkos_append_var,KOKKOS_LDFLAGS,'STRING ""')
+	@$(call kokkos_append_var,KOKKOS_LIBS,'STRING ""')
+	@$(call kokkos_append_var,KOKKOS_EXTRA_LIBS,'STRING ""')
+	@$(call kokkos_append_var,KOKKOS_LINK_FLAGS,'STRING "extra flags to the link step (e.g. OpenMP)"')
+	@$(call kokkos_append_string,"")
+	@$(call kokkos_append_string,"#Internal settings which need to be propagated for Kokkos examples")
+	@$(call kokkos_append_var,KOKKOS_INTERNAL_USE_CUDA,'STRING ""')
+	@$(call kokkos_append_var,KOKKOS_INTERNAL_USE_OPENMP,'STRING ""')
+	@$(call kokkos_append_var,KOKKOS_INTERNAL_USE_PTHREADS,'STRING ""')
+	@$(call kokkos_append_var,KOKKOS_INTERNAL_USE_SERIAL,'STRING ""')
+	@$(call kokkos_append_var,KOKKOS_INTERNAL_USE_ROCM,'STRING ""')
+	@$(call kokkos_append_var,KOKKOS_INTERNAL_USE_QTHREADS,'STRING ""') # Not in original cmake gen
+	@$(call kokkos_append_cmakefile,"mark_as_advanced(KOKKOS_HEADERS KOKKOS_SRC KOKKOS_INTERNAL_USE_CUDA KOKKOS_INTERNAL_USE_OPENMP KOKKOS_INTERNAL_USE_PTHREADS KOKKOS_INTERNAL_USE_SERIAL)")
+	@$(call kokkos_append_makefile,"")
+	@$(call kokkos_append_makefile,"#Fake kokkos-clean target")
+	@$(call kokkos_append_makefile,"kokkos-clean:")
+	@$(call kokkos_append_makefile,"")
+	@sed \
+		-e 's|$(KOKKOS_PATH)/core/src|$(PREFIX)/include|g' \
+		-e 's|$(KOKKOS_PATH)/containers/src|$(PREFIX)/include|g' \
+		-e 's|$(KOKKOS_PATH)/algorithms/src|$(PREFIX)/include|g' \
+		-e 's|-L$(PWD)|-L$(PREFIX)/lib|g' \
+		-e 's|= libkokkos.a|= $(PREFIX)/lib/libkokkos.a|g' \
+		-e 's|= $(KOKKOS_CONFIG_HEADER)|= $(PREFIX)/include/$(KOKKOS_CONFIG_HEADER)|g' $(KOKKOS_MAKEFILE) \
+		> $(KOKKOS_MAKEFILE).tmp
+	@mv -f $(KOKKOS_MAKEFILE).tmp $(KOKKOS_MAKEFILE)
+	@$(call kokkos_setvar_cmakefile,KOKKOS_CXX_FLAGS,$(KOKKOS_CXXFLAGS))
+	@$(call kokkos_setvar_cmakefile,KOKKOS_CPP_FLAGS,$(KOKKOS_CPPFLAGS))
+	@$(call kokkos_setvar_cmakefile,KOKKOS_LD_FLAGS,$(KOKKOS_LDFLAGS))
+	@$(call kokkos_setlist_cmakefile,KOKKOS_LIBS_LIST,$(KOKKOS_LIBS))
+	@$(call kokkos_setlist_cmakefile,KOKKOS_EXTRA_LIBS_LIST,$(KOKKOS_EXTRA_LIBS))
+	@$(call kokkos_setvar_cmakefile,KOKKOS_LINK_FLAGS,$(KOKKOS_LINK_FLAGS))
+
diff --git a/packages/kokkos/core/src/Makefile.generate_header_lists b/packages/kokkos/core/src/Makefile.generate_header_lists
new file mode 100644
index 0000000000000000000000000000000000000000..cd308bf8f4a88328601105d08cbc37afed54deb9
--- /dev/null
+++ b/packages/kokkos/core/src/Makefile.generate_header_lists
@@ -0,0 +1,28 @@
+# Build a List of Header Files
+
+KOKKOS_HEADERS_INCLUDE       = $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
+KOKKOS_HEADERS_INCLUDE_IMPL  = $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
+KOKKOS_HEADERS_INCLUDE      += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp)
+KOKKOS_HEADERS_INCLUDE_IMPL += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp)
+KOKKOS_HEADERS_INCLUDE      += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp)
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+  KOKKOS_HEADERS_CUDA += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+  KOKKOS_HEADERS_THREADS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+  KOKKOS_HEADERS_QTHREADS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+  KOKKOS_HEADERS_OPENMP += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+  KOKKOS_HEADERS_ROCM += $(wildcard $(KOKKOS_PATH)/core/src/ROCm/*.hpp)
+endif
+
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b04afe4184dbc57fc3e4bbf5143f4d838452b01
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
@@ -0,0 +1,475 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_OPENMP )
+
+#include <cstdio>
+#include <cstdlib>
+
+#include <limits>
+#include <iostream>
+#include <vector>
+
+#include <Kokkos_Core.hpp>
+
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_CPUDiscovery.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+
+namespace Kokkos {
+namespace Impl {
+
+int g_openmp_hardware_max_threads = 1;
+
+__thread int t_openmp_hardware_id = 0;
+__thread Impl::OpenMPExec * t_openmp_instance = nullptr;
+
+void OpenMPExec::validate_partition( const int nthreads
+                                   , int & num_partitions
+                                   , int & partition_size
+                                  )
+{
+  if (nthreads == 1) {
+    num_partitions = 1;
+    partition_size = 1;
+  }
+  else if( num_partitions < 1 && partition_size < 1) {
+    int idle = nthreads;
+    for (int np = 2; np <= nthreads ; ++np) {
+      for (int ps = 1; ps <= nthreads/np; ++ps) {
+        if (nthreads - np*ps < idle) {
+          idle = nthreads - np*ps;
+          num_partitions = np;
+          partition_size = ps;
+        }
+        if (idle == 0) {
+          break;
+        }
+      }
+    }
+  }
+  else if( num_partitions < 1 && partition_size > 0 ) {
+    if ( partition_size <= nthreads ) {
+      num_partitions = nthreads / partition_size;
+    }
+    else {
+      num_partitions = 1;
+      partition_size = nthreads;
+    }
+  }
+  else if( num_partitions > 0 && partition_size < 1 ) {
+    if ( num_partitions <= nthreads ) {
+      partition_size = nthreads / num_partitions;
+    }
+    else {
+      num_partitions = nthreads;
+      partition_size = 1;
+    }
+  }
+  else if ( num_partitions * partition_size > nthreads ) {
+    int idle = nthreads;
+    const int NP = num_partitions;
+    const int PS = partition_size;
+    for (int np = NP; np > 0; --np) {
+      for (int ps = PS; ps > 0; --ps) {
+        if (  (np*ps <= nthreads)
+           && (nthreads - np*ps < idle) ) {
+          idle = nthreads - np*ps;
+          num_partitions = np;
+          partition_size = ps;
+        }
+        if (idle == 0) {
+          break;
+        }
+      }
+    }
+  }
+
+}
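+
+// Worked example (hypothetical inputs): with nthreads = 16 and both
+// num_partitions and partition_size passed in as 0, the search above keeps
+// the first zero-idle factorization it finds, num_partitions = 2 and
+// partition_size = 8. With nthreads = 16 and a request of 5 x 5 (25 > 16),
+// the shrinking loop settles on num_partitions = 4, partition_size = 4
+// (zero idle threads).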
+
+void OpenMPExec::verify_is_master( const char * const label )
+{
+  if ( !t_openmp_instance )
+  {
+    std::string msg( label );
+    msg.append( " ERROR: in parallel or not initialized" );
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+}
+
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void OpenMPExec::clear_thread_data()
+{
+  const size_t member_bytes =
+    sizeof(int64_t) *
+    HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );
+
+  const int old_alloc_bytes =
+    m_pool[0] ? ( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ;
+
+  OpenMP::memory_space space ;
+
+  #pragma omp parallel num_threads( m_pool_size )
+  {
+    const int rank = omp_get_thread_num();
+
+    if ( 0 != m_pool[rank] ) {
+
+      m_pool[rank]->disband_pool();
+
+      space.deallocate( m_pool[rank] , old_alloc_bytes );
+
+      m_pool[rank] = 0 ;
+    }
+  }
+/* END #pragma omp parallel */
+}
+
+void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
+                                   , size_t team_reduce_bytes
+                                   , size_t team_shared_bytes
+                                   , size_t thread_local_bytes )
+{
+  const size_t member_bytes =
+    sizeof(int64_t) *
+    HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );
+
+  HostThreadTeamData * root = m_pool[0] ;
+
+  const size_t old_pool_reduce  = root ? root->pool_reduce_bytes() : 0 ;
+  const size_t old_team_reduce  = root ? root->team_reduce_bytes() : 0 ;
+  const size_t old_team_shared  = root ? root->team_shared_bytes() : 0 ;
+  const size_t old_thread_local = root ? root->thread_local_bytes() : 0 ;
+  const size_t old_alloc_bytes  = root ? ( member_bytes + root->scratch_bytes() ) : 0 ;
+
+  // Allocate if any part of the old allocation is too small:
+
+  const bool allocate = ( old_pool_reduce  < pool_reduce_bytes ) ||
+                        ( old_team_reduce  < team_reduce_bytes ) ||
+                        ( old_team_shared  < team_shared_bytes ) ||
+                        ( old_thread_local < thread_local_bytes );
+
+  if ( allocate ) {
+
+    if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
+    if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
+    if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
+    if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }
+
+    const size_t alloc_bytes =
+      member_bytes +
+      HostThreadTeamData::scratch_size( pool_reduce_bytes
+                                      , team_reduce_bytes
+                                      , team_shared_bytes
+                                      , thread_local_bytes );
+
+    OpenMP::memory_space space ;
+
+    memory_fence();
+
+    #pragma omp parallel num_threads(m_pool_size)
+    {
+      const int rank = omp_get_thread_num();
+
+      if ( 0 != m_pool[rank] ) {
+
+        m_pool[rank]->disband_pool();
+
+        space.deallocate( m_pool[rank] , old_alloc_bytes );
+      }
+
+      void * const ptr = space.allocate( alloc_bytes );
+
+      m_pool[ rank ] = new( ptr ) HostThreadTeamData();
+
+      m_pool[ rank ]->
+        scratch_assign( ((char *)ptr) + member_bytes
+                      , alloc_bytes
+                      , pool_reduce_bytes
+                      , team_reduce_bytes
+                      , team_shared_bytes
+                      , thread_local_bytes
+                      );
+
+      memory_fence();
+    }
+/* END #pragma omp parallel */
+
+    HostThreadTeamData::organize_pool( m_pool , m_pool_size );
+  }
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+int OpenMP::get_current_max_threads() noexcept
+{
+  // Using omp_get_max_threads() is problematic in conjunction with
+  // hwloc on Intel: an initial call into the OpenMP runtime without a
+  // parallel region sets a process mask for a single core. On entering
+  // the first parallel region the runtime then binds threads to other
+  // cores and makes the process mask the aggregate of the thread masks.
+  // The intent seems to be to make serial code run fast if you compile
+  // with OpenMP enabled but never actually use parallel regions.
+  // static int omp_max_threads = omp_get_max_threads();
+
+  int count = 0;
+  #pragma omp parallel
+  {
+    #pragma omp atomic
+     ++count;
+  }
+  return count;
+}
+
+
+void OpenMP::initialize( int thread_count )
+{
+  if ( omp_in_parallel() ) {
+    std::string msg("Kokkos::OpenMP::initialize ERROR : in parallel");
+    Kokkos::Impl::throw_runtime_exception(msg);
+  }
+
+  if ( Impl::t_openmp_instance )
+  {
+    finalize();
+  }
+
+  {
+    if ( Kokkos::show_warnings() && nullptr == std::getenv("OMP_PROC_BIND") ) {
+      printf("Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set\n");
+      printf("  In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads\n");
+      printf("  For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n");
+      printf("  For unit testing set OMP_PROC_BIND=false\n");
+    }
+
+    OpenMP::memory_space space ;
+
+    // Before any other call to OMP query the maximum number of threads
+    // and save the value for re-initialization unit testing.
+
+    Impl::g_openmp_hardware_max_threads = get_current_max_threads();
+
+    int process_num_threads = Impl::g_openmp_hardware_max_threads;
+
+    if ( Kokkos::hwloc::available() ) {
+      process_num_threads = Kokkos::hwloc::get_available_numa_count()
+                          * Kokkos::hwloc::get_available_cores_per_numa()
+                          * Kokkos::hwloc::get_available_threads_per_core();
+    }
+
+    // if thread_count  < 0, use g_openmp_hardware_max_threads;
+    // if thread_count == 0, set g_openmp_hardware_max_threads to process_num_threads
+    // if thread_count  > 0, set g_openmp_hardware_max_threads to thread_count
+    if (thread_count < 0 ) {
+      thread_count = Impl::g_openmp_hardware_max_threads;
+    }
+    else if( thread_count == 0 && Impl::g_openmp_hardware_max_threads != process_num_threads ) {
+      Impl::g_openmp_hardware_max_threads = process_num_threads;
+      omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
+    }
+    else {
+      if( Kokkos::show_warnings() && thread_count > process_num_threads ) {
+        printf( "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores.\n");
+        printf( "  process threads available : %3d,  requested thread : %3d\n", process_num_threads, thread_count );
+      }
+      Impl::g_openmp_hardware_max_threads = thread_count;
+      omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
+    }
+
+    // setup thread local
+    #pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads)
+    {
+      Impl::t_openmp_instance = nullptr;
+      Impl::t_openmp_hardware_id = omp_get_thread_num();
+      Impl::SharedAllocationRecord< void, void >::tracking_enable();
+    }
+
+    void * const ptr = space.allocate( sizeof(Impl::OpenMPExec) );
+
+    Impl::t_openmp_instance = new (ptr) Impl::OpenMPExec( Impl::g_openmp_hardware_max_threads );
+
+    // New, unified host thread team data:
+    {
+      size_t pool_reduce_bytes  =   32 * thread_count ;
+      size_t team_reduce_bytes  =   32 * thread_count ;
+      size_t team_shared_bytes  = 1024 * thread_count ;
+      size_t thread_local_bytes = 1024 ;
+
+      Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes
+                                                 , team_reduce_bytes
+                                                 , team_shared_bytes
+                                                 , thread_local_bytes
+                                                 );
+    }
+  }
+
+
+  // Check for over-subscription
+  if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) {
+    std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
+    std::cout << "                                    Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
+    std::cout << "                                    Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
+    std::cout << "                                    Requested: " << thread_count << " threads per process." << std::endl;
+  }
+  // Init the array for used for arbitrarily sized atomics
+  Impl::init_lock_array_host_space();
+
+  #if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+  #endif
+}
+
+//----------------------------------------------------------------------------
+
+void OpenMP::finalize()
+{
+  if ( omp_in_parallel() )
+  {
+    std::string msg("Kokkos::OpenMP::finalize ERROR ");
+    if( !Impl::t_openmp_instance ) msg.append(": not initialized");
+    if( omp_in_parallel() ) msg.append(": in parallel");
+    Kokkos::Impl::throw_runtime_exception(msg);
+  }
+
+  if ( Impl::t_openmp_instance ) {
+    // Silence Cuda Warning
+    const int nthreads = Impl::t_openmp_instance->m_pool_size <= Impl::g_openmp_hardware_max_threads
+                       ? Impl::g_openmp_hardware_max_threads
+                       : Impl::t_openmp_instance->m_pool_size;
+    (void) nthreads;
+
+    using Exec = Impl::OpenMPExec;
+    Exec * instance = Impl::t_openmp_instance;
+    instance->~Exec();
+
+    OpenMP::memory_space space;
+    space.deallocate( instance, sizeof(Exec) );
+
+    #pragma omp parallel num_threads(nthreads)
+    {
+      Impl::t_openmp_hardware_id = 0;
+      Impl::t_openmp_instance    = nullptr;
+      Impl::SharedAllocationRecord< void, void >::tracking_disable();
+    }
+
+    // allow main thread to track
+    Impl::SharedAllocationRecord< void, void >::tracking_enable();
+
+    Impl::g_openmp_hardware_max_threads = 1;
+  }
+
+  #if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+  #endif
+}
+
+//----------------------------------------------------------------------------
+
+void OpenMP::print_configuration( std::ostream & s , const bool verbose )
+{
+  s << "Kokkos::OpenMP" ;
+
+  const bool is_initialized =  Impl::t_openmp_instance != nullptr;
+
+  if ( is_initialized ) {
+    Impl::OpenMPExec::verify_is_master( "OpenMP::print_configuration" );
+
+    const int numa_count      = 1;
+    const int core_per_numa   = Impl::g_openmp_hardware_max_threads;
+    const int thread_per_core = 1;
+
+    s << " thread_pool_topology[ " << numa_count
+      << " x " << core_per_numa
+      << " x " << thread_per_core
+      << " ]"
+      << std::endl ;
+  }
+  else {
+    s << " not initialized" << std::endl ;
+  }
+}
+
+std::vector<OpenMP> OpenMP::partition(...)
+{ return std::vector<OpenMP>(1); }
+
+OpenMP OpenMP::create_instance(...) { return OpenMP(); }
+
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+
+int OpenMP::concurrency() {
+  return Impl::g_openmp_hardware_max_threads;
+}
+
+void OpenMP::initialize( int thread_count , int, int )
+{
+  initialize(thread_count);
+}
+
+#endif
+
+} // namespace Kokkos
+
+#else
+void KOKKOS_CORE_SRC_OPENMP_EXEC_PREVENT_LINK_ERROR() {}
+#endif //KOKKOS_ENABLE_OPENMP
+
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..870c84d56cc9ed883855b76bfe7ff35f37b55991
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
@@ -0,0 +1,367 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMPEXEC_HPP
+#define KOKKOS_OPENMPEXEC_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_OPENMP )
+
+#if !defined(_OPENMP) && !defined(__CUDA_ARCH__)
+#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
+#endif
+
+#include <Kokkos_OpenMP.hpp>
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_HostThreadTeam.hpp>
+
+#include <Kokkos_Atomic.hpp>
+
+#include <Kokkos_UniqueToken.hpp>
+
+#include <iostream>
+#include <sstream>
+#include <fstream>
+
+#include <omp.h>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos { namespace Impl {
+
+class OpenMPExec;
+
+extern int g_openmp_hardware_max_threads;
+
+extern __thread int t_openmp_hardware_id;
+extern __thread OpenMPExec * t_openmp_instance;
+
+//----------------------------------------------------------------------------
+/** \brief  Data for OpenMP thread execution */
+
+class OpenMPExec {
+public:
+
+  friend class Kokkos::OpenMP ;
+
+  enum { MAX_THREAD_COUNT = 512 };
+
+  void clear_thread_data();
+
+  static void validate_partition( const int nthreads
+                                , int & num_partitions
+                                , int & partition_size
+                                );
+
+private:
+  OpenMPExec( int arg_pool_size )
+    : m_pool_size{ arg_pool_size }
+    , m_level{ omp_get_level() }
+    , m_pool()
+  {}
+
+  ~OpenMPExec()
+  {
+    clear_thread_data();
+  }
+
+  int m_pool_size;
+  int m_level;
+
+  HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ];
+
+public:
+
+  static void verify_is_master( const char * const );
+
+  void resize_thread_data( size_t pool_reduce_bytes
+                         , size_t team_reduce_bytes
+                         , size_t team_shared_bytes
+                         , size_t thread_local_bytes );
+
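+  // When called at the same OpenMP nesting level at which this instance was
+  // created (i.e. outside a parallel region launched by this backend) the
+  // master thread's data is returned; inside a parallel region the pool is
+  // indexed by the calling thread's omp_get_thread_num().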
+  inline
+  HostThreadTeamData * get_thread_data() const noexcept
+  { return m_pool[ m_level == omp_get_level() ? 0 : omp_get_thread_num() ]; }
+
+  inline
+  HostThreadTeamData * get_thread_data( int i ) const noexcept
+  { return m_pool[i]; }
+};
+
+}} // namespace Kokkos::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+inline OpenMP::OpenMP() noexcept
+{}
+
+inline
+bool OpenMP::is_initialized() noexcept
+{ return Impl::t_openmp_instance != nullptr; }
+
+inline
+bool OpenMP::in_parallel( OpenMP const& ) noexcept
+{
+  // t_openmp_instance is only non-null on a master thread
+  return   !Impl::t_openmp_instance
+         || Impl::t_openmp_instance->m_level < omp_get_level()
+         ;
+}
+
+inline
+int OpenMP::thread_pool_size() noexcept
+{
+  return   OpenMP::in_parallel()
+         ? omp_get_num_threads()
+         : Impl::t_openmp_instance->m_pool_size
+         ;
+}
+
+KOKKOS_INLINE_FUNCTION
+int OpenMP::thread_pool_rank() noexcept
+{
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  return Impl::t_openmp_instance ? 0 : omp_get_thread_num();
+#else
+  return -1 ;
+#endif
+}
+
+inline
+void OpenMP::fence( OpenMP const& instance ) noexcept {}
+
+inline
+bool OpenMP::is_asynchronous( OpenMP const& instance ) noexcept
+{ return false; }
+
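+// partition_master() splits the calling master thread's pool into
+// num_partitions groups of partition_size threads (when nested OpenMP is
+// enabled) and invokes f( partition_id , num_partitions ) once per
+// partition, each with its own OpenMPExec instance.  A minimal,
+// illustrative call (the sizes below are only an example):
+//
+//   Kokkos::OpenMP::partition_master(
+//     []( int partition_id , int num_partitions ) {
+//       /* run Kokkos kernels on this partition's OpenMP instance */
+//     } , 4 /*num_partitions*/ , 8 /*partition_size*/ );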
+template <typename F>
+void OpenMP::partition_master( F const& f
+                             , int num_partitions
+                             , int partition_size
+                             )
+{
+  if (omp_get_nested()) {
+    using Exec = Impl::OpenMPExec;
+
+    Exec * prev_instance = Impl::t_openmp_instance;
+
+    Exec::validate_partition( prev_instance->m_pool_size, num_partitions, partition_size );
+
+    OpenMP::memory_space space;
+
+    #pragma omp parallel num_threads(num_partitions)
+    {
+      void * const ptr = space.allocate( sizeof(Exec) );
+
+      Impl::t_openmp_instance = new (ptr) Exec( partition_size );
+
+      size_t pool_reduce_bytes  =   32 * partition_size ;
+      size_t team_reduce_bytes  =   32 * partition_size ;
+      size_t team_shared_bytes  = 1024 * partition_size ;
+      size_t thread_local_bytes = 1024 ;
+
+      Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes
+                                                 , team_reduce_bytes
+                                                 , team_shared_bytes
+                                                 , thread_local_bytes
+                                                 );
+
+      omp_set_num_threads(partition_size);
+      f( omp_get_thread_num(), omp_get_num_threads() );
+
+      Impl::t_openmp_instance->~Exec();
+      space.deallocate( Impl::t_openmp_instance, sizeof(Exec) );
+      Impl::t_openmp_instance = nullptr;
+    }
+
+    Impl::t_openmp_instance  = prev_instance;
+  }
+  else {
+    // Nested OpenMP is not enabled: execute the functor as a single partition.
+    f(0,1);
+  }
+}
+
+
+namespace Experimental {
+
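+// MasterLock<OpenMP> is a thin, non-copyable, non-movable wrapper around an
+// OpenMP runtime lock (omp_lock_t).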
+template<>
+class MasterLock<OpenMP>
+{
+public:
+  void lock()     { omp_set_lock( &m_lock );   }
+  void unlock()   { omp_unset_lock( &m_lock ); }
+  bool try_lock() { return static_cast<bool>(omp_test_lock( &m_lock )); }
+
+  MasterLock()  { omp_init_lock( &m_lock ); }
+  ~MasterLock() { omp_destroy_lock( &m_lock ); }
+
+  MasterLock( MasterLock const& ) = delete;
+  MasterLock( MasterLock && )     = delete;
+  MasterLock & operator=( MasterLock const& ) = delete;
+  MasterLock & operator=( MasterLock && )     = delete;
+
+private:
+  omp_lock_t m_lock;
+
+};
+
+template<>
+class UniqueToken< OpenMP, UniqueTokenScope::Instance>
+{
+public:
+  using execution_space = OpenMP;
+  using size_type       = int;
+
+  /// \brief Create an object sized for the concurrency of the given instance.
+  ///
+  /// This object should not be shared between instances.
+  UniqueToken( execution_space const& = execution_space() ) noexcept {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int size() const noexcept
+    {
+      #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      return Kokkos::OpenMP::thread_pool_size();
+      #else
+      return 0 ;
+      #endif
+    }
+
+  /// \brief acquire value such that 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int acquire() const  noexcept
+    {
+      #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      return Kokkos::OpenMP::thread_pool_rank();
+      #else
+      return 0 ;
+      #endif
+    }
+
+  /// \brief Release a value previously obtained from acquire().
+  KOKKOS_INLINE_FUNCTION
+  void release( int ) const noexcept {}
+};
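+
+// A minimal usage sketch (illustrative only): acquire a token inside a
+// parallel kernel to index per-thread scratch, then release it.
+//
+//   Kokkos::Experimental::UniqueToken< Kokkos::OpenMP > token ;
+//   Kokkos::parallel_for( n , KOKKOS_LAMBDA( const int i ) {
+//     const int id = token.acquire();   // 0 <= id < token.size()
+//     /* ... use id to index per-thread scratch ... */
+//     token.release( id );
+//   });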
+
+template<>
+class UniqueToken< OpenMP, UniqueTokenScope::Global>
+{
+public:
+  using execution_space = OpenMP;
+  using size_type       = int;
+
+  /// \brief Create an object sized for the concurrency of the given instance.
+  ///
+  /// This object should not be shared between instances.
+  UniqueToken( execution_space const& = execution_space() ) noexcept {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int size() const noexcept
+    {
+      #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      return Kokkos::Impl::g_openmp_hardware_max_threads ;
+      #else
+      return 0 ;
+      #endif
+    }
+
+  /// \brief acquire value such that 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int acquire() const noexcept
+    {
+      #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      return Kokkos::Impl::t_openmp_hardware_id ;
+      #else
+      return 0 ;
+      #endif
+    }
+
+  /// \brief Release a value previously obtained from acquire().
+  KOKKOS_INLINE_FUNCTION
+  void release( int ) const noexcept {}
+};
+
+} // namespace Experimental
+
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+
+inline
+int OpenMP::thread_pool_size( int depth )
+{
+  return depth < 2
+         ? thread_pool_size()
+         : 1;
+}
+
+KOKKOS_INLINE_FUNCTION
+int OpenMP::hardware_thread_id() noexcept
+{
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  return Impl::t_openmp_hardware_id;
+#else
+  return -1 ;
+#endif
+}
+
+inline
+int OpenMP::max_hardware_threads() noexcept
+{
+  return Impl::g_openmp_hardware_max_threads;
+}
+
+#endif
+
+} // namespace Kokkos
+
+#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) */
+#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */
+
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..21416108a3f0543f910859aa747304b4cf508a76
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
@@ -0,0 +1,1132 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMP_PARALLEL_HPP
+#define KOKKOS_OPENMP_PARALLEL_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_OPENMP )
+
+#include <omp.h>
+#include <iostream>
+#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Traits ... >
+                 , Kokkos::OpenMP
+                 >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ...  > Policy ;
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
+
+        OpenMPExec   * m_instance ;
+  const FunctorType    m_functor ;
+  const Policy         m_policy ;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend )
+    {
+      #ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
+      #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+      #pragma ivdep
+      #endif
+      #endif
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( iwork );
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend )
+    {
+      const TagType t{} ;
+      #ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
+      #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+      #pragma ivdep
+      #endif
+      #endif
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( t , iwork );
+      }
+    }
+
+public:
+
+  inline void execute() const
+  {
+    enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
+                                    , Kokkos::Dynamic >::value };
+
+    if ( OpenMP::in_parallel() ) {
+      exec_range< WorkTag >( m_functor
+                           , m_policy.begin()
+                           , m_policy.end() );
+    }
+    else {
+
+      OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");
+
+      const int pool_size = OpenMP::thread_pool_size();
+      #pragma omp parallel num_threads(pool_size)
+      {
+        HostThreadTeamData & data = *(m_instance->get_thread_data());
+
+        data.set_work_partition( m_policy.end() - m_policy.begin()
+            , m_policy.chunk_size() );
+
+        if ( is_dynamic ) {
+          // Make sure work partition is set before stealing
+          if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
+        }
+
+        std::pair<int64_t,int64_t> range(0,0);
+
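+        // With a dynamic schedule, keep stealing chunks until
+        // get_work_stealing_chunk() reports no more work (a negative
+        // begin index); with a static schedule the assigned partition
+        // is executed exactly once.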
+        do {
+
+          range = is_dynamic ? data.get_work_stealing_chunk()
+            : data.get_work_partition();
+
+          ParallelFor::template
+            exec_range< WorkTag >( m_functor
+                , range.first  + m_policy.begin()
+                , range.second + m_policy.begin() );
+
+        } while ( is_dynamic && 0 <= range.first );
+      }
+    }
+  }
+
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , Policy arg_policy )
+    : m_instance( t_openmp_instance )
+    , m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    {}
+};
+
+
+// MDRangePolicy impl
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::MDRangePolicy< Traits ... >
+                 , Kokkos::OpenMP
+                 >
+{
+private:
+
+  typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
+  typedef typename MDRangePolicy::impl_range_policy         Policy ;
+  typedef typename MDRangePolicy::work_tag                  WorkTag ;
+
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
+
+  typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
+
+        OpenMPExec   * m_instance ;
+  const FunctorType   m_functor ;
+  const MDRangePolicy m_mdr_policy ;
+  const Policy        m_policy ;  // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
+
+  inline static
+  void
+  exec_range( const MDRangePolicy & mdr_policy 
+            , const FunctorType & functor
+            , const Member ibeg , const Member iend )
+    {
+      #ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
+      #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+      #pragma ivdep
+      #endif
+      #endif
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        iterate_type( mdr_policy, functor )( iwork );
+      }
+    }
+
+public:
+
+  inline void execute() const
+  {
+      enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
+                                      , Kokkos::Dynamic >::value };
+
+    if ( OpenMP::in_parallel() ) {
+      ParallelFor::exec_range ( m_mdr_policy
+                              , m_functor
+                              , m_policy.begin()
+                              , m_policy.end() );
+    }
+    else {
+
+      OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");
+
+      const int pool_size = OpenMP::thread_pool_size();
+      #pragma omp parallel num_threads(pool_size)
+      {
+        HostThreadTeamData & data = *(m_instance->get_thread_data());
+
+        data.set_work_partition( m_policy.end() - m_policy.begin()
+                               , m_policy.chunk_size() );
+
+        if ( is_dynamic ) {
+          // Make sure work partition is set before stealing
+          if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
+        }
+
+        std::pair<int64_t,int64_t> range(0,0);
+
+        do {
+
+          range = is_dynamic ? data.get_work_stealing_chunk()
+                             : data.get_work_partition();
+
+          ParallelFor::exec_range( m_mdr_policy 
+                                 , m_functor
+                                 , range.first  + m_policy.begin()
+                                 , range.second + m_policy.begin() );
+
+        } while ( is_dynamic && 0 <= range.first );
+      }
+      // END #pragma omp parallel
+    }
+  }
+
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , MDRangePolicy arg_policy )
+    : m_instance( t_openmp_instance )
+    , m_functor( arg_functor )
+    , m_mdr_policy( arg_policy )
+    , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
+    {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ReducerType, class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::RangePolicy< Traits ...>
+                    , ReducerType
+                    , Kokkos::OpenMP
+                    >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
+
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, WorkTag, void>::type WorkTagFwd;
+
+  // Note: WorkTag should be void when ReducerType is not InvalidType (candidate static_assert)
+
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTagFwd > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTagFwd > ValueJoin ;
+
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
+
+        OpenMPExec   * m_instance;
+  const FunctorType    m_functor;
+  const Policy         m_policy;
+  const ReducerType    m_reducer;
+  const pointer_type   m_result_ptr;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update )
+    {
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( iwork , update );
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update )
+    {
+      const TagType t{} ;
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( t , iwork , update );
+      }
+    }
+
+public:
+
+  inline void execute() const
+    {
+      enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
+                                      , Kokkos::Dynamic >::value };
+
+      OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");
+
+      const size_t pool_reduce_bytes =
+        Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
+
+      m_instance->resize_thread_data( pool_reduce_bytes
+                                    , 0 // team_reduce_bytes
+                                    , 0 // team_shared_bytes
+                                    , 0 // thread_local_bytes
+                                    );
+
+      const int pool_size = OpenMP::thread_pool_size();
+      #pragma omp parallel num_threads(pool_size)
+      {
+        HostThreadTeamData & data = *(m_instance->get_thread_data());
+
+        data.set_work_partition( m_policy.end() - m_policy.begin()
+                               , m_policy.chunk_size() );
+
+        if ( is_dynamic ) {
+          // Make sure work partition is set before stealing
+          if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
+        }
+
+        reference_type update =
+          ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
+                         , data.pool_reduce_local() );
+
+        std::pair<int64_t,int64_t> range(0,0);
+
+        do {
+
+          range = is_dynamic ? data.get_work_stealing_chunk()
+                             : data.get_work_partition();
+
+          ParallelReduce::template
+            exec_range< WorkTag >( m_functor
+                                 , range.first  + m_policy.begin()
+                                 , range.second + m_policy.begin()
+                                 , update );
+
+        } while ( is_dynamic && 0 <= range.first );
+      }
+
+      // Reduction:
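+      //
+      // Join every other thread's pool_reduce_local() contribution into
+      // thread 0's buffer, apply the functor/reducer final() step, and
+      // copy the result into the caller-provided destination.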
+
+      const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() );
+
+      for ( int i = 1 ; i < pool_size ; ++i ) {
+        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
+                       , ptr
+                       , m_instance->get_thread_data(i)->pool_reduce_local() );
+      }
+
+      Kokkos::Impl::FunctorFinal<  ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      if ( m_result_ptr ) {
+        const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
+
+        for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
+      }
+    }
+
+  //----------------------------------------
+
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ViewType    & arg_view
+                , typename std::enable_if<
+                           Kokkos::is_view< ViewType >::value &&
+                           !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    : m_instance( t_openmp_instance )
+    , m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr(  arg_view.data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_instance( t_openmp_instance )
+    , m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
+
+};
+
+
+// MDRangePolicy impl
+template< class FunctorType , class ReducerType, class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::MDRangePolicy< Traits ...>
+                    , ReducerType
+                    , Kokkos::OpenMP
+                    >
+{
+private:
+
+  typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
+  typedef typename MDRangePolicy::impl_range_policy         Policy ;
+
+  typedef typename MDRangePolicy::work_tag                  WorkTag ;
+  typedef typename Policy::WorkRange                        WorkRange ;
+  typedef typename Policy::member_type                      Member ;
+
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , MDRangePolicy , FunctorType > Analysis ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, WorkTag, void>::type WorkTagFwd;
+
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTagFwd > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTagFwd > ValueJoin ;
+
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::value_type      value_type ;
+  typedef typename Analysis::reference_type  reference_type ;
+
+  using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
+                                                             , FunctorType
+                                                             , WorkTag
+                                                             , reference_type
+                                                             >;
+
+        OpenMPExec   * m_instance ;
+  const FunctorType   m_functor ;
+  const MDRangePolicy m_mdr_policy ;
+  const Policy        m_policy ;     // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+
+  inline static
+  void
+  exec_range( const MDRangePolicy & mdr_policy
+            , const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update )
+    {
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        iterate_type( mdr_policy, functor, update )( iwork );
+      }
+    }
+
+public:
+
+  inline void execute() const
+    {
+      enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
+                                      , Kokkos::Dynamic >::value };
+
+      OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");
+
+      const size_t pool_reduce_bytes =
+        Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
+
+      m_instance->resize_thread_data( pool_reduce_bytes
+                                    , 0 // team_reduce_bytes
+                                    , 0 // team_shared_bytes
+                                    , 0 // thread_local_bytes
+                                    );
+
+      const int pool_size = OpenMP::thread_pool_size();
+      #pragma omp parallel num_threads(pool_size)
+      {
+        HostThreadTeamData & data = *(m_instance->get_thread_data());
+
+        data.set_work_partition( m_policy.end() - m_policy.begin()
+                               , m_policy.chunk_size() );
+
+        if ( is_dynamic ) {
+          // Make sure work partition is set before stealing
+          if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
+        }
+
+        reference_type update =
+          ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
+                         , data.pool_reduce_local() );
+
+        std::pair<int64_t,int64_t> range(0,0);
+
+        do {
+
+          range = is_dynamic ? data.get_work_stealing_chunk()
+                             : data.get_work_partition();
+
+          ParallelReduce::exec_range ( m_mdr_policy, m_functor
+                                     , range.first  + m_policy.begin()
+                                     , range.second + m_policy.begin()
+                                     , update );
+
+        } while ( is_dynamic && 0 <= range.first );
+      }
+      // END #pragma omp parallel
+
+      // Reduction:
+
+      const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() );
+
+      for ( int i = 1 ; i < pool_size ; ++i ) {
+        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
+                       , ptr
+                       , m_instance->get_thread_data(i)->pool_reduce_local() );
+      }
+
+      Kokkos::Impl::FunctorFinal<  ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      if ( m_result_ptr ) {
+        const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
+
+        for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
+      }
+    }
+
+  //----------------------------------------
+
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , MDRangePolicy       arg_policy
+                , const ViewType    & arg_view
+                , typename std::enable_if<
+                           Kokkos::is_view< ViewType >::value &&
+                           !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    : m_instance( t_openmp_instance )
+    , m_functor( arg_functor )
+    , m_mdr_policy(  arg_policy )
+    , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
+    , m_reducer( InvalidType() )
+    , m_result_ptr(  arg_view.data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , MDRangePolicy       arg_policy
+                , const ReducerType& reducer )
+    : m_instance( t_openmp_instance )
+    , m_functor( arg_functor )
+    , m_mdr_policy(  arg_policy )
+    , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
+
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelScan< FunctorType
+                  , Kokkos::RangePolicy< Traits ... >
+                  , Kokkos::OpenMP
+                  >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+
+  typedef FunctorAnalysis< FunctorPatternInterface::SCAN , Policy , FunctorType > Analysis ;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
+
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType, WorkTag > ValueJoin ;
+  typedef Kokkos::Impl::FunctorValueOps<    FunctorType, WorkTag > ValueOps ;
+
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
+
+        OpenMPExec   * m_instance;
+  const FunctorType    m_functor;
+  const Policy         m_policy;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update , const bool final )
+    {
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( iwork , update , final );
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update , const bool final )
+    {
+      const TagType t{} ;
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( t , iwork , update , final );
+      }
+    }
+
+public:
+
+  inline
+  void execute() const
+    {
+      OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_scan");
+
+      const int    value_count       = Analysis::value_count( m_functor );
+      const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor );
+
+      m_instance->resize_thread_data( pool_reduce_bytes
+                                    , 0 // team_reduce_bytes
+                                    , 0 // team_shared_bytes
+                                    , 0 // thread_local_bytes
+                                    );
+
+      const int pool_size = OpenMP::thread_pool_size();
+      #pragma omp parallel num_threads(pool_size)
+      {
+        HostThreadTeamData & data = *(m_instance->get_thread_data());
+
+        const WorkRange range( m_policy, omp_get_thread_num(), omp_get_num_threads() );
+
+        reference_type update_sum =
+          ValueInit::init( m_functor , data.pool_reduce_local() );
+
+        ParallelScan::template exec_range< WorkTag >
+          ( m_functor , range.begin() , range.end() , update_sum , false );
+
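+        // The single thread for which pool_rendezvous() returns true
+        // performs the inter-thread exclusive scan: for thread i the
+        // second half of its pool_reduce_local() buffer receives the
+        // combined partial sums of all preceding threads.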
+        if ( data.pool_rendezvous() ) {
+
+          pointer_type ptr_prev = 0 ;
+
+          const int n = omp_get_num_threads();
+
+          for ( int i = 0 ; i < n ; ++i ) {
+
+            pointer_type ptr = (pointer_type)
+              data.pool_member(i)->pool_reduce_local();
+
+            if ( i ) {
+              for ( int j = 0 ; j < value_count ; ++j ) {
+                ptr[j+value_count] = ptr_prev[j+value_count] ;
+              }
+              ValueJoin::join( m_functor , ptr + value_count , ptr_prev );
+            }
+            else {
+              ValueInit::init( m_functor , ptr + value_count );
+            }
+
+            ptr_prev = ptr ;
+          }
+
+          data.pool_rendezvous_release();
+        }
+
+        reference_type update_base =
+          ValueOps::reference
+            ( ((pointer_type)data.pool_reduce_local()) + value_count );
+
+        ParallelScan::template exec_range< WorkTag >
+          ( m_functor , range.begin() , range.end() , update_base , true );
+      }
+
+    }
+
+  //----------------------------------------
+
+  inline
+  ParallelScan( const FunctorType & arg_functor
+              , const Policy      & arg_policy )
+    : m_instance( t_openmp_instance )
+    , m_functor( arg_functor )
+    , m_policy(  arg_policy )
+  {}
+
+  //----------------------------------------
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Properties >
+class ParallelFor< FunctorType
+                 , Kokkos::TeamPolicy< Properties ... >
+                 , Kokkos::OpenMP
+                 >
+{
+private:
+
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::OpenMP, Properties ... > Policy ;
+  typedef typename Policy::work_tag             WorkTag ;
+  typedef typename Policy::schedule_type::type  SchedTag ;
+  typedef typename Policy::member_type          Member ;
+
+        OpenMPExec   * m_instance;
+  const FunctorType    m_functor;
+  const Policy         m_policy;
+  const int            m_shmem_size;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ( std::is_same< TagType , void >::value ) >::type
+  exec_team( const FunctorType & functor
+           , HostThreadTeamData & data
+           , const int league_rank_begin
+           , const int league_rank_end
+           , const int league_size )
+    {
+      for ( int r = league_rank_begin ; r < league_rank_end ; ) {
+
+        functor( Member( data, r , league_size ) );
+
+        if ( ++r < league_rank_end ) {
+          // Don't allow team members to lap one another
+          // so that they don't overwrite shared memory.
+          if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
+        }
+      }
+    }
+
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ( ! std::is_same< TagType , void >::value ) >::type
+  exec_team( const FunctorType & functor
+           , HostThreadTeamData & data
+           , const int league_rank_begin
+           , const int league_rank_end
+           , const int league_size )
+    {
+      const TagType t{};
+
+      for ( int r = league_rank_begin ; r < league_rank_end ; ) {
+
+        functor( t , Member( data, r , league_size ) );
+
+        if ( ++r < league_rank_end ) {
+          // Don't allow team members to lap one another
+          // so that they don't overwrite shared memory.
+          if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
+        }
+      }
+    }
+
+public:
+
+  inline
+  void execute() const
+    {
+      enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
+
+      OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");
+
+      const size_t pool_reduce_size = 0 ; // Never shrinks
+      const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
+      const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      m_instance->resize_thread_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
+      const int pool_size = OpenMP::thread_pool_size();
+      #pragma omp parallel num_threads(pool_size)
+      {
+        HostThreadTeamData & data = *(m_instance->get_thread_data());
+
+        const int active = data.organize_team( m_policy.team_size() );
+
+        if ( active ) {
+          data.set_work_partition( m_policy.league_size()
+                                 , ( 0 < m_policy.chunk_size()
+                                   ? m_policy.chunk_size()
+                                   : m_policy.team_iter() ) );
+        }
+
+        if ( is_dynamic ) {
+          // Must synchronize to make sure each team has set its
+          // partition before beginning the work stealing loop.
+          if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
+        }
+
+        if ( active ) {
+
+          std::pair<int64_t,int64_t> range(0,0);
+
+          do {
+
+            range = is_dynamic ? data.get_work_stealing_chunk()
+                               : data.get_work_partition();
+
+            ParallelFor::template exec_team< WorkTag >
+              ( m_functor , data
+              , range.first , range.second , m_policy.league_size() );
+
+          } while ( is_dynamic && 0 <= range.first );
+        }
+
+        data.disband_team();
+      }
+    }
+
+
+  inline
+  ParallelFor( const FunctorType & arg_functor ,
+               const Policy      & arg_policy )
+    : m_instance( t_openmp_instance )
+    , m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_shmem_size( arg_policy.scratch_size(0) +
+                    arg_policy.scratch_size(1) +
+                    FunctorTeamShmemSize< FunctorType >
+                      ::value( arg_functor , arg_policy.team_size() ) )
+    {}
+};
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class ReducerType, class ... Properties >
+class ParallelReduce< FunctorType
+                    , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
+                    , Kokkos::OpenMP
+                    >
+{
+private:
+
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::OpenMP, Properties ... >         Policy ;
+
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
+
+  typedef typename Policy::work_tag             WorkTag ;
+  typedef typename Policy::schedule_type::type  SchedTag ;
+  typedef typename Policy::member_type          Member ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value
+                            , FunctorType, ReducerType> ReducerConditional;
+
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, WorkTag, void>::type WorkTagFwd;
+
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTagFwd >  ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd , WorkTagFwd >  ValueJoin ;
+
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
+
+        OpenMPExec   * m_instance;
+  const FunctorType    m_functor;
+  const Policy         m_policy;
+  const ReducerType    m_reducer;
+  const pointer_type   m_result_ptr;
+  const int            m_shmem_size;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ( std::is_same< TagType , void >::value ) >::type
+  exec_team( const FunctorType & functor
+           , HostThreadTeamData & data
+           , reference_type     & update
+           , const int league_rank_begin
+           , const int league_rank_end
+           , const int league_size )
+    {
+      for ( int r = league_rank_begin ; r < league_rank_end ; ) {
+
+        functor( Member( data, r , league_size ) , update );
+
+        if ( ++r < league_rank_end ) {
+          // Don't allow team members to lap one another
+          // so that they don't overwrite shared memory.
+          if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
+        }
+      }
+    }
+
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ( ! std::is_same< TagType , void >::value ) >::type
+  exec_team( const FunctorType & functor
+           , HostThreadTeamData & data
+           , reference_type     & update
+           , const int league_rank_begin
+           , const int league_rank_end
+           , const int league_size )
+    {
+      const TagType t{};
+
+      for ( int r = league_rank_begin ; r < league_rank_end ; ) {
+
+        functor( t , Member( data, r , league_size ) , update );
+
+        if ( ++r < league_rank_end ) {
+          // Don't allow team members to lap one another
+          // so that they don't overwrite shared memory.
+          if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
+        }
+      }
+    }
+
+public:
+
+  inline
+  void execute() const
+    {
+      enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
+
+      OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");
+
+      const size_t pool_reduce_size =
+        Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
+
+      const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
+      const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      m_instance->resize_thread_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
+      const int pool_size = OpenMP::thread_pool_size();
+      #pragma omp parallel num_threads(pool_size)
+      {
+        HostThreadTeamData & data = *(m_instance->get_thread_data());
+
+        const int active = data.organize_team( m_policy.team_size() );
+
+        if ( active ) {
+          data.set_work_partition( m_policy.league_size()
+                                 , ( 0 < m_policy.chunk_size()
+                                   ? m_policy.chunk_size()
+                                   : m_policy.team_iter() ) );
+        }
+
+        if ( is_dynamic ) {
+          // Must synchronize to make sure each team has set its
+          // partition before beginning the work stealing loop.
+          if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
+        }
+
+        if ( active ) {
+          reference_type update =
+            ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
+                           , data.pool_reduce_local() );
+
+          std::pair<int64_t,int64_t> range(0,0);
+
+          do {
+
+            range = is_dynamic ? data.get_work_stealing_chunk()
+                               : data.get_work_partition();
+
+            ParallelReduce::template exec_team< WorkTag >
+              ( m_functor , data , update
+              , range.first , range.second , m_policy.league_size() );
+
+          } while ( is_dynamic && 0 <= range.first );
+        } else {
+          ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
+                           , data.pool_reduce_local() );
+        }
+
+        data.disband_team();
+
+        //  This thread has updated 'pool_reduce_local()' with its
+        //  contributions to the reduction.  The parallel region is
+        //  about to terminate and the master thread will load and
+        //  reduce each 'pool_reduce_local()' contribution.
+        //  Must 'memory_fence()' to guarantee that storing the update to
+        //  'pool_reduce_local()' will complete before this thread
+        //  exits the parallel region.
+
+        memory_fence();
+      }
+
+      // Reduction:
+
+      const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() );
+
+      for ( int i = 1 ; i < pool_size ; ++i ) {
+        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
+                       , ptr
+                       , m_instance->get_thread_data(i)->pool_reduce_local() );
+      }
+
+      Kokkos::Impl::FunctorFinal<  ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      if ( m_result_ptr ) {
+        const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
+
+        for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
+      }
+    }
+
+  //----------------------------------------
+
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const Policy       & arg_policy ,
+                  const ViewType     & arg_result ,
+                  typename std::enable_if<
+                    Kokkos::is_view< ViewType >::value &&
+                    !Kokkos::is_reducer_type<ReducerType>::value
+                    ,void*>::type = NULL)
+    : m_instance( t_openmp_instance )
+    , m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result.ptr_on_device() )
+    , m_shmem_size( arg_policy.scratch_size(0) +
+                    arg_policy.scratch_size(1) +
+                    FunctorTeamShmemSize< FunctorType >
+                      ::value( arg_functor , arg_policy.team_size() ) )
+    {}
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+    , Policy       arg_policy
+    , const ReducerType& reducer )
+  : m_instance( t_openmp_instance )
+  , m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr(  reducer.view().data() )
+  , m_shmem_size( arg_policy.scratch_size(0) +
+                  arg_policy.scratch_size(1) +
+                  FunctorTeamShmemSize< FunctorType >
+                    ::value( arg_functor , arg_policy.team_size() ) )
+  {
+  /*static_assert( std::is_same< typename ViewType::memory_space
+                          , Kokkos::HostSpace >::value
+  , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+  }
+
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) */
+#endif /* KOKKOS_OPENMP_PARALLEL_HPP */
+
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4ebeec44a5ec04ed291d757951dbc9ed81473ec8
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
@@ -0,0 +1,255 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG )
+
+#include <Kokkos_Core.hpp>
+
+#include <impl/Kokkos_TaskQueue_impl.hpp>
+#include <impl/Kokkos_HostThreadTeam.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template class TaskQueue< Kokkos::OpenMP > ;
+
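+// Lazily-constructed shared scratch used when a task is executed by a
+// single thread rather than by a full team; the buffer is allocated in
+// OpenMP::memory_space on first use and released by the singleton's
+// destructor at program exit.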
+class HostThreadTeamDataSingleton : private HostThreadTeamData {
+private:
+
+  HostThreadTeamDataSingleton() : HostThreadTeamData()
+    {
+      Kokkos::OpenMP::memory_space space ;
+      const size_t num_pool_reduce_bytes  =   32 ;
+      const size_t num_team_reduce_bytes  =   32 ;
+      const size_t num_team_shared_bytes  = 1024 ;
+      const size_t num_thread_local_bytes = 1024 ;
+      const size_t alloc_bytes =
+        HostThreadTeamData::scratch_size( num_pool_reduce_bytes
+                                        , num_team_reduce_bytes
+                                        , num_team_shared_bytes
+                                        , num_thread_local_bytes );
+
+      HostThreadTeamData::scratch_assign
+        ( space.allocate( alloc_bytes )
+        , alloc_bytes
+        , num_pool_reduce_bytes
+        , num_team_reduce_bytes
+        , num_team_shared_bytes
+        , num_thread_local_bytes );
+    }
+
+  ~HostThreadTeamDataSingleton()
+    {
+      Kokkos::OpenMP::memory_space space ;
+      space.deallocate( HostThreadTeamData::scratch_buffer()
+                      , HostThreadTeamData::scratch_bytes() );
+    }
+
+public:
+
+  static HostThreadTeamData & singleton()
+    {
+      static HostThreadTeamDataSingleton s ;
+      return s ;
+    }
+};
+
+//----------------------------------------------------------------------------
+
+void TaskQueueSpecialization< Kokkos::OpenMP >::execute
+  ( TaskQueue< Kokkos::OpenMP > * const queue )
+{
+  using execution_space = Kokkos::OpenMP ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< void , void , void > ;
+  using Member          = Impl::HostThreadTeamMember< execution_space > ;
+
+  static task_root_type * const end =
+    (task_root_type *) task_root_type::EndTag ;
+
+
+  HostThreadTeamData & team_data_single =
+    HostThreadTeamDataSingleton::singleton();
+
+  Impl::OpenMPExec * instance = t_openmp_instance;
+  const int pool_size = OpenMP::thread_pool_size();
+
+  const int team_size = 1;  // Threads per core
+  instance->resize_thread_data( 0 /* global reduce buffer */
+                              , 512 * team_size /* team reduce buffer */
+                              , 0 /* team shared buffer */
+                              , 0 /* thread local buffer */
+                              );
+
+  #pragma omp parallel num_threads(pool_size)
+  {
+    Impl::HostThreadTeamData & self = *(instance->get_thread_data());
+
+    // Organizing threads into a team performs a barrier across the
+    // entire pool to ensure proper initialization of the team
+    // rendezvous mechanism before a team rendezvous can be performed.
+
+    if ( self.organize_team( team_size ) ) {
+
+      Member single_exec( team_data_single );
+      Member team_exec( self );
+
+      // Loop until all queues are empty and no tasks in flight
+
+      task_root_type * task = 0 ;
+
+      do {
+        // Each team lead attempts to acquire either a thread team task
+        // or a single thread task for the team.
+
+        if ( 0 == team_exec.team_rank() ) {
+
+          bool leader_loop = false ;
+
+          do {
+
+            if ( 0 != task && end != task ) {
+              // Team member #0 completes the previously executed task;
+              // completion may delete the task.
+              queue->complete( task );
+            }
+
+            // If 0 == m_ready_count then set task = 0
+
+            task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+
+            // Attempt to acquire a task
+            // Loop by priority and then type
+            for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+              for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+                task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
+              }
+            }
+
+            // If tasks are still executing and no task could be
+            // acquired, then continue this leader loop.
+            leader_loop = end == task ;
+
+            if ( ( ! leader_loop ) &&
+                 ( 0 != task ) &&
+                 ( task_root_type::TaskSingle == task->m_task_type ) ) {
+
+              // if a single thread task then execute now
+
+              (*task->m_apply)( task , & single_exec );
+
+              leader_loop = true ;
+            }
+          } while ( leader_loop );
+        }
+
+        // The team lead either found 0 == m_ready_count or acquired a
+        // team task.  Broadcast the acquired task to the team:
+
+        team_exec.team_broadcast( task , 0);
+
+        if ( 0 != task ) { // Thread Team Task
+
+          (*task->m_apply)( task , & team_exec );
+
+          // The m_apply function performs a barrier
+        }
+      } while( 0 != task );
+    }
+    self.disband_team();
+  }
+}
+
+void TaskQueueSpecialization< Kokkos::OpenMP >::
+  iff_single_thread_recursive_execute
+    ( TaskQueue< Kokkos::OpenMP > * const queue )
+{
+  using execution_space = Kokkos::OpenMP ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< void , void , void > ;
+  using Member          = Impl::HostThreadTeamMember< execution_space > ;
+
+  if ( 1 == OpenMP::thread_pool_size() ) {
+
+    task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+    HostThreadTeamData & team_data_single =
+      HostThreadTeamDataSingleton::singleton();
+
+    Member single_exec( team_data_single );
+
+    task_root_type * task = end ;
+
+    do {
+
+      task = end ;
+
+      // Loop by priority and then type
+      for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+        for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+          task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
+        }
+      }
+
+      if ( end == task ) break ;
+
+      (*task->m_apply)( task , & single_exec );
+
+      queue->complete( task );
+
+    } while(1);
+  }
+}
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+#else
+void KOKKOS_CORE_SRC_OPENMP_KOKKOS_OPENMP_TASK_PREVENT_LINK_ERROR() {}
+#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG ) */
+
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b99c149b06d7acf9d37c3dbc42ad12aafd62e893
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
@@ -0,0 +1,90 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
+#define KOKKOS_IMPL_OPENMP_TASK_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+class TaskQueueSpecialization< Kokkos::OpenMP >
+{
+public:
+
+  using execution_space = Kokkos::OpenMP ;
+  using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
+  using task_base_type  = Kokkos::Impl::TaskBase< void , void , void > ;
+  using member_type     = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
+
+  // Must specify memory space
+  using memory_space = Kokkos::HostSpace ;
+
+  static
+  void iff_single_thread_recursive_execute( queue_type * const );
+
+  // Must provide task queue execution function
+  static void execute( queue_type * const );
+
+  template< typename TaskType >
+  static
+  typename TaskType::function_type
+  get_function_pointer() { return TaskType::apply ; }
+};
+
+extern template class TaskQueue< Kokkos::OpenMP > ;
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG ) */
+#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */
+
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..775043b23efc1bdfdb2f04c7d3f697dc0027929a
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp
@@ -0,0 +1,271 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMP_TEAM_HPP
+#define KOKKOS_OPENMP_TEAM_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_OPENMP )
+
+#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
+
+namespace Kokkos { namespace Impl {
+
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits<Properties ...>
+{
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicyInternal      execution_policy ;
+
+  typedef PolicyTraits<Properties ... > traits;
+
+  TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
+    m_league_size = p.m_league_size;
+    m_team_size = p.m_team_size;
+    m_team_alloc = p.m_team_alloc;
+    m_team_iter = p.m_team_iter;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
+    m_chunk_size = p.m_chunk_size;
+    return *this;
+  }
+
+  //----------------------------------------
+
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & ) {
+    int pool_size = traits::execution_space::thread_pool_size(1);
+    int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
+    return pool_size < max_host_team_size ? pool_size : max_host_team_size;
+  }
+
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType & )
+    { return traits::execution_space::thread_pool_size(2); }
+
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType &, const int& )
+    { return traits::execution_space::thread_pool_size(2); }
+
+  //----------------------------------------
+
+private:
+
+  int m_league_size ;
+  int m_team_size ;
+  int m_team_alloc ;
+  int m_team_iter ;
+
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
+
+  int m_chunk_size;
+
+  inline void init( const int league_size_request
+                  , const int team_size_request )
+    {
+      const int pool_size  = traits::execution_space::thread_pool_size(0);
+      const int max_host_team_size =  Impl::HostThreadTeamData::max_team_members;
+      const int team_max   = pool_size < max_host_team_size ? pool_size : max_host_team_size;
+      const int team_grain = traits::execution_space::thread_pool_size(2);
+
+      m_league_size = league_size_request ;
+
+      m_team_size = team_size_request < team_max ?
+                    team_size_request : team_max ;
+
+      // Round team size up to a multiple of 'team_grain'
+      const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
+      const int team_count      = pool_size / team_size_grain ;
+
+      // Constraint : pool_size = m_team_alloc * team_count
+      m_team_alloc = pool_size / team_count ;
+
+      // Maximum number of iterations each team will take:
+      m_team_iter  = ( m_league_size + team_count - 1 ) / team_count ;
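+
+      // Hypothetical example of the arithmetic above (values are illustrative only):
+      // with pool_size = 16, team_grain = 2 and a requested team size of 5,
+      // team_size_grain = 6, team_count = 2, m_team_alloc = 8, and a league of
+      // 11 gives m_team_iter = 6.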
+
+      set_auto_chunk_size();
+    }
+
+public:
+
+  inline int team_size()   const { return m_team_size ; }
+  inline int league_size() const { return m_league_size ; }
+
+  inline size_t scratch_size(const int& level, int team_size_ = -1) const {
+    if(team_size_ < 0) team_size_ = m_team_size;
+    return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
+  }
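+  // For illustration (hypothetical sizes): with 1024 bytes per team, 256 bytes per
+  // thread and a team size of 8, scratch_size() reports 1024 + 8*256 = 3072 bytes.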
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal( typename traits::execution_space &
+            , int league_size_request
+            , int team_size_request
+            , int /* vector_length_request */ = 1 )
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , team_size_request ); }
+
+  TeamPolicyInternal( typename traits::execution_space &
+            , int league_size_request
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int /* vector_length_request */ = 1)
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
+
+  TeamPolicyInternal( int league_size_request
+            , int team_size_request
+            , int /* vector_length_request */ = 1 )
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , team_size_request ); }
+
+  TeamPolicyInternal( int league_size_request
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int /* vector_length_request */ = 1 )
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
+
+  inline int team_alloc() const { return m_team_alloc ; }
+  inline int team_iter()  const { return m_team_iter ; }
+
+  inline int chunk_size() const { return m_chunk_size ; }
+
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
+    TeamPolicyInternal p = *this;
+    p.m_chunk_size = chunk_size_;
+    return p;
+  }
+
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
+    TeamPolicyInternal p = *this;
+    p.m_team_scratch_size[level] = per_team.value;
+    return p;
+  };
+
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal p = *this;
+    p.m_thread_scratch_size[level] = per_thread.value;
+    return p;
+  };
+
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal p = *this;
+    p.m_team_scratch_size[level] = per_team.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
+    return p;
+  };
+
+protected:
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal internal_set_chunk_size(typename traits::index_type chunk_size_) {
+    m_chunk_size = chunk_size_;
+    return *this;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerTeamValue& per_team) {
+    m_team_scratch_size[level] = per_team.value;
+    return *this;
+  };
+
+  /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerThreadValue& per_thread) {
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  };
+
+  /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) {
+    m_team_scratch_size[level] = per_team.value;
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  };
+
+private:
+  /** \brief finalize chunk_size if it was set to AUTO*/
+  inline void set_auto_chunk_size() {
+
+    int concurrency = traits::execution_space::thread_pool_size(0)/m_team_alloc;
+    if( concurrency==0 ) concurrency=1;
+
+    if(m_chunk_size > 0) {
+      if(!Impl::is_integral_power_of_two( m_chunk_size ))
+        Kokkos::abort("TeamPolicy blocking granularity must be power of two" );
+    }
+
+    int new_chunk_size = 1;
+    while(new_chunk_size*100*concurrency < m_league_size)
+      new_chunk_size *= 2;
+    if(new_chunk_size < 128) {
+      new_chunk_size = 1;
+      while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) )
+        new_chunk_size*=2;
+    }
+    m_chunk_size = new_chunk_size;
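+
+    // Illustrative (hypothetical) values for this heuristic: with concurrency = 4
+    // and m_league_size = 1000 the first loop stops at 4 (4*400 >= 1000); since
+    // 4 < 128 the second loop reruns with the 40x factor and yields a chunk of 8.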
+  }
+
+public:
+  typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ;
+};
+
+}} // namespace Kokkos::Impl
+
+#endif
+#endif /* KOKKOS_OPENMP_TEAM_HPP */
+
+
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cd95a54b364eb2040d597964d95151772c500196
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp
@@ -0,0 +1,107 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP
+#define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType ,
+                   Kokkos::WorkGraphPolicy< Traits ... > ,
+                   Kokkos::OpenMP
+                 >
+{
+private:
+
+  typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
+
+  Policy       m_policy ;
+  FunctorType  m_functor ;
+
+  template< class TagType >
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_one( const std::int32_t w ) const noexcept
+    { m_functor( w ); }
+
+  template< class TagType >
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_one( const std::int32_t w ) const noexcept
+    { const TagType t{} ; m_functor( t , w ); }
+
+public:
+
+  inline
+  void execute()
+  {
+    const int pool_size = OpenMP::thread_pool_size();
+
+    #pragma omp parallel num_threads(pool_size)
+    {
+      // Spin until COMPLETED_TOKEN.
+      // END_TOKEN indicates no work is currently available.
+
+      for ( std::int32_t w = Policy::END_TOKEN ;
+            Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
+        if ( Policy::END_TOKEN != w ) {
+          exec_one< typename Policy::work_tag >( w );
+          m_policy.completed_work(w);
+        }
+      }
+    }
+  }
+
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy      & arg_policy )
+    : m_policy( arg_policy )
+    , m_functor( arg_functor )
+  {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP */
+
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..79f2e185475c2991634693248effde14f5d0790d
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
@@ -0,0 +1,306 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <algorithm>
+#include <omp.h>
+#include <Kokkos_Macros.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <memory.h>
+
+#include <iostream>
+#include <sstream>
+#include <cstring>
+
+#include <Kokkos_OpenMPTargetSpace.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <Kokkos_Atomic.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+
+namespace Kokkos {
+namespace Experimental {
+/* Default allocation mechanism */
+OpenMPTargetSpace::OpenMPTargetSpace()
+{}
+
+void * OpenMPTargetSpace::allocate( const size_t arg_alloc_size ) const
+{
+  static_assert( sizeof(void*) == sizeof(uintptr_t)
+               , "Error sizeof(void*) != sizeof(uintptr_t)" );
+
+  void * ptr;
+  
+  ptr = omp_target_alloc( arg_alloc_size, omp_get_default_device());
+
+  return ptr;
+}
+
+
+void OpenMPTargetSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
+{
+  if ( arg_alloc_ptr ) {
+
+      omp_target_free( arg_alloc_ptr , omp_get_default_device() );
+
+  }
+}
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::s_root_record ;
+
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+~SharedAllocationRecord()
+{
+  m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
+                    , SharedAllocationRecord< void , void >::m_alloc_size
+                    );
+}
+
+//TODO: Implement deep copy back see CudaSpace 
+std::string
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::get_label() const {
+  return std::string("OpenMPTargetAllocation");
+}
+
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >* 
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+allocate( const Kokkos::Experimental::OpenMPTargetSpace & arg_space,
+          const std::string               & arg_label ,
+          const size_t                      arg_alloc_size) {
+  return new SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >(arg_space,arg_label,arg_alloc_size);
+}
+
+void
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+deallocate( SharedAllocationRecord<void, void> * arg_rec )
+{
+  delete static_cast<SharedAllocationRecord*>(arg_rec);
+}
+
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+SharedAllocationRecord( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
+                      , const std::string       & arg_label
+                      , const size_t              arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_space( arg_space )
+{
+  SharedAllocationHeader header;
+
+  header.m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
+
+  strncpy( header.m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+  
+  //TODO DeepCopy
+  // DeepCopy
+
+}
+
+//----------------------------------------------------------------------------
+
+void * SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+allocate_tracked( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
+                , const std::string & arg_alloc_label 
+                , const size_t arg_alloc_size )
+{
+  if ( ! arg_alloc_size ) return (void *) 0 ;
+
+  SharedAllocationRecord * const r =
+    allocate( arg_space , arg_alloc_label , arg_alloc_size );
+
+  RecordBase::increment( r );
+
+  return r->data();
+}
+
+void SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+deallocate_tracked( void * const arg_alloc_ptr )
+{
+  if ( arg_alloc_ptr != 0 ) {
+    SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
+
+    RecordBase::decrement( r );
+  }
+}
+
+void * SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+reallocate_tracked( void * const arg_alloc_ptr
+                  , const size_t arg_alloc_size )
+{
+  SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
+  SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
+
+  //Kokkos::Impl::DeepCopy<OpenMPTargetSpace,OpenMPTargetSpace>( r_new->data() , r_old->data()
+  //                                           , std::min( r_old->size() , r_new->size() ) );
+
+  RecordBase::increment( r_new );
+  RecordBase::decrement( r_old );
+
+  return r_new->data();
+}
+
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void > *
+SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::get_record( void * alloc_ptr )
+{
+  typedef SharedAllocationHeader  Header ;
+  typedef SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >  RecordHost ;
+
+  SharedAllocationHeader const * const head   = alloc_ptr ? Header::get_header( alloc_ptr ) : (SharedAllocationHeader *)0 ;
+  RecordHost                   * const record = head ? static_cast< RecordHost * >( head->m_record ) : (RecordHost *) 0 ;
+
+  if ( ! alloc_ptr || record->m_alloc_ptr != head ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::get_record ERROR" ) );
+  }
+
+  return record ;
+}
+
+// Iterate records to print orphaned memory ...
+void SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
+print_records( std::ostream & s , const Kokkos::Experimental::OpenMPTargetSpace & space , bool detail )
+{
+  SharedAllocationRecord< void , void >::print_host_accessible_records( s , "OpenMPTargetSpace" , & s_root_record , detail );
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template< class >
+struct ViewOperatorBoundsErrorAbort ;
+
+template<>
+struct ViewOperatorBoundsErrorAbort< Kokkos::Experimental::OpenMPTargetSpace > {
+ static void apply( const size_t rank
+                  , const size_t n0 , const size_t n1
+                  , const size_t n2 , const size_t n3
+                  , const size_t n4 , const size_t n5
+                  , const size_t n6 , const size_t n7
+                  , const size_t i0 , const size_t i1
+                  , const size_t i2 , const size_t i3
+                  , const size_t i4 , const size_t i5
+                  , const size_t i6 , const size_t i7 );
+};
+
+void ViewOperatorBoundsErrorAbort< Kokkos::Experimental::OpenMPTargetSpace >::
+apply( const size_t rank
+     , const size_t n0 , const size_t n1
+     , const size_t n2 , const size_t n3
+     , const size_t n4 , const size_t n5
+     , const size_t n6 , const size_t n7
+     , const size_t i0 , const size_t i1
+     , const size_t i2 , const size_t i3
+     , const size_t i4 , const size_t i5
+     , const size_t i6 , const size_t i7 )
+{
+  printf( "View operator bounds error : rank(%lu) dim(%lu,%lu,%lu,%lu,%lu,%lu,%lu,%lu) index(%lu,%lu,%lu,%lu,%lu,%lu,%lu,%lu)"
+          , rank , n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7
+                 , i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
+  //Kokkos::Impl::throw_runtime_exception( buffer );
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+/*
+namespace Kokkos {
+namespace {
+  const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF;
+  const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39;
+  static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK+1];
+}
+
+namespace Impl {
+void init_lock_array_host_space() {
+  static int is_initialized = 0;
+  if(! is_initialized)
+    for(int i = 0; i < static_cast<int> (HOST_SPACE_ATOMIC_MASK+1); i++)
+      HOST_SPACE_ATOMIC_LOCKS[i] = 0;
+}
+
+bool lock_address_host_space(void* ptr) {
+  return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[
+      (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
+                                  0 , 1);
+}
+
+void unlock_address_host_space(void* ptr) {
+   atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[
+      (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
+                    0);
+}
+
+}
+}*/
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8991c6394701d9a5219cb168c909bbaa8a178126
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
@@ -0,0 +1,273 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdio.h>
+#include <limits>
+#include <iostream>
+#include <vector>
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_CPUDiscovery.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+KOKKOS_INLINE_FUNCTION
+int kokkos_omp_in_parallel();
+
+int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 );
+
+KOKKOS_INLINE_FUNCTION
+int kokkos_omp_in_parallel()
+{
+#ifndef __CUDA_ARCH__
+  return omp_in_parallel() && ! kokkos_omp_in_critical_region ;
+#else
+  return 0;
+#endif
+}
+
+bool s_using_hwloc = false;
+
+} // namespace
+} // namespace Impl
+} // namespace Kokkos
+
+
+namespace Kokkos {
+namespace Experimental {
+bool OpenMPTarget::m_is_initialized = false;
+}
+}
+
+namespace Kokkos {
+namespace Impl {
+
+
+//int OpenMPTargetExec::m_map_rank[ OpenMPTargetExec::MAX_THREAD_COUNT ] = { 0 };
+
+//int OpenMPTargetExec::m_pool_topo[ 4 ] = { 0 };
+
+//OpenMPTargetExec * OpenMPTargetExec::m_pool[ OpenMPTargetExec::MAX_THREAD_COUNT ] = { 0 };
+
+void OpenMPTargetExec::verify_is_process( const char * const label )
+{
+  if ( omp_in_parallel() ) {
+    std::string msg( label );
+    msg.append( " ERROR: in parallel" );
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+}
+
+void OpenMPTargetExec::verify_initialized( const char * const label )
+{
+  if ( 0 == Kokkos::Experimental::OpenMPTarget::is_initialized() ) {
+    std::string msg( label );
+    msg.append( " ERROR: not initialized" );
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+
+  if ( omp_get_max_threads() != Kokkos::Experimental::OpenMPTarget::thread_pool_size(0) ) {
+    std::string msg( label );
+    msg.append( " ERROR: Initialized but threads modified inappropriately" );
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+
+}
+
+void*    OpenMPTargetExec::m_scratch_ptr  = NULL;
+int64_t OpenMPTargetExec::m_scratch_size = 0;
+
+void OpenMPTargetExec::clear_scratch()
+{
+  Kokkos::Experimental::OpenMPTargetSpace space;
+  space.deallocate(m_scratch_ptr,m_scratch_size);
+  m_scratch_ptr = NULL;
+  m_scratch_size = 0;
+}
+
+void* OpenMPTargetExec::get_scratch_ptr() { return m_scratch_ptr; }
+
+void OpenMPTargetExec::resize_scratch( int64_t reduce_bytes , 
+                                       int64_t team_reduce_bytes, 
+                                       int64_t team_shared_bytes, int64_t thread_local_bytes) 
+{
+  Kokkos::Experimental::OpenMPTargetSpace space;
+  uint64_t total_size = MAX_ACTIVE_TEAMS * reduce_bytes +            // Inter Team Reduction  
+                        MAX_ACTIVE_TEAMS * team_reduce_bytes  +    // Intra Team Reduction
+                        MAX_ACTIVE_TEAMS * team_shared_bytes +       // Team Local Scratch
+                        MAX_ACTIVE_THREADS * thread_local_bytes;     // Thread Private Scratch
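+
+  // Rough illustration with hypothetical request sizes: reduce_bytes = 16,
+  // team_reduce_bytes = 512, team_shared_bytes = 1024 and thread_local_bytes = 0
+  // gives 14336 * (16 + 512 + 1024) = 22249472 bytes (~21 MiB) of device scratch.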
+
+  if( total_size > m_scratch_size ) {
+    space.deallocate(m_scratch_ptr,m_scratch_size);
+    m_scratch_size = total_size;
+    m_scratch_ptr = space.allocate(total_size);
+  }
+
+}
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+//----------------------------------------------------------------------------
+
+int OpenMPTarget::is_initialized()
+{ return m_is_initialized; }// != Impl::OpenMPTargetExec::m_pool[0]; }
+
+void OpenMPTarget::initialize( unsigned thread_count ,
+                         unsigned use_numa_count ,
+                         unsigned use_cores_per_numa )
+{
+  // Before any other OMP call, query the maximum number of threads
+  // and save the value for re-initialization unit testing.
+
+
+  // Initialize the array used for arbitrarily sized atomics
+  Kokkos::Impl::init_lock_array_host_space();
+
+  #ifdef KOKKOS_ENABLE_PROFILING
+    Kokkos::Profiling::initialize();
+  #endif
+  m_is_initialized = true;
+}
+
+//----------------------------------------------------------------------------
+
+void OpenMPTarget::finalize()
+{
+  Kokkos::Impl::OpenMPTargetExec::verify_initialized( "OpenMPTarget::finalize" );
+  Kokkos::Impl::OpenMPTargetExec::verify_is_process( "OpenMPTarget::finalize" );
+
+  m_is_initialized = false;
+
+  omp_set_num_threads(1);
+
+  if ( Kokkos::Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) {
+    hwloc::unbind_this_thread();
+  }
+
+  #ifdef KOKKOS_ENABLE_PROFILING
+    Kokkos::Profiling::finalize();
+  #endif
+}
+
+//----------------------------------------------------------------------------
+
+void OpenMPTarget::print_configuration( std::ostream & s , const bool detail )
+{
+  Kokkos::Impl::OpenMPTargetExec::verify_is_process( "OpenMPTarget::print_configuration" );
+/*
+  s << "Kokkos::Experimental::OpenMPTarget" ;
+
+#if defined( KOKKOS_ENABLE_OPENMPTARGET )
+  s << " KOKKOS_ENABLE_OPENMPTARGET" ;
+#endif
+#if defined( KOKKOS_HAVE_HWLOC )
+
+  const unsigned numa_count_       = Kokkos::hwloc::get_available_numa_count();
+  const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+  const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+  s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]"
+    << " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" )
+    ;
+#endif
+
+  const bool is_initialized = 0 != Impl::OpenMPTargetExec::m_pool[0] ;
+
+  if ( is_initialized ) {
+    const int numa_count      = Kokkos::Impl::OpenMPTargetExec::m_pool_topo[0] / Kokkos::Impl::OpenMPTargetExec::m_pool_topo[1] ;
+    const int core_per_numa   = Kokkos::Impl::OpenMPTargetExec::m_pool_topo[1] / Kokkos::Impl::OpenMPTargetExec::m_pool_topo[2] ;
+    const int thread_per_core = Kokkos::Impl::OpenMPTargetExec::m_pool_topo[2] ;
+
+    s << " thread_pool_topology[ " << numa_count
+      << " x " << core_per_numa
+      << " x " << thread_per_core
+      << " ]"
+      << std::endl ;
+
+    if ( detail ) {
+      std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPTargetExec::m_pool_topo[0] );
+
+#pragma omp parallel
+      {
+#pragma omp critical
+        {
+          coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate();
+        }
+// END #pragma omp critical 
+      }
+// END #pragma omp parallel 
+
+      for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
+        s << "  thread omp_rank[" << i << "]"
+          << " kokkos_rank[" << Impl::OpenMPTargetExec::m_map_rank[ i ] << "]"
+          << " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
+          << std::endl ;
+      }
+    }
+  }
+  else {
+    s << " not initialized" << std::endl ;
+  }
+*/
+}
+
+int OpenMPTarget::concurrency() {
+  return thread_pool_size(0);
+}
+
+const char* OpenMPTarget::name() { return "OpenMPTarget"; }
+} // namespace Experimental
+} // namespace Kokkos
+
+#endif //KOKKOS_ENABLE_OPENMPTARGET
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..147916395e4c4282924ced14d4d15e79a02412a2
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
@@ -0,0 +1,753 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMPTARGETEXEC_HPP
+#define KOKKOS_OPENMPTARGETEXEC_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Spinwait.hpp>
+
+#include <Kokkos_Atomic.hpp>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+/** \brief  Data for OpenMPTarget thread execution */
+
+
+class OpenMPTargetExec {
+public:
+  enum { MAX_ACTIVE_THREADS = 256*8*56*4 };
+  enum { MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS/32 };
+
+private:
+  static void* scratch_ptr;
+
+public:
+  static void verify_is_process( const char * const );
+  static void verify_initialized( const char * const );
+
+  static void* get_scratch_ptr();
+  static void clear_scratch();
+  static void resize_scratch( int64_t reduce_bytes , int64_t team_reduce_bytes, int64_t team_shared_bytes, int64_t thread_local_bytes );
+
+  static void* m_scratch_ptr;
+  static int64_t m_scratch_size;
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+class OpenMPTargetExecTeamMember {
+public:
+
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  /** \brief  Thread states for team synchronization */
+  enum { Active = 0 , Rendezvous = 1 };
+
+  typedef Kokkos::Experimental::OpenMPTarget                         execution_space ;
+  typedef execution_space::scratch_memory_space  scratch_memory_space ;
+
+  scratch_memory_space  m_team_shared ;
+  int                   m_team_scratch_size[2] ;
+  int                   m_team_rank ;
+  int                   m_team_size ;
+  int                   m_league_rank ;
+  int                   m_league_size ;
+  int                   m_vector_length ;
+  int                   m_vector_lane ;
+  void*                 m_glb_scratch ;
+
+  /*
+  // Fan-in team threads, root of the fan-in which does not block returns true
+  inline
+  bool team_fan_in() const
+    {
+      memory_fence();
+      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
+
+        m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
+      }
+
+      if ( m_team_rank_rev ) {
+        m_exec.state_set( Rendezvous );
+        memory_fence();
+        m_exec.state_wait( Rendezvous );
+      }
+
+      return 0 == m_team_rank_rev ;
+    }
+
+  inline
+  void team_fan_out() const
+    {
+      memory_fence();
+      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
+        m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
+        memory_fence();
+      }
+    }
+  */
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space& team_shmem() const
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space& team_scratch(int) const
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space& thread_scratch(int) const
+    { return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const
+    {
+      #pragma omp barrier
+    }
+
+  template<class ValueType>
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast(ValueType& value, const int& thread_id) const
+  {
+/*#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { }
+#else
+    // Make sure there is enough scratch space:
+    typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
+                         , ValueType , void >::type type ;
+
+    type * const local_value = ((type*) m_exec.scratch_thread());
+    if(team_rank() == thread_id)
+      *local_value = value;
+    memory_fence();
+    team_barrier();
+    value = *local_value;
+#endif*/
+  }
+
+  template< class ValueType, class JoinOp >
+  KOKKOS_INLINE_FUNCTION ValueType
+    team_reduce( const ValueType & value
+               , const JoinOp & op_in ) const {
+
+      #pragma omp barrier
+
+      typedef ValueType value_type;
+      const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
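+
+      // Sketch of the scheme below: the team's TEAM_REDUCE_SIZE scratch slots are
+      // zeroed, each thread then accumulates its value into slot (rank % n_values)
+      // in waves of n_values threads, and a power-of-two tree combines the slots
+      // into slot 0, which every thread returns.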
+
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
+                           , value_type , void >::type type ;
+
+      const int n_values = TEAM_REDUCE_SIZE/sizeof(value_type);
+      type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num());
+      for(int i = m_team_rank; i < n_values; i+= m_team_size) {
+        team_scratch[i] = value_type();
+      }
+
+      #pragma omp barrier
+
+      for(int k=0; k<m_team_size; k+=n_values) {
+        if((k <= m_team_rank) && (k+n_values > m_team_rank))
+          team_scratch[m_team_rank%n_values]+=value;
+        #pragma omp barrier
+      }
+
+      for(int d = 1; d<n_values;d*=2) {
+        if((m_team_rank+d<n_values) && (m_team_rank%(2*d)==0)) {
+          team_scratch[m_team_rank] += team_scratch[m_team_rank+d];
+        }
+        #pragma omp barrier
+      }
+      return team_scratch[0];
+    }
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
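+   *
+   *  Illustration (hypothetical values): a team of four ranks contributing
+   *  { 2 , 3 , 1 , 4 } receives the exclusive scan results { 0 , 2 , 5 , 6 },
+   *  and the team total 10 is accumulated into *global_accum.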
+   */
+  template< typename ArgType >
+  KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
+    {
+    /*  // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
+
+      volatile type * const work_value  = ((type*) m_exec.scratch_thread());
+
+      *work_value = value ;
+
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
+        // m_team_base[0]                 == highest ranking team member
+        // m_team_base[ m_team_size - 1 ] == lowest ranking team member
+        //
+        // 1) copy from lower to higher rank, initialize lowest rank to zero
+        // 2) prefix sum from lowest to highest rank, skipping lowest rank
+
+        type accum = 0 ;
+
+        if ( global_accum ) {
+          for ( int i = m_team_size ; i-- ; ) {
+            type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
+            accum += val ;
+          }
+          accum = atomic_fetch_add( global_accum , accum );
+        }
+
+        for ( int i = m_team_size ; i-- ; ) {
+          type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
+          const type offset = accum ;
+          accum += val ;
+          val = offset ;
+        }
+
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      return *work_value ;*/
+      return ArgType();
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
+    { return this-> template team_scan<Type>( value , 0 ); }
+
+  //----------------------------------------
+  // Private for the driver
+
+private:
+
+  typedef execution_space::scratch_memory_space space ;
+
+public:
+
+  inline
+  OpenMPTargetExecTeamMember( const int league_rank, const int league_size, const int team_size, const int vector_length //const TeamPolicyInternal< OpenMPTarget, Properties ...> & team
+                      , void* const glb_scratch
+                      , const int shmem_size_L1
+                      , const int shmem_size_L2
+                      )
+    : m_team_shared(0,0)
+    , m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
+    , m_team_rank(0)
+    , m_team_size( team_size )
+    , m_league_rank( league_rank )
+    , m_league_size( league_size )
+    , m_vector_length( vector_length )
+    , m_glb_scratch( glb_scratch )
+    {
+      const int omp_tid = omp_get_thread_num();
+      m_league_rank = league_rank;
+      m_team_rank = omp_tid/m_vector_length;
+      m_vector_lane = omp_tid%m_vector_length;
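+      // e.g. (hypothetically) with vector_length == 4, OpenMP thread 10 maps to
+      // team_rank 2 and vector_lane 2.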
+    }
+
+  static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
+};
+
+
+
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::Experimental::OpenMPTarget, Properties ... >: public PolicyTraits<Properties ...>
+{
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicyInternal      execution_policy ;
+
+  typedef PolicyTraits<Properties ... > traits;
+
+  TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
+    m_league_size = p.m_league_size;
+    m_team_size = p.m_team_size;
+    m_vector_length = p.m_vector_length;
+    m_team_alloc = p.m_team_alloc;
+    m_team_iter = p.m_team_iter;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
+    m_chunk_size = p.m_chunk_size;
+    return *this;
+  }
+
+  //----------------------------------------
+
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & )
+    { return 1024; }
+
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType & )
+    { return 256; }
+
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType &, const int& vector_length)
+    { return 256/vector_length; }
+
+  //----------------------------------------
+
+private:
+
+  int m_league_size ;
+  int m_team_size ;
+  int m_vector_length;
+  int m_team_alloc ;
+  int m_team_iter ;
+
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
+
+  int m_chunk_size;
+
+  inline void init( const int league_size_request
+                  , const int team_size_request
+                  , const int vector_length_request )
+    {
+      m_league_size = league_size_request ;
+
+      m_team_size = team_size_request;
+
+      m_vector_length = vector_length_request;
+
+      set_auto_chunk_size();
+    }
+
+public:
+
+  inline int vector_length() const { return m_vector_length ; }
+  inline int team_size()   const { return m_team_size ; }
+  inline int league_size() const { return m_league_size ; }
+  inline size_t scratch_size(const int& level, int team_size_ = -1) const {
+    if(team_size_ < 0)
+      team_size_ = m_team_size;
+    return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
+  }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal( typename traits::execution_space &
+            , int league_size_request
+            , int team_size_request
+            , int vector_length_request = 1 )
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , team_size_request , vector_length_request); }
+
+  TeamPolicyInternal( typename traits::execution_space &
+            , int league_size_request
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int vector_length_request = 1)
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , 256/vector_length_request , vector_length_request ); }
+
+  TeamPolicyInternal( int league_size_request
+            , int team_size_request
+            , int vector_length_request = 1 )
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , team_size_request , vector_length_request); }
+
+  TeamPolicyInternal( int league_size_request
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int vector_length_request = 1 )
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , 256/vector_length_request , vector_length_request ); }
+
+  inline int team_alloc() const { return m_team_alloc ; }
+  inline int team_iter()  const { return m_team_iter ; }
+
+  inline int chunk_size() const { return m_chunk_size ; }
+
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
+    TeamPolicyInternal p = *this;
+    p.m_chunk_size = chunk_size_;
+    return p;
+  }
+
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
+    TeamPolicyInternal p = *this;
+    p.m_team_scratch_size[level] = per_team.value;
+    return p;
+  };
+
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal p = *this;
+    p.m_thread_scratch_size[level] = per_thread.value;
+    return p;
+  };
+
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal p = *this;
+    p.m_team_scratch_size[level] = per_team.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
+    return p;
+  };
+
+protected:
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal internal_set_chunk_size(typename traits::index_type chunk_size_) {
+    m_chunk_size = chunk_size_;
+    return *this;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerTeamValue& per_team) {
+    m_team_scratch_size[level] = per_team.value;
+    return *this;
+  };
+
+  /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerThreadValue& per_thread) {
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  };
+
+  /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) {
+    m_team_scratch_size[level] = per_team.value;
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  };
+
+private:
+  /** \brief finalize chunk_size if it was set to AUTO*/
+  inline void set_auto_chunk_size() {
+
+    int concurrency = traits::execution_space::thread_pool_size(0)/m_team_alloc;
+    if( concurrency==0 ) concurrency=1;
+
+    if(m_chunk_size > 0) {
+      if(!Impl::is_integral_power_of_two( m_chunk_size ))
+        Kokkos::abort("TeamPolicy blocking granularity must be power of two" );
+    }
+
+    int new_chunk_size = 1;
+    while(new_chunk_size*100*concurrency < m_league_size)
+      new_chunk_size *= 2;
+    if(new_chunk_size < 128) {
+      new_chunk_size = 1;
+      while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) )
+        new_chunk_size*=2;
+    }
+    m_chunk_size = new_chunk_size;
+  }
+
+public:
+  typedef Impl::OpenMPTargetExecTeamMember member_type ;
+};
+} // namespace Impl
+
+
+} // namespace Kokkos
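+
+// Illustrative sketch (not part of the original file, compiled out): how the
+// scratch-size and chunk-size setters defined above are typically combined through
+// the public Kokkos::TeamPolicy front end. The league size (128) and byte counts
+// are hypothetical placeholder values.
+#if 0
+#include <Kokkos_Core.hpp>
+
+void example_policy_setup() {
+  using ExecSpace = Kokkos::Experimental::OpenMPTarget;
+  // 128 teams, team size chosen automatically; request level-0 scratch of
+  // 1024 bytes per team and 64 bytes per thread, and a chunk size of 4.
+  auto policy = Kokkos::TeamPolicy<ExecSpace>( 128 , Kokkos::AUTO )
+                  .set_scratch_size( 0 , Kokkos::PerTeam(1024) , Kokkos::PerThread(64) )
+                  .set_chunk_size( 4 );
+  (void) policy;
+}
+#endif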
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+inline
+int OpenMPTarget::thread_pool_size( int depth )
+{
+  //return Impl::OpenMPTargetExec::pool_size(depth);
+  return omp_get_max_threads();
+}
+
+KOKKOS_INLINE_FUNCTION
+int OpenMPTarget::thread_pool_rank()
+{
+  return omp_get_thread_num();
+}
+
+} // namespace Experimental
+} // namespace Kokkos
+
+
+namespace Kokkos {
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
+  TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>(thread,count);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
+  TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& begin, const iType& end) {
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>(thread,begin,end);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >
+  ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >(thread,count);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> PerTeam(const Impl::OpenMPTargetExecTeamMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> PerThread(const Impl::OpenMPTargetExecTeamMember& thread) {
+  return Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>(thread);
+}
+} // namespace Kokkos
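+
+// Illustrative sketch (not part of the original file, compiled out): how the
+// TeamThreadRange helper above is typically nested inside a team-level parallel_for
+// on this backend. The view `A` and extents N, M are hypothetical placeholders.
+#if 0
+void example_nested_parallelism( Kokkos::View<double**> A , int N , int M ) {
+  using ExecSpace   = Kokkos::Experimental::OpenMPTarget;
+  using team_member = Kokkos::TeamPolicy<ExecSpace>::member_type;
+  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecSpace>( N , Kokkos::AUTO ) ,
+    KOKKOS_LAMBDA( const team_member & team ) {
+      const int i = team.league_rank();
+      // Distribute the M inner iterations over the threads of this team.
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( team , M ) , [&]( const int j ) {
+        A(i,j) = double(i) + double(j);
+      });
+    });
+}
+#endif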
+
+namespace Kokkos {
+
+  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+   *
+   * The range i=0..N-1 is mapped to all threads of the calling thread team.
+   * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>& loop_boundaries, const Lambda& lambda) {
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
+
+/** \brief  Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, ValueType& result) {
+
+  result = ValueType();
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+
+  //result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
+}
+
+/** \brief  Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as the initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+
+  //init_result = loop_boundaries.thread.team_reduce(result,join);
+  init_result = result;
+}
+
+} //namespace Kokkos
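+
+// Illustrative sketch (not part of the original file, compiled out): using the
+// join-based parallel_reduce above to compute a per-team maximum over a
+// TeamThreadRange. The joiner functor, the view `v`, and the extent M are hypothetical.
+#if 0
+struct MaxJoin {
+  KOKKOS_INLINE_FUNCTION
+  void operator()( double & dst , const double & src ) const { if ( src > dst ) dst = src ; }
+};
+
+template< class TeamMember >
+KOKKOS_INLINE_FUNCTION
+double example_team_max( const TeamMember & team , Kokkos::View<double*> v , int M ) {
+  double team_max = -1.0e300 ;  // neutral element for the max join
+  Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team , M ) ,
+    [&]( const int i , double & val ) { if ( v(i) > val ) val = v(i) ; } ,
+    MaxJoin() , team_max );
+  return team_max ;
+}
+#endif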
+
+
+namespace Kokkos {
+/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread.
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
+    loop_boundaries, const Lambda& lambda) {
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
+      loop_boundaries, const Lambda & lambda, ValueType& result) {
+  result = ValueType();
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+  init_result = result;
+}
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
+ *          for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
+ * Depending on the target execution space the operator might be called twice: once with final=false
+ * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
+ * "i" needs to be added to val no matter whether final==true or not. In a serial execution
+ * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
+ * to the final sum value over all vector lanes.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
+      loop_boundaries, const FunctorType & lambda) {
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
+  typedef typename ValueTraits::value_type value_type ;
+
+  value_type scan_val = value_type();
+
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,scan_val,true);
+  }
+}
+
+} // namespace Kokkos
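+
+// Illustrative sketch (not part of the original file, compiled out): the
+// lambda(i, val, final) protocol expected by the parallel_scan above, here producing
+// an exclusive prefix sum of `in` into `out`. The views and the extent M are hypothetical.
+#if 0
+template< class TeamMember >
+KOKKOS_INLINE_FUNCTION
+void example_vector_scan( const TeamMember & team ,
+                          Kokkos::View<int*> in , Kokkos::View<int*> out , int M ) {
+  Kokkos::parallel_scan( Kokkos::ThreadVectorRange( team , M ) ,
+    [&]( const int i , int & val , const bool final ) {
+      if ( final ) out(i) = val ;   // val holds the sum of entries 0..i-1
+      val += in(i);                 // contribute this entry whether or not final
+    });
+}
+#endif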
+
+namespace Kokkos {
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda) {
+  lambda();
+}
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda) {
+  if(single_struct.team_member.team_rank()==0) lambda();
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  lambda(val);
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  if(single_struct.team_member.team_rank()==0) {
+    lambda(val);
+  }
+  single_struct.team_member.team_broadcast(val,0);
+}
+} // namespace Kokkos
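+
+// Illustrative sketch (not part of the original file, compiled out): using the
+// single() overloads above so that one thread per team produces a value which is
+// then broadcast to the rest of the team. The team member type and value are placeholders.
+#if 0
+template< class TeamMember >
+KOKKOS_INLINE_FUNCTION
+int example_single_broadcast( const TeamMember & team ) {
+  int token = 0 ;
+  // Only team rank 0 executes the lambda; the result is broadcast to all ranks.
+  Kokkos::single( Kokkos::PerTeam( team ) , [&]( int & t ) { t = team.league_rank(); } , token );
+  return token ;
+}
+#endif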
+
+#endif /* #ifndef KOKKOS_OPENMPTARGETEXEC_HPP */
+
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..33b0749e3f0c5a29feef65d161117c292cc08dff
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
@@ -0,0 +1,769 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMPTARGET_PARALLEL_HPP
+#define KOKKOS_OPENMPTARGET_PARALLEL_HPP
+
+#include <omp.h>
+#include <iostream>
+#include <Kokkos_Parallel.hpp>
+#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Traits ... >
+                 , Kokkos::Experimental::OpenMPTarget 
+                 >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ...  > Policy ;
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
+
+  const FunctorType m_functor ;
+  const Policy      m_policy ;
+
+
+public:
+
+  inline void execute() const {
+    execute_impl<WorkTag>();
+  }
+
+  template< class TagType >
+  inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  execute_impl() const
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
+      const typename Policy::member_type begin = m_policy.begin();
+      const typename Policy::member_type end = m_policy.end();
+      
+      #pragma omp target teams distribute parallel for map(to:this->m_functor)
+      for(int i=begin; i<end; i++)
+        m_functor(i);
+    }
+
+
+  template< class TagType >
+  inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  execute_impl() const
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
+      const typename Policy::member_type begin = m_policy.begin();
+      const typename Policy::member_type end = m_policy.end();
+
+      #pragma omp target teams distribute parallel for num_threads(128) map(to:this->m_functor)
+      for(int i=begin; i<end; i++)
+        m_functor(TagType(),i);
+    }
+
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , Policy arg_policy )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
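+
+// Illustrative sketch (not part of the original file, compiled out): the kind of
+// user-level call that the RangePolicy ParallelFor specialization above services;
+// each index of the range is offloaded through the `omp target teams distribute
+// parallel for` region. The view `x` and extent N are hypothetical.
+#if 0
+void example_range_for( Kokkos::View<double*> x , int N ) {
+  using ExecSpace = Kokkos::Experimental::OpenMPTarget;
+  Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace>( 0 , N ) ,
+    KOKKOS_LAMBDA( const int i ) { x(i) = 2.0 * x(i); } );
+}
+#endif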
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<class FunctorType, class PolicyType, class ReducerType, class PointerType, class ValueType, int FunctorHasJoin, int UseReducerType>
+struct ParallelReduceSpecialize {
+  static inline void execute(const FunctorType& f, const PolicyType& p , PointerType result_ptr) {
+    printf("Error: Invalid Specialization %i %i\n",FunctorHasJoin,UseReducerType);
+  }
+};
+
+template<class FunctorType, class ReducerType, class PointerType, class ValueType, class ... PolicyArgs>
+struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, ReducerType, PointerType, ValueType, 0,0> {
+  typedef Kokkos::RangePolicy<PolicyArgs...> PolicyType;
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+      const typename PolicyType::member_type begin = p.begin();
+      const typename PolicyType::member_type end = p.end();
+      
+      ValueType result = ValueType();
+      #pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom:result) reduction(+: result)
+      for(int i=begin; i<end; i++)
+        f(i,result);
+
+      *result_ptr=result;
+    }
+
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+      const typename PolicyType::member_type begin = p.begin();
+      const typename PolicyType::member_type end = p.end();
+
+      ValueType result = ValueType();
+      #pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom: result) reduction(+: result)
+      for(int i=begin; i<end; i++)
+        f(TagType(),i,result);
+      
+      *result_ptr=result;
+    }
+
+
+    inline static
+    void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
+      execute_impl<typename PolicyType::work_tag>(f,p,ptr);
+    }
+};
+/*
+template<class FunctorType, class PolicyType, class ReducerType, class PointerType, class ValueType>
+struct ParallelReduceSpecialize<FunctorType, PolicyType, ReducerType, PointerType, ValueType, 0,1> {
+
+  #pragma omp declare reduction(custom: ValueType : ReducerType::join(omp_out, omp_in)) initializer ( ReducerType::init(omp_priv) )
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
+      const typename PolicyType::member_type begin = p.begin();
+      const typename PolicyType::member_type end = p.end();
+
+      ValueType result = ValueType();
+      #pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom:result) reduction(custom: result)
+      for(int i=begin; i<end; i++)
+        f(i,result);
+
+      *result_ptr=result;
+    }
+
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
+      const typename PolicyType::member_type begin = p.begin();
+      const typename PolicyType::member_type end = p.end();
+
+      ValueType result = ValueType();
+      #pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom: result) reduction(custom: result)
+      for(int i=begin; i<end; i++)
+        f(TagType(),i,result);
+
+      *result_ptr=result;
+    }
+
+
+    inline static
+    void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
+      execute_impl<typename PolicyType::work_tag>(f,p,ptr);
+    }
+};
+*/
+
+template< class FunctorType , class ReducerType, class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::RangePolicy< Traits ...>
+                    , ReducerType
+                    , Kokkos::Experimental::OpenMPTarget
+                    >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, WorkTag, void>::type WorkTagFwd;
+
+  // Static Assert WorkTag void if ReducerType not InvalidType
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTagFwd > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTagFwd > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd , WorkTagFwd > ValueJoin ;
+
+  enum {HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
+  enum {UseReducer = is_reducer_type<ReducerType>::value };
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+  
+  typedef ParallelReduceSpecialize<FunctorType,Policy,ReducerType,pointer_type,typename ValueTraits::value_type,HasJoin,UseReducer> ParForSpecialize;
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ;
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+
+public: 
+  inline void execute() const {
+    ParForSpecialize::execute(m_functor,m_policy,m_result_ptr);    
+  }
+
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ViewType    & arg_result_view
+                , typename std::enable_if<
+                           Kokkos::is_view< ViewType >::value &&
+                           !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr(  arg_result_view.data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::Experimental::OpenMPTarget must be a Kokkos::View in HostSpace" );*/
+    }
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.result_view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::Experimental::OpenMPTarget must be a Kokkos::View in HostSpace" );*/
+    }
+
+};
+
+} // namespace Impl
+} // namespace Kokkos
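+
+// Illustrative sketch (not part of the original file, compiled out): a scalar sum
+// reduction that would be dispatched through the ParallelReduceSpecialize path above,
+// with the result written back to a host-side value. The view `x` and extent N are hypothetical.
+#if 0
+double example_range_sum( Kokkos::View<double*> x , int N ) {
+  using ExecSpace = Kokkos::Experimental::OpenMPTarget;
+  double sum = 0.0 ;
+  Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace>( 0 , N ) ,
+    KOKKOS_LAMBDA( const int i , double & val ) { val += x(i); } , sum );
+  return sum ;
+}
+#endif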
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelScan< FunctorType
+                  , Kokkos::RangePolicy< Traits ... >
+                  , Kokkos::Experimental::OpenMPTarget
+                  >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType, WorkTag > ValueJoin ;
+  typedef Kokkos::Impl::FunctorValueOps<    FunctorType, WorkTag > ValueOps ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ;
+/*
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update , const bool final )
+    {
+      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+      #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+      #pragma ivdep
+      #endif
+      #endif
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( iwork , update , final );
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update , const bool final )
+    {
+      const TagType t{} ;
+      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+      #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+      #pragma ivdep
+      #endif
+      #endif
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( t , iwork , update , final );
+      }
+    }
+*/
+public:
+
+  inline
+  void execute() const
+    {
+/*      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_scan");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_scan");
+
+      OpenMPTargetExec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
+
+#pragma omp parallel
+      {
+        OpenMPTargetExec & exec = * OpenMPTargetExec::get_thread_omp();
+        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
+        const pointer_type ptr =
+          pointer_type( exec.scratch_reduce() ) +
+          ValueTraits::value_count( m_functor );
+        ParallelScan::template exec_range< WorkTag >
+          ( m_functor , range.begin() , range.end()
+          , ValueInit::init( m_functor , ptr ) , false );
+      }
+
+      {
+        const unsigned thread_count = OpenMPTargetExec::pool_size();
+        const unsigned value_count  = ValueTraits::value_count( m_functor );
+
+        pointer_type ptr_prev = 0 ;
+
+        for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
+
+          pointer_type ptr = pointer_type( OpenMPTargetExec::pool_rev(rank_rev)->scratch_reduce() );
+
+          if ( ptr_prev ) {
+            for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
+            ValueJoin::join( m_functor , ptr + value_count , ptr );
+          }
+          else {
+            ValueInit::init( m_functor , ptr );
+          }
+
+          ptr_prev = ptr ;
+        }
+      }
+
+#pragma omp parallel
+      {
+        OpenMPTargetExec & exec = * OpenMPTargetExec::get_thread_omp();
+        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
+        const pointer_type ptr = pointer_type( exec.scratch_reduce() );
+        ParallelScan::template exec_range< WorkTag >
+          ( m_functor , range.begin() , range.end()
+          , ValueOps::reference( ptr ) , true );
+      }
+*/
+    }
+
+  //----------------------------------------
+
+  inline
+  ParallelScan( const FunctorType & arg_functor
+              , const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+  {}
+
+  //----------------------------------------
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Properties >
+class ParallelFor< FunctorType
+                 , Kokkos::TeamPolicy< Properties ... >
+                 , Kokkos::Experimental::OpenMPTarget
+                 >
+{
+private:
+
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::OpenMPTarget, Properties ... > Policy ;
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::member_type  Member ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+  const int          m_shmem_size ;
+
+public:
+
+  inline void execute() const {
+    OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
+    OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
+    execute_impl<WorkTag>();
+  }
+
+private:
+  template< class TagType >
+  inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  execute_impl() const
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
+      const int league_size = m_policy.league_size();
+      const int team_size = m_policy.team_size();
+      const int vector_length = m_policy.vector_length();
+      const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
+
+      OpenMPTargetExec::resize_scratch(0,Policy::member_type::TEAM_REDUCE_SIZE,0,0);
+      void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+
+      #pragma omp target teams distribute parallel for num_teams(league_size) num_threads(team_size*vector_length) schedule(static,1) \
+          map(to:this->m_functor,scratch_ptr) 
+      for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
+        typename Policy::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
+        m_functor(team);
+      }
+    }
+
+
+  template< class TagType >
+  inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  execute_impl() const
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
+      const int league_size = m_policy.league_size();
+      const int team_size = m_policy.team_size();
+      const int vector_length = m_policy.vector_length();
+      const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
+
+      OpenMPTargetExec::resize_scratch(0,Policy::member_type::TEAM_REDUCE_SIZE,0,0);
+      void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+      #pragma omp target teams distribute parallel for num_teams(league_size) num_threads(team_size*vector_length) schedule(static,1) \
+         map(to:this->m_functor,scratch_ptr)
+      for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
+        typename Policy::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
+        m_functor(TagType(), team);
+      }
+    }
+
+public:
+
+  inline
+  ParallelFor( const FunctorType & arg_functor ,
+               const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    {}
+};
+
+template<class FunctorType, class ReducerType, class PointerType, class ValueType, class ... PolicyArgs>
+struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, ReducerType, PointerType, ValueType, 0,0> {
+  typedef TeamPolicyInternal<PolicyArgs...> PolicyType;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+      
+      const int league_size = p.league_size();
+      const int team_size = p.team_size();
+      const int vector_length = p.vector_length();
+      const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
+      
+      OpenMPTargetExec::resize_scratch(0,PolicyType::member_type::TEAM_REDUCE_SIZE,0,0);
+      void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); 
+
+      ValueType result = ValueType();
+      #pragma omp target teams distribute parallel for num_teams(nteams) num_threads(team_size*vector_length) \
+         map(to:f,scratch_ptr) map(tofrom:result) reduction(+: result) schedule(static,1)
+      for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
+        typename PolicyType::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
+        f(team,result);
+        if(team.m_vector_lane!=0) result = 0;
+      }
+
+      *result_ptr=result;
+    }
+
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
+    {
+      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_reduce");
+
+      const int league_size = p.league_size();
+      const int team_size = p.team_size();
+      const int vector_length = p.vector_length();
+      const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
+
+      OpenMPTargetExec::resize_scratch(0,PolicyType::member_type::TEAM_REDUCE_SIZE,0,0);
+      void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+
+      ValueType result = ValueType();
+      #pragma omp target teams distribute parallel for num_teams(nteams) num_threads(team_size*vector_length) \
+         map(to:f,scratch_ptr) map(tofrom:result) reduction(+: result) schedule(static,1)
+      for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
+        typename PolicyType::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
+        f(TagType(),team,result);
+        if(team.m_vector_lane!=0) result = 0;
+      }
+      *result_ptr=result;
+    }
+
+
+    inline static
+    void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
+      execute_impl<typename PolicyType::work_tag>(f,p,ptr);
+    }
+};
+
+
+template< class FunctorType , class ReducerType, class ... Properties >
+class ParallelReduce< FunctorType
+                    , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
+                    , Kokkos::Experimental::OpenMPTarget
+                    >
+{
+private:
+
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::OpenMPTarget, Properties ... >         Policy ;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::member_type  Member ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, WorkTag, void>::type WorkTagFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTagFwd >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTagFwd >  ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd , WorkTagFwd >  ValueJoin ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename ValueTraits::value_type      value_type ;
+
+  enum {HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
+  enum {UseReducer = is_reducer_type<ReducerType>::value };
+
+  typedef ParallelReduceSpecialize<FunctorType,Policy,ReducerType,pointer_type,typename ValueTraits::value_type,HasJoin,UseReducer> ParForSpecialize;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+  const ReducerType  m_reducer ;
+  const pointer_type m_result_ptr ;
+  const int          m_shmem_size ;
+
+public:
+
+  inline
+  void execute() const {
+    ParForSpecialize::execute(m_functor,m_policy,m_result_ptr);   
+  }
+
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const Policy       & arg_policy ,
+                  const ViewType     & arg_result ,
+                  typename std::enable_if<
+                    Kokkos::is_view< ViewType >::value &&
+                    !Kokkos::is_reducer_type<ReducerType>::value
+                    ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result.ptr_on_device() )
+    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    {}
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+    , Policy       arg_policy
+    , const ReducerType& reducer )
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr(  reducer.result_view().data() )
+  , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+  {
+  /*static_assert( std::is_same< typename ViewType::memory_space
+                          , Kokkos::HostSpace >::value
+  , "Reduction result on Kokkos::Experimental::OpenMPTarget must be a Kokkos::View in HostSpace" );*/
+  }
+
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+
+namespace Kokkos {
+namespace Impl {
+
+  template<typename iType>
+  struct TeamThreadRangeBoundariesStruct<iType,OpenMPTargetExecTeamMember> {
+    typedef iType index_type;
+    const iType start;
+    const iType end;
+    const iType increment;
+
+    inline
+    TeamThreadRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& count):
+      start( thread_.team_rank() ),
+      end( count ),
+      increment( thread_.team_size() )
+    {}
+    inline
+    TeamThreadRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& begin_, const iType& end_):
+      start( begin_+thread_.team_rank() ),
+      end( end_ ),
+      increment( thread_.team_size() )
+    {}
+  };
+
+  template<typename iType>
+  struct ThreadVectorRangeBoundariesStruct<iType,OpenMPTargetExecTeamMember> {
+    typedef iType index_type;
+    const iType start;
+    const iType end;
+    const iType increment;
+
+    inline
+    ThreadVectorRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& count):
+      start( thread_.m_vector_lane ),
+      end( count ),
+      increment( thread_.m_vector_length )
+    {}
+    inline
+    ThreadVectorRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& begin_, const iType& end_):
+      start( begin_+thread_.m_vector_lane ),
+      end( end_ ),
+      increment( thread_.m_vector_length )
+    {}
+  };
+
+  template<typename iType>
+  KOKKOS_INLINE_FUNCTION
+  Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
+    TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
+    return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>(thread,count);
+  }
+  
+  template<typename iType>
+  KOKKOS_INLINE_FUNCTION
+  Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
+    TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& begin, const iType& end) {
+    return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>(thread,begin,end);
+  }
+
+  template<typename iType>
+  KOKKOS_INLINE_FUNCTION
+  Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >
+    ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
+    return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >(thread,count);
+  }
+
+  template<typename iType>
+  KOKKOS_INLINE_FUNCTION
+  Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
+    ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& begin, const iType& end) {
+    return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>(thread,begin,end);
+  }
+
+} // namespace Impl
+
+} // namespace Kokkos
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */
+
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dfd45486ef268c170fbb29a49dc6ac226424c832
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
@@ -0,0 +1,329 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#include <impl/Kokkos_TaskQueue_impl.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template class TaskQueue< Kokkos::Experimental::OpenMPTarget > ;
+
+//----------------------------------------------------------------------------
+
+TaskExec< Kokkos::Experimental::OpenMPTarget >::
+TaskExec()
+  : m_self_exec( 0 )
+  , m_team_exec( 0 )
+  , m_sync_mask( 0 )
+  , m_sync_value( 0 )
+  , m_sync_step( 0 )
+  , m_group_rank( 0 )
+  , m_team_rank( 0 )
+  , m_team_size( 1 )
+{
+}
+
+TaskExec< Kokkos::Experimental::OpenMPTarget >::
+TaskExec( Kokkos::Impl::OpenMPTargetExec & arg_exec , int const arg_team_size )
+  : m_self_exec( & arg_exec )
+  , m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
+  , m_sync_mask( 0 )
+  , m_sync_value( 0 )
+  , m_sync_step( 0 )
+  , m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
+  , m_team_rank(  arg_exec.pool_rank_rev() % arg_team_size )
+  , m_team_size(  arg_team_size )
+{
+  // This team spans
+  //    m_self_exec->pool_rev( team_size * group_rank )
+  //    m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
+
+  int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
+
+  sync[0] = int64_t(0) ;
+  sync[1] = int64_t(0) ;
+
+  for ( int i = 0 ; i < m_team_size ; ++i ) {
+    m_sync_value |= int64_t(1) << (8*i);
+    m_sync_mask  |= int64_t(3) << (8*i);
+  }
+
+  Kokkos::memory_fence();
+}
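+
+// Illustrative note (not from the original source): for m_team_size == 4 the loop
+// above yields m_sync_value == 0x01010101 and m_sync_mask == 0x03030303, i.e. one
+// "expected arrival" byte and a two-bit mask per team member, which
+// team_barrier_impl() compares against the shared sync word.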
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+void TaskExec< Kokkos::Experimental::OpenMPTarget >::team_barrier_impl() const
+{
+  if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
+    Kokkos::abort("TaskQueue<OpenMPTarget> scratch_reduce memory too small");
+  }
+
+  // Use team shared memory to synchronize.
+  // Alternate memory locations between barriers to avoid a sequence
+  // of barriers overtaking one another.
+
+  int64_t volatile * const sync =
+    ((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
+
+  // This team member sets one byte within the sync variable
+  int8_t volatile * const sync_self =
+   ((int8_t *) sync) + m_team_rank ;
+
+#if 0
+fprintf( stdout
+       , "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
+       , m_group_rank
+       , m_team_rank
+       , m_sync_step
+       , m_sync_value
+       , *sync
+       );
+fflush(stdout);
+#endif
+
+  *sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
+
+  while ( m_sync_value != *sync ); // wait for team to arrive
+
+#if 0
+fprintf( stdout
+       , "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
+       , m_group_rank
+       , m_team_rank
+       , m_sync_step
+       , m_sync_value
+       , *sync
+       );
+fflush(stdout);
+#endif
+
+  ++m_sync_step ;
+
+  if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
+    m_sync_value ^= m_sync_mask ;
+    if ( 1000 < m_sync_step ) m_sync_step = 0 ;
+  }
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+void TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget >::execute
+  ( TaskQueue< Kokkos::Experimental::OpenMPTarget > * const queue )
+{
+  using execution_space = Kokkos::Experimental::OpenMPTarget ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using PoolExec        = Kokkos::Impl::OpenMPTargetExec ;
+  using Member          = TaskExec< execution_space > ;
+
+  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+  // Required:  team_size <= 8
+
+  const int team_size = PoolExec::pool_size(2); // Threads per core
+  // const int team_size = PoolExec::pool_size(1); // Threads per NUMA
+
+  if ( 8 < team_size ) {
+    Kokkos::abort("TaskQueue<OpenMPTarget> unsupported team size");
+  }
+
+#pragma omp parallel
+  {
+    PoolExec & self = *PoolExec::get_thread_omp();
+
+    Member single_exec ;
+    Member team_exec( self , team_size );
+
+    // Team shared memory
+    task_root_type * volatile * const task_shared =
+      (task_root_type **) team_exec.m_team_exec->scratch_thread();
+
+// Barrier across entire OpenMPTarget thread pool to ensure initialization
+#pragma omp barrier
+
+    // Loop until all queues are empty and no tasks in flight
+
+    do {
+
+      task_root_type * task = 0 ;
+
+      // Each team lead attempts to acquire either a thread team task
+      // or a single thread task for the team.
+
+      if ( 0 == team_exec.team_rank() ) {
+
+        task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+
+        // Loop by priority and then type
+        for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+          for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+            task = queue_type::pop_task( & queue->m_ready[i][j] );
+          }
+        }
+      }
+
+      // Team lead broadcast acquired task to team members:
+
+      if ( 1 < team_exec.team_size() ) {
+
+        if ( 0 == team_exec.team_rank() ) *task_shared = task ;
+
+        // Fence to be sure task_shared is stored before the barrier
+        Kokkos::memory_fence();
+
+        // Whole team waits for every team member to reach this statement
+        team_exec.team_barrier();
+
+        // Fence to be sure task_shared is stored
+        Kokkos::memory_fence();
+
+        task = *task_shared ;
+      }
+
+#if 0
+fprintf( stdout
+       , "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
+       , team_exec.m_group_rank
+       , team_exec.m_team_rank
+       , uintptr_t(task_shared)
+       , uintptr_t(task)
+       );
+fflush(stdout);
+#endif
+
+      if ( 0 == task ) break ; // 0 == m_ready_count
+
+      if ( end == task ) {
+        // All team members wait for the whole team to reach this statement.
+        // This is necessary to prevent task_shared from being updated
+        // before it is read by all threads.
+        team_exec.team_barrier();
+      }
+      else if ( task_root_type::TaskTeam == task->m_task_type ) {
+        // Thread Team Task
+        (*task->m_apply)( task , & team_exec );
+
+        // The m_apply function performs a barrier
+
+        if ( 0 == team_exec.team_rank() ) {
+          // team member #0 completes the task, which may delete the task
+          queue->complete( task ); 
+        }
+      }
+      else {
+        // Single Thread Task
+
+        if ( 0 == team_exec.team_rank() ) {
+
+          (*task->m_apply)( task , & single_exec );
+
+          queue->complete( task ); 
+        }
+
+        // All team members wait for the whole team to reach this statement.
+        // This is not needed to complete the task, but it is necessary to
+        // prevent task_shared from being updated before it is read by all threads.
+        team_exec.team_barrier();
+      }
+    } while(1);
+  }
+// END #pragma omp parallel
+
+}
+
+void TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget >::
+  iff_single_thread_recursive_execute
+    ( TaskQueue< Kokkos::Experimental::OpenMPTarget > * const queue )
+{
+  using execution_space = Kokkos::Experimental::OpenMPTarget ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using Member          = TaskExec< execution_space > ;
+
+  if ( 1 == omp_get_num_threads() ) {
+
+    task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+    Member single_exec ;
+
+    task_root_type * task = end ;
+
+    do {
+
+      task = end ;
+
+      // Loop by priority and then type
+      for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+        for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+          task = queue_type::pop_task( & queue->m_ready[i][j] );
+        }
+      }
+
+      if ( end == task ) break ;
+
+      (*task->m_apply)( task , & single_exec );
+
+      queue->complete( task ); 
+
+    } while(1);
+  }
+}
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
+
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..fe9ac4abb4a8521b2d02e6a943aaf2fa05353cc5
--- /dev/null
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp
@@ -0,0 +1,356 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
+#define KOKKOS_IMPL_OPENMP_TASK_HPP
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+class TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget >
+{
+public:
+
+  using execution_space = Kokkos::Experimental::OpenMPTarget ;
+  using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
+  using task_base_type  = Kokkos::Impl::TaskBase< execution_space , void , void > ;
+
+  // Must specify memory space
+  using memory_space = Kokkos::HostSpace ;
+
+  static
+  void iff_single_thread_recursive_execute( queue_type * const );
+
+  // Must provide task queue execution function
+  static void execute( queue_type * const );
+
+  // Must provide mechanism to set function pointer in
+  // execution space from the host process.
+  template< typename FunctorType >
+  static
+  void proc_set_apply( task_base_type::function_type * ptr )
+    {
+      using TaskType = TaskBase< Kokkos::Experimental::OpenMPTarget
+                               , typename FunctorType::value_type
+                               , FunctorType
+                               > ;
+       *ptr = TaskType::apply ;
+    }
+};
+
+extern template class TaskQueue< Kokkos::Experimental::OpenMPTarget > ;
+
+//----------------------------------------------------------------------------
+
+template<>
+class TaskExec< Kokkos::Experimental::OpenMPTarget >
+{
+private:
+
+  TaskExec( TaskExec && ) = delete ;
+  TaskExec( TaskExec const & ) = delete ;
+  TaskExec & operator = ( TaskExec && ) = delete ;
+  TaskExec & operator = ( TaskExec const & ) = delete ;
+
+
+  using PoolExec = Kokkos::Impl::OpenMPTargetExec ;
+
+  friend class Kokkos::Impl::TaskQueue< Kokkos::Experimental::OpenMPTarget > ;
+  friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget > ;
+
+  PoolExec * const m_self_exec ;  ///< This thread's thread pool data structure 
+  PoolExec * const m_team_exec ;  ///< Team thread's thread pool data structure
+  int64_t          m_sync_mask ;
+  int64_t mutable  m_sync_value ;
+  int     mutable  m_sync_step ;
+  int              m_group_rank ; ///< Which "team" subset of thread pool
+  int              m_team_rank ;  ///< Which thread within a team
+  int              m_team_size ;
+
+  TaskExec();
+  TaskExec( PoolExec & arg_exec , int arg_team_size );
+
+  void team_barrier_impl() const ;
+
+public:
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  void * team_shared() const
+    { return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
+
+  int team_shared_size() const
+    { return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
+
+  /**\brief  Whole team enters this function call
+   *         before any team member returns from
+   *         this function call.
+   */
+  void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
+#else
+  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
+  KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
+#endif
+
+  KOKKOS_INLINE_FUNCTION
+  int team_rank() const { return m_team_rank ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int team_size() const { return m_team_size ; }
+};
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >
+TeamThreadRange
+  ( Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > & thread
+  , const iType & count )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >(thread,count);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >
+TeamThreadRange
+  ( Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > & thread
+  , const iType & start
+  , const iType & end )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >(thread,start,end);
+}
+
+/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team.
+ * This functionality requires C++11 support.
+*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries
+  , const Lambda& lambda
+  )
+{
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i);
+  }
+}
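+
+// Illustrative usage sketch (not part of this header): inside a task functor
+// that receives a TaskExec member, the nested parallel_for above can be used
+// as follows, where 'member', 'N', and 'data' are hypothetical names supplied
+// by the caller:
+//
+//   Kokkos::parallel_for( Kokkos::TeamThreadRange( member, N ),
+//                         [&]( const int i ) { data[i] *= 2; } );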
+
+template<typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries
+  , const Lambda& lambda
+  , ValueType& initialized_result)
+{
+  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
+  ValueType result = initialized_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i, result);
+  }
+
+  if ( 1 < loop_boundaries.thread.team_size() ) {
+
+    ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
+
+    loop_boundaries.thread.team_barrier();
+    shared[team_rank] = result;
+
+    loop_boundaries.thread.team_barrier();
+
+    // reduce across threads to thread 0
+    if (team_rank == 0) {
+      for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
+        shared[0] += shared[i];
+      }
+    }
+
+    loop_boundaries.thread.team_barrier();
+
+    // broadcast result
+    initialized_result = shared[0];
+  }
+  else {
+    initialized_result = result ;
+  }
+}
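+
+// Illustrative usage sketch (hypothetical names 'member', 'N', 'data'): a
+// nested team-level sum computed with the overload above,
+//
+//   double team_sum = 0;
+//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member, N ),
+//                            [&]( const int i, double & update ) { update += data[i]; },
+//                            team_sum );
+//
+// Note the cross-thread combine assumes team_shared() provides at least
+// team_size() values of the reduction type.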
+
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& initialized_result)
+{
+  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
+  ValueType result = initialized_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i, result);
+  }
+
+  if ( 1 < loop_boundaries.thread.team_size() ) {
+    ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
+
+    loop_boundaries.thread.team_barrier();
+    shared[team_rank] = result;
+
+    loop_boundaries.thread.team_barrier();
+
+    // reduce across threads to thread 0
+    if (team_rank == 0) {
+      for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
+        join(shared[0], shared[i]);
+      }
+    }
+
+    loop_boundaries.thread.team_barrier();
+
+    // broadcast result
+    initialized_result = shared[0];
+  }
+  else {
+    initialized_result = result ;
+  }
+}
+
+// placeholder for future function
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& initialized_result)
+{
+}
+
+// placeholder for future function
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& initialized_result)
+{
+}
+
+template< typename ValueType, typename iType, class Lambda >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
+   const Lambda & lambda)
+{
+  ValueType accum = 0 ;
+  ValueType val, local_total;
+  ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
+  int team_size = loop_boundaries.thread.team_size();
+  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
+
+  // Intra-member scan
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    local_total = 0;
+    lambda(i,local_total,false);
+    val = accum;
+    lambda(i,val,true);
+    accum += local_total;
+  }
+
+  shared[team_rank] = accum;
+  loop_boundaries.thread.team_barrier();
+
+  // Member 0 performs the scan on the accumulated totals
+  if (team_rank == 0) {
+    for( iType i = 1; i < team_size; i+=1) {
+      shared[i] += shared[i-1];
+    }
+    accum = 0; // Member 0 sets accum to 0 in preparation for the inter-member scan
+  }
+
+  loop_boundaries.thread.team_barrier();
+
+  // Inter-member scan adding in accumulated totals
+  if (team_rank != 0) { accum = shared[team_rank-1]; }
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    local_total = 0;
+    lambda(i,local_total,false);
+    val = accum;
+    lambda(i,val,true);
+    accum += local_total;
+  }
+}
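+
+// Worked example of the two-pass scan above (illustrative): suppose two team
+// members contribute values {1,2} and {3,4}.  Pass one records per-member
+// totals 3 and 7 in team shared memory; member 0 turns these into inclusive
+// totals {3,10}; member 1 then starts its second pass with offset shared[0]=3,
+// so the lambda's final invocations see the exclusive prefix sums 0,1,3,6.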
+
+// placeholder for future function
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
+   const Lambda & lambda)
+{
+}
+
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */
+
diff --git a/packages/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.cpp b/packages/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b37a4c63b93b7bf2ee45679c609ae0637f898e1d
--- /dev/null
+++ b/packages/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.cpp
@@ -0,0 +1,525 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_QTHREADS )
+
+#include <Kokkos_Core_fwd.hpp>
+
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <utility>
+
+#include <Kokkos_Qthreads.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+// Defines to enable experimental Qthreads functionality.
+//#define QTHREAD_LOCAL_PRIORITY
+//#define CLONED_TASKS
+
+//#include <qthread.h>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+namespace {
+
+enum { MAXIMUM_QTHREADS_WORKERS = 1024 };
+
+/** s_exec is indexed by the reverse rank of the workers
+ *  for faster fan-in / fan-out lookups
+ *  [ n - 1, n - 2, ..., 0 ]
+ */
+QthreadsExec * s_exec[ MAXIMUM_QTHREADS_WORKERS ];
+
+int  s_number_shepherds            = 0;
+int  s_number_workers_per_shepherd = 0;
+int  s_number_workers              = 0;
+
+inline
+QthreadsExec ** worker_exec()
+{
+  return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local( NULL ) + 1 );
+}
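+
+// Example (illustrative): with 2 shepherds and 2 workers per shepherd,
+// shepherd 0 / local worker 0 maps to s_exec[3] while shepherd 1 / local
+// worker 1 maps to s_exec[0], i.e. entries are stored in reverse worker-rank
+// order to match the fan-in / fan-out loops in QthreadsExec.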
+
+const int s_base_size = QthreadsExec::align_alloc( sizeof(QthreadsExec) );
+
+int s_worker_reduce_end   = 0;  // End of worker reduction memory.
+int s_worker_shared_end   = 0;  // Total of worker scratch memory.
+int s_worker_shared_begin = 0;  // Beginning of worker shared memory.
+
+QthreadsExecFunctionPointer volatile s_active_function     = 0;
+const void                * volatile s_active_function_arg = 0;
+
+} // namespace
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+int Qthreads::is_initialized()
+{
+  return Impl::s_number_workers != 0;
+}
+
+int Qthreads::concurrency()
+{
+  return Impl::s_number_workers_per_shepherd;
+}
+
+int Qthreads::in_parallel()
+{
+  return Impl::s_active_function != 0;
+}
+
+void Qthreads::initialize( int thread_count )
+{
+  // Environment variable: QTHREAD_NUM_SHEPHERDS
+  // Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
+  // Environment variable: QTHREAD_HWPAR
+
+  {
+    char buffer[256];
+    snprintf( buffer, sizeof(buffer), "QTHREAD_HWPAR=%d", thread_count );
+    putenv( buffer );
+  }
+
+  const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
+                       ( thread_count    == qthread_num_shepherds() * qthread_num_workers_local( NO_SHEPHERD ) ) &&
+                       ( thread_count    == qthread_num_workers() );
+
+  bool ok_symmetry = true;
+
+  if ( ok_init ) {
+    Impl::s_number_shepherds            = qthread_num_shepherds();
+    Impl::s_number_workers_per_shepherd = qthread_num_workers_local( NO_SHEPHERD );
+    Impl::s_number_workers              = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd;
+
+    for ( int i = 0; ok_symmetry && i < Impl::s_number_shepherds; ++i ) {
+      ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local( i ) );
+    }
+  }
+
+  if ( ! ok_init || ! ok_symmetry ) {
+    std::ostringstream msg;
+
+    msg << "Kokkos::Qthreads::initialize(" << thread_count << ") FAILED";
+    msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
+    msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local( NO_SHEPHERD );
+    msg << " : qthread_num_workers = " << qthread_num_workers();
+
+    if ( ! ok_symmetry ) {
+      msg << " : qthread_num_workers_local = {";
+      for ( int i = 0; i < Impl::s_number_shepherds; ++i ) {
+        msg << " " << qthread_num_workers_local( i );
+      }
+      msg << " }";
+    }
+
+    Impl::s_number_workers              = 0;
+    Impl::s_number_shepherds            = 0;
+    Impl::s_number_workers_per_shepherd = 0;
+
+    if ( ok_init ) { qthread_finalize(); }
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  Impl::QthreadsExec::resize_worker_scratch( 256, 256 );
+
+  // Init the array used for arbitrarily sized atomics.
+  Impl::init_lock_array_host_space();
+
+}
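+
+// Illustrative call sequence (a sketch, assuming initialize()/finalize() are
+// invoked directly as in other Kokkos host backends; '16' is an example
+// thread count):
+//
+//   Kokkos::Qthreads::initialize( 16 );   // also exports QTHREAD_HWPAR=16
+//   /* ... dispatch parallel work on Kokkos::Qthreads ... */
+//   Kokkos::Qthreads::finalize();
+//
+// initialize() throws a runtime exception if the shepherd / worker layout
+// reported by qthreads is asymmetric or does not match the requested count.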
+
+void Qthreads::finalize()
+{
+  Impl::QthreadsExec::clear_workers();
+
+  if ( Impl::s_number_workers ) {
+    qthread_finalize();
+  }
+
+  Impl::s_number_workers              = 0;
+  Impl::s_number_shepherds            = 0;
+  Impl::s_number_workers_per_shepherd = 0;
+}
+
+void Qthreads::print_configuration( std::ostream & s, const bool detail )
+{
+  s << "Kokkos::Qthreads {"
+    << " num_shepherds(" << Impl::s_number_shepherds << ")"
+    << " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")"
+    << " }" << std::endl;
+}
+
+Qthreads & Qthreads::instance( int )
+{
+  static Qthreads q;
+  return q;
+}
+
+void Qthreads::fence()
+{
+}
+
+int Qthreads::shepherd_size() const { return Impl::s_number_shepherds; }
+int Qthreads::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd; }
+
+const char* Qthreads::name() { return "Qthreads"; }
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+namespace {
+
+aligned_t driver_exec_all( void * arg )
+{
+  QthreadsExec & exec = **worker_exec();
+
+  (*s_active_function)( exec, s_active_function_arg );
+
+/*
+  fprintf( stdout
+         , "QthreadsExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
+         , exec.worker_rank()
+         , exec.worker_size()
+         , exec.shepherd_rank()
+         , exec.shepherd_size()
+         , exec.shepherd_worker_rank()
+         , exec.shepherd_worker_size()
+         );
+  fflush(stdout);
+*/
+
+  return 0;
+}
+
+aligned_t driver_resize_worker_scratch( void * arg )
+{
+  static volatile int lock_begin = 0;
+  static volatile int lock_end   = 0;
+
+  QthreadsExec ** const exec = worker_exec();
+
+  //----------------------------------------
+  // Serialize allocation for thread safety.
+
+  while ( ! atomic_compare_exchange_strong( & lock_begin, 0, 1 ) ); // Spin wait to claim lock.
+
+  const bool ok = 0 == *exec;
+
+  if ( ok ) { *exec = (QthreadsExec *) malloc( s_base_size + s_worker_shared_end ); }
+
+  lock_begin = 0; // Release lock.
+
+  if ( ok ) { new( *exec ) QthreadsExec(); }
+
+  //----------------------------------------
+  // Wait for all calls to complete to ensure that each worker has executed.
+
+  if ( s_number_workers == 1 + atomic_fetch_add( & lock_end, 1 ) ) { lock_end = 0; }
+
+  while ( lock_end );
+
+/*
+  fprintf( stdout
+         , "QthreadsExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
+         , (**exec).worker_rank()
+         , (**exec).worker_size()
+         , (**exec).shepherd_rank()
+         , (**exec).shepherd_size()
+         , (**exec).shepherd_worker_rank()
+         , (**exec).shepherd_worker_size()
+         );
+  fflush(stdout);
+*/
+
+  //----------------------------------------
+
+  if ( ! ok ) {
+    fprintf( stderr, "Kokkos::QthreadsExec resize failed\n" );
+    fflush( stderr );
+  }
+
+  return 0;
+}
+
+void verify_is_process( const char * const label, bool not_active = false )
+{
+  const bool not_process = 0 != qthread_shep() || 0 != qthread_worker_local( NULL );
+  const bool is_active   = not_active && ( s_active_function || s_active_function_arg );
+
+  if ( not_process || is_active ) {
+    std::string msg( label );
+    msg.append( " : FAILED" );
+    if ( not_process ) msg.append(" : not called by main process");
+    if ( is_active )   msg.append(" : parallel execution in progress");
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+}
+
+} // namespace
+
+int QthreadsExec::worker_per_shepherd()
+{
+  return s_number_workers_per_shepherd;
+}
+
+QthreadsExec::QthreadsExec()
+{
+  const int shepherd_rank        = qthread_shep();
+  const int shepherd_worker_rank = qthread_worker_local( NULL );
+  const int worker_rank          = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank;
+
+  m_worker_base          = s_exec;
+  m_shepherd_base        = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) );
+  m_scratch_alloc        = ( (unsigned char *) this ) + s_base_size;
+  m_reduce_end           = s_worker_reduce_end;
+  m_shepherd_rank        = shepherd_rank;
+  m_shepherd_size        = s_number_shepherds;
+  m_shepherd_worker_rank = shepherd_worker_rank;
+  m_shepherd_worker_size = s_number_workers_per_shepherd;
+  m_worker_rank          = worker_rank;
+  m_worker_size          = s_number_workers;
+  m_worker_state         = QthreadsExec::Active;
+}
+
+void QthreadsExec::clear_workers()
+{
+  for ( int iwork = 0; iwork < s_number_workers; ++iwork ) {
+    QthreadsExec * const exec = s_exec[iwork];
+    s_exec[iwork] = 0;
+    free( exec );
+  }
+}
+
+void QthreadsExec::shared_reset( Qthreads::scratch_memory_space & space )
+{
+  new( & space )
+    Qthreads::scratch_memory_space(
+      ((unsigned char *) (**m_shepherd_base).m_scratch_alloc ) + s_worker_shared_begin,
+      s_worker_shared_end - s_worker_shared_begin
+    );
+}
+
+void QthreadsExec::resize_worker_scratch( const int reduce_size, const int shared_size )
+{
+  const int exec_all_reduce_alloc = align_alloc( reduce_size );
+  const int shepherd_scan_alloc   = align_alloc( 8 );
+  const int shepherd_shared_end   = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );
+
+  if ( s_worker_reduce_end < exec_all_reduce_alloc ||
+       s_worker_shared_end < shepherd_shared_end ) {
+
+/*
+  fprintf( stdout, "QthreadsExec::resize\n");
+  fflush(stdout);
+*/
+
+    // Clear current worker memory before allocating new worker memory.
+    clear_workers();
+
+    // Increase the buffers to an aligned allocation.
+    s_worker_reduce_end   = exec_all_reduce_alloc;
+    s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc;
+    s_worker_shared_end   = shepherd_shared_end;
+
+    // Need to query which shepherd this main 'process' is running.
+
+    const int main_shep = qthread_shep();
+
+    // Have each worker resize its memory for proper first-touch.
+#if 0
+    for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
+      for ( int i = jshep != main_shep ? 0 : 1; i < s_number_workers_per_shepherd; ++i ) {
+        qthread_fork_to( driver_resize_worker_scratch, NULL, NULL, jshep );
+      }
+    }
+#else
+    // If this function is used before the 'qthreads.task_policy' unit test,
+    // the 'qthreads.task_policy' unit test fails with a seg-fault within libqthread.so.
+    for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
+      const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1;
+
+      if ( num_clone ) {
+        const int ret = qthread_fork_clones_to_local_priority
+          ( driver_resize_worker_scratch   // Function
+          , NULL                           // Function data block
+          , NULL                           // Pointer to return value feb
+          , jshep                          // Shepherd number
+          , num_clone - 1                  // Number of instances - 1
+          );
+
+        assert( ret == QTHREAD_SUCCESS );
+      }
+    }
+#endif
+
+    driver_resize_worker_scratch( NULL );
+
+    // Verify all workers allocated.
+
+    bool ok = true;
+    for ( int iwork = 0; ok && iwork < s_number_workers; ++iwork ) { ok = 0 != s_exec[iwork]; }
+
+    if ( ! ok ) {
+      std::ostringstream msg;
+      msg << "Kokkos::Impl::QthreadsExec::resize : FAILED for workers {";
+      for ( int iwork = 0; iwork < s_number_workers; ++iwork ) {
+         if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
+      }
+      msg << " }";
+      Kokkos::Impl::throw_runtime_exception( msg.str() );
+    }
+  }
+}
+
+void QthreadsExec::exec_all( Qthreads &, QthreadsExecFunctionPointer func, const void * arg )
+{
+  verify_is_process("QthreadsExec::exec_all(...)",true);
+
+/*
+  fprintf( stdout, "QthreadsExec::exec_all\n");
+  fflush(stdout);
+*/
+
+  s_active_function     = func;
+  s_active_function_arg = arg;
+
+  // Need to query which shepherd this main 'process' is running.
+
+  const int main_shep = qthread_shep();
+
+#if 0
+  for ( int jshep = 0, iwork = 0; jshep < s_number_shepherds; ++jshep ) {
+    for ( int i = jshep != main_shep ? 0 : 1; i < s_number_workers_per_shepherd; ++i, ++iwork ) {
+      qthread_fork_to( driver_exec_all, NULL, NULL, jshep );
+    }
+  }
+#else
+  // If this function is used before the 'qthreads.task_policy' unit test,
+  // the 'qthreads.task_policy' unit test fails with a seg-fault within libqthread.so.
+  for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
+    const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1;
+
+    if ( num_clone ) {
+      const int ret = qthread_fork_clones_to_local_priority
+        ( driver_exec_all   // Function
+        , NULL              // Function data block
+        , NULL              // Pointer to return value feb
+        , jshep             // Shepherd number
+        , num_clone - 1     // Number of instances - 1
+        );
+
+      assert(ret == QTHREAD_SUCCESS);
+    }
+  }
+#endif
+
+  driver_exec_all( NULL );
+
+  s_active_function     = 0;
+  s_active_function_arg = 0;
+}
+
+void * QthreadsExec::exec_all_reduce_result()
+{
+  return s_exec[0]->m_scratch_alloc;
+}
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+namespace Impl {
+
+QthreadsTeamPolicyMember::QthreadsTeamPolicyMember()
+  : m_exec( **worker_exec() )
+  , m_team_shared( 0, 0 )
+  , m_team_size( 1 )
+  , m_team_rank( 0 )
+  , m_league_size( 1 )
+  , m_league_end( 1 )
+  , m_league_rank( 0 )
+{
+  m_exec.shared_reset( m_team_shared );
+}
+
+QthreadsTeamPolicyMember::QthreadsTeamPolicyMember( const QthreadsTeamPolicyMember::TaskTeam & )
+  : m_exec( **worker_exec() )
+  , m_team_shared( 0, 0 )
+  , m_team_size( s_number_workers_per_shepherd )
+  , m_team_rank( m_exec.shepherd_worker_rank() )
+  , m_league_size( 1 )
+  , m_league_end( 1 )
+  , m_league_rank( 0 )
+{
+  m_exec.shared_reset( m_team_shared );
+}
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+#else
+void KOKKOS_SRC_QTHREADS_EXEC_PREVENT_LINK_ERROR() {}
+#endif // #if defined( KOKKOS_ENABLE_QTHREADS )
+
diff --git a/packages/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp b/packages/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b2a3ebe4149909301ce5c5759d5c0676f333b413
--- /dev/null
+++ b/packages/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp
@@ -0,0 +1,645 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREADSEXEC_HPP
+#define KOKKOS_QTHREADSEXEC_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_QTHREADS )
+
+#include <impl/Kokkos_Spinwait.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+class QthreadsExec;
+
+typedef void (*QthreadsExecFunctionPointer)( QthreadsExec &, const void * );
+
+class QthreadsExec {
+private:
+  enum { Inactive = 0, Active = 1 };
+
+  const QthreadsExec * const * m_worker_base;
+  const QthreadsExec * const * m_shepherd_base;
+
+  void  * m_scratch_alloc;  ///< Scratch memory [ reduce, team, shared ]
+  int     m_reduce_end;     ///< End of scratch reduction memory
+
+  int     m_shepherd_rank;
+  int     m_shepherd_size;
+
+  int     m_shepherd_worker_rank;
+  int     m_shepherd_worker_size;
+
+  /*
+   *  m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
+   *  m_worker_size = m_shepherd_size * m_shepherd_worker_size
+   */
+  int     m_worker_rank;
+  int     m_worker_size;
+
+  int mutable volatile m_worker_state;
+
+  friend class Kokkos::Qthreads;
+
+  ~QthreadsExec();
+  QthreadsExec( const QthreadsExec & );
+  QthreadsExec & operator = ( const QthreadsExec & );
+
+public:
+  QthreadsExec();
+
+  /** Execute the input function on all available Qthreads workers. */
+  static void exec_all( Qthreads &, QthreadsExecFunctionPointer, const void * );
+
+  /** Barrier across all workers participating in the 'exec_all'. */
+  void exec_all_barrier() const
+  {
+    const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+    int n, j;
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      Impl::spinwait_while_equal( m_worker_base[j]->m_worker_state, QthreadsExec::Active );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      m_worker_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+  }
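+
+  /* Illustrative trace of the fan-in / fan-out above for 4 workers
+   * (reverse ranks 0..3): reverse ranks 1 and 3 mark themselves Inactive
+   * immediately; reverse rank 2 first waits for 3, then goes Inactive;
+   * reverse rank 0 waits for 1 and 2, then re-activates them, and rank 2
+   * in turn re-activates rank 3, releasing the whole barrier.
+   */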
+
+  /** Barrier across workers within the shepherd with rank < team_size. */
+  void shepherd_barrier( const int team_size ) const
+  {
+    if ( m_shepherd_worker_rank < team_size ) {
+
+      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+      int n, j;
+
+      for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+        Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
+      }
+
+      if ( rev_rank ) {
+        m_worker_state = QthreadsExec::Inactive;
+        Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+      }
+
+      for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+        m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
+      }
+    }
+  }
+
+  /** Reduce across all workers participating in the 'exec_all'. */
+  template< class FunctorType, class ReducerType, class ArgTag >
+  inline
+  void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
+  {
+    typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
+    typedef typename ReducerConditional::type ReducerTypeFwd;
+    typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin;
+
+    const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+    int n, j;
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      const QthreadsExec & fan = *m_worker_base[j];
+
+      Impl::spinwait_while_equal( fan.m_worker_state, QthreadsExec::Active );
+
+      ValueJoin::join( ReducerConditional::select( func, reduce ), m_scratch_alloc, fan.m_scratch_alloc );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      m_worker_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+  }
+
+  /** Scan across all workers participating in the 'exec_all'. */
+  template< class FunctorType, class ArgTag >
+  inline
+  void exec_all_scan( const FunctorType & func ) const
+  {
+    typedef Kokkos::Impl::FunctorValueInit< FunctorType, ArgTag > ValueInit;
+    typedef Kokkos::Impl::FunctorValueJoin< FunctorType, ArgTag > ValueJoin;
+    typedef Kokkos::Impl::FunctorValueOps<  FunctorType, ArgTag > ValueOps;
+
+    const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+    int n, j;
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      Impl::spinwait_while_equal( m_worker_base[j]->m_worker_state, QthreadsExec::Active );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+    else {
+      // Root thread scans across values before releasing threads.
+      // Worker data is in reverse order, so m_worker_base[0] is the
+      // highest ranking thread.
+
+      // Copy from lower ranking to higher ranking worker.
+      for ( int i = 1; i < m_worker_size; ++i ) {
+        ValueOps::copy( func
+                      , m_worker_base[i-1]->m_scratch_alloc
+                      , m_worker_base[i]->m_scratch_alloc
+                      );
+      }
+
+      ValueInit::init( func, m_worker_base[m_worker_size-1]->m_scratch_alloc );
+
+      // Join from lower ranking to higher ranking worker.
+      // Value at m_worker_base[n-1] is zero so skip adding it to m_worker_base[n-2].
+      for ( int i = m_worker_size - 1; --i > 0; ) {
+        ValueJoin::join( func, m_worker_base[i-1]->m_scratch_alloc, m_worker_base[i]->m_scratch_alloc );
+      }
+    }
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      m_worker_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+  }
+
+  //----------------------------------------
+
+  template< class Type >
+  inline
+  volatile Type * shepherd_team_scratch_value() const
+  { return (volatile Type*)( ( (unsigned char *) m_scratch_alloc ) + m_reduce_end ); }
+
+  template< class Type >
+  inline
+  void shepherd_broadcast( Type & value, const int team_size, const int team_rank ) const
+  {
+    if ( m_shepherd_base ) {
+      Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+      if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value; }
+      memory_fence();
+      shepherd_barrier( team_size );
+      value = *shared_value;
+    }
+  }
+
+  template< class Type >
+  inline
+  Type shepherd_reduce( const int team_size, const Type & value ) const
+  {
+    volatile Type * const shared_value = shepherd_team_scratch_value<Type>();
+    *shared_value = value;
+//    *shepherd_team_scratch_value<Type>() = value;
+
+    memory_fence();
+
+    const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+    int n, j;
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+    else {
+      Type & accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+      for ( int i = 1; i < team_size; ++i ) {
+        accum += *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
+      }
+      for ( int i = 1; i < team_size; ++i ) {
+        *m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum;
+      }
+
+      memory_fence();
+    }
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+
+    return *shepherd_team_scratch_value<Type>();
+  }
+
+  template< class JoinOp >
+  inline
+  typename JoinOp::value_type
+  shepherd_reduce( const int team_size
+                 , const typename JoinOp::value_type & value
+                 , const JoinOp & op ) const
+  {
+    typedef typename JoinOp::value_type Type;
+
+    volatile Type * const shared_value = shepherd_team_scratch_value<Type>();
+    *shared_value = value;
+//    *shepherd_team_scratch_value<Type>() = value;
+
+    memory_fence();
+
+    const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+    int n, j;
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+    else {
+      volatile Type & accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+      for ( int i = 1; i < team_size; ++i ) {
+        op.join( accum, *m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
+      }
+      for ( int i = 1; i < team_size; ++i ) {
+        *m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum;
+      }
+
+      memory_fence();
+    }
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+
+    return *shepherd_team_scratch_value<Type>();
+  }
+
+  template< class Type >
+  inline
+  Type shepherd_scan( const int team_size
+                    , const Type & value
+                    ,       Type * const global_value = 0 ) const
+  {
+    *shepherd_team_scratch_value<Type>() = value;
+
+    memory_fence();
+
+    const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+    int n, j;
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+    else {
+      // Root thread scans across values before releasing threads.
+      // Worker data is in reverse order, so m_shepherd_base[0] is the
+      // highest ranking thread.
+
+      // Copy from lower ranking to higher ranking worker.
+
+      Type accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+      for ( int i = 1; i < team_size; ++i ) {
+        const Type tmp = *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
+        accum += tmp;
+        *m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp;
+      }
+
+      *m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
+        global_value ? atomic_fetch_add( global_value, accum ) : 0;
+
+      // Join from lower ranking to higher ranking worker.
+      for ( int i = team_size; --i; ) {
+        *m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
+      }
+
+      memory_fence();
+    }
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+
+    return *shepherd_team_scratch_value<Type>();
+  }
+
+  //----------------------------------------
+
+  static inline
+  int align_alloc( int size )
+  {
+    enum { ALLOC_GRAIN = 1 << 6 /* power of two, 64 bytes */ };
+    enum { ALLOC_GRAIN_MASK = ALLOC_GRAIN - 1 };
+    return ( size + ALLOC_GRAIN_MASK ) & ~ALLOC_GRAIN_MASK;
+  }
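+
+  // Example: align_alloc(1) == 64, align_alloc(64) == 64, align_alloc(65) == 128;
+  // requests are rounded up to the 64-byte ALLOC_GRAIN above.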
+
+  void shared_reset( Qthreads::scratch_memory_space & );
+
+  void * exec_all_reduce_value() const { return m_scratch_alloc; }
+
+  static void * exec_all_reduce_result();
+
+  static void resize_worker_scratch( const int reduce_size, const int shared_size );
+  static void clear_workers();
+
+  //----------------------------------------
+
+  inline int worker_rank() const { return m_worker_rank; }
+  inline int worker_size() const { return m_worker_size; }
+  inline int shepherd_worker_rank() const { return m_shepherd_worker_rank; }
+  inline int shepherd_worker_size() const { return m_shepherd_worker_size; }
+  inline int shepherd_rank() const { return m_shepherd_rank; }
+  inline int shepherd_size() const { return m_shepherd_size; }
+
+  static int worker_per_shepherd();
+};
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+class QthreadsTeamPolicyMember {
+private:
+  typedef Kokkos::Qthreads                       execution_space;
+  typedef execution_space::scratch_memory_space  scratch_memory_space;
+
+  Impl::QthreadsExec   & m_exec;
+  scratch_memory_space   m_team_shared;
+  const int              m_team_size;
+  const int              m_team_rank;
+  const int              m_league_size;
+  const int              m_league_end;
+        int              m_league_rank;
+
+public:
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & team_shmem() const { return m_team_shared; }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; }
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  {}
+#else
+  { m_exec.shepherd_barrier( m_team_size ); }
+#endif
+
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value, int rank ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  { return Type(); }
+#else
+  { return m_exec.template shepherd_broadcast<Type>( value, m_team_size, rank ); }
+#endif
+
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  { return Type(); }
+#else
+  { return m_exec.template shepherd_reduce<Type>( m_team_size, value ); }
+#endif
+
+  template< typename JoinOp >
+  KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
+  team_reduce( const typename JoinOp::value_type & value
+             , const JoinOp & op ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  { return typename JoinOp::value_type(); }
+#else
+  { return m_exec.template shepherd_reduce<JoinOp>( m_team_size, value, op ); }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value;
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  { return Type(); }
+#else
+  { return m_exec.template shepherd_scan<Type>( m_team_size, value ); }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the league's
+   *  parallel execution, be the scan's total.  Parallel execution ordering of
+   *  the league's teams is non-deterministic.  As such the base value for each
+   *  team's scan operation is similarly non-deterministic.
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value, Type * const global_accum ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  { return Type(); }
+#else
+  { return m_exec.template shepherd_scan<Type>( m_team_size, value, global_accum ); }
+#endif
+
+  //----------------------------------------
+  // Private driver for task-team parallel.
+
+  struct TaskTeam {};
+
+  QthreadsTeamPolicyMember();
+  explicit QthreadsTeamPolicyMember( const TaskTeam & );
+
+  //----------------------------------------
+  // Private for the driver: for ( member_type i( exec, team ); i; i.next_team() ) { ... }
+
+  // Initialize.
+  template< class ... Properties >
+  QthreadsTeamPolicyMember( Impl::QthreadsExec & exec
+                          , const Kokkos::Impl::TeamPolicyInternal< Qthreads, Properties... > & team )
+    : m_exec( exec )
+    , m_team_shared( 0, 0 )
+    , m_team_size( team.m_team_size )
+    , m_team_rank( exec.shepherd_worker_rank() )
+    , m_league_size( team.m_league_size )
+    , m_league_end( team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
+    , m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
+  {
+    m_exec.shared_reset( m_team_shared );
+  }
+
+  // Continue.
+  operator bool () const { return m_league_rank < m_league_end; }
+
+  // Iterate.
+  void next_team() { ++m_league_rank; m_exec.shared_reset( m_team_shared ); }
+};
+
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::Qthreads, Properties ... >
+  : public PolicyTraits< Properties... >
+{
+private:
+  const int m_league_size;
+  const int m_team_size;
+  const int m_shepherd_iter;
+
+public:
+  //! Tag this class as a kokkos execution policy.
+  typedef TeamPolicyInternal              execution_policy;
+  typedef Qthreads                        execution_space;
+  typedef PolicyTraits< Properties ... >  traits;
+
+  //----------------------------------------
+
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & )
+  { return Qthreads::instance().shepherd_worker_size(); }
+
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & f )
+  { return team_size_max( f ); }
+
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType & f, const int& )
+  { return team_size_max( f ); }
+
+  //----------------------------------------
+
+  inline int team_size()   const { return m_team_size; }
+  inline int league_size() const { return m_league_size; }
+
+  // One active team per shepherd.
+  TeamPolicyInternal( Kokkos::Qthreads & q
+                    , const int league_size
+                    , const int team_size
+                    , const int /* vector_length */ = 0
+                    )
+    : m_league_size( league_size )
+    , m_team_size( team_size < q.shepherd_worker_size()
+                 ? team_size : q.shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
+  {}
+
+  // TODO: Make sure this is correct.
+  // One active team per shepherd.
+  TeamPolicyInternal( Kokkos::Qthreads & q
+                    , const int league_size
+                    , const Kokkos::AUTO_t & /* team_size_request */
+                    , const int /* vector_length */ = 0
+                    )
+    : m_league_size( league_size )
+    , m_team_size( q.shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
+  {}
+
+  // One active team per shepherd.
+  TeamPolicyInternal( const int league_size
+                    , const int team_size
+                    , const int /* vector_length */ = 0
+                    )
+    : m_league_size( league_size )
+    , m_team_size( team_size < Qthreads::instance().shepherd_worker_size()
+                 ? team_size : Qthreads::instance().shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + Qthreads::instance().shepherd_size() - 1 ) / Qthreads::instance().shepherd_size() )
+  {}
+
+  // TODO: Make sure this is correct.
+  // One active team per shepherd.
+  TeamPolicyInternal( const int league_size
+                    , const Kokkos::AUTO_t & /* team_size_request */
+                    , const int /* vector_length */ = 0
+                    )
+    : m_league_size( league_size )
+    , m_team_size( Qthreads::instance().shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + Qthreads::instance().shepherd_size() - 1 ) / Qthreads::instance().shepherd_size() )
+  {}
+
+  // TODO: Doesn't do anything yet.  Fix this.
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
+    TeamPolicyInternal p = *this;
+//    p.m_chunk_size = chunk_size_;
+    return p;
+  }
+
+  typedef Impl::QthreadsTeamPolicyMember member_type;
+
+  friend class Impl::QthreadsTeamPolicyMember;
+};
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+#endif
+#endif // #ifndef KOKKOS_QTHREADSEXEC_HPP
+
diff --git a/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_Parallel.hpp b/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_Parallel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..00580a4933897148db0e41af033ca422df311d08
--- /dev/null
+++ b/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_Parallel.hpp
@@ -0,0 +1,734 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREADS_PARALLEL_HPP
+#define KOKKOS_QTHREADS_PARALLEL_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_QTHREADS )
+
+#include <vector>
+
+#include <Kokkos_Parallel.hpp>
+
+#include <impl/Kokkos_StaticAssert.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+#include <Qthreads/Kokkos_QthreadsExec.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Traits ... >
+                 , Kokkos::Qthreads
+                 >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... >  Policy ;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::WorkRange    WorkRange ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor , const Member ibeg , const Member iend )
+    {
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( i );
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor , const Member ibeg , const Member iend )
+    {
+      const TagType t{} ;
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( t , i );
+      }
+    }
+
+  // Function is called once by every concurrent thread.
+  static void exec( QthreadsExec & exec , const void * arg )
+  {
+    const ParallelFor & self = * ((const ParallelFor *) arg );
+
+    const WorkRange range( self.m_policy, exec.worker_rank(), exec.worker_size() );
+
+    ParallelFor::template exec_range< WorkTag > ( self.m_functor , range.begin() , range.end() );
+
+    // All threads wait for completion.
+    exec.exec_all_barrier();
+  }
+
+public:
+
+  inline
+  void execute() const
+    {
+      Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelFor::exec , this );
+
+    }
+
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy      & arg_policy
+             )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    { }
+};
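+
+// Illustrative dispatch sketch (hypothetical names 'N', 'x', 'y', 'a'): a
+// RangePolicy parallel_for on the Qthreads backend routes through the
+// specialization above,
+//
+//   Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Qthreads >( 0, N ),
+//                         KOKKOS_LAMBDA( const int i ) { y[i] = a * x[i] + y[i]; } );
+//
+// exec_all() invokes ParallelFor::exec on every worker; each worker processes
+// its WorkRange slice and then joins the exec_all_barrier().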
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class ReducerType , class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::RangePolicy< Traits ... >
+                    , ReducerType
+                    , Kokkos::Qthreads
+                    >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... >  Policy ;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, WorkTag, void >::type WorkTagFwd;
+
+  // Static Assert WorkTag void if ReducerType not InvalidType
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTagFwd > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTagFwd > ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ;
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update )
+    {
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( i , update );
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update )
+    {
+      const TagType t{} ;
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( t , i , update );
+      }
+    }
+
+  static void exec( QthreadsExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+
+    const WorkRange range( self.m_policy, exec.worker_rank(), exec.worker_size() );
+
+    ParallelReduce::template exec_range< WorkTag >(
+      self.m_functor, range.begin(), range.end(),
+      ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer)
+                     , exec.exec_all_reduce_value() ) );
+
+    exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
+  }
+
+public:
+
+  inline
+  void execute() const
+    {
+      QthreadsExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
+      Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelReduce::exec , this );
+
+      const pointer_type data = (pointer_type) QthreadsExec::exec_all_reduce_result();
+
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , data );
+
+      if ( m_result_ptr ) {
+        const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+        for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
+      }
+    }
+
+  template< class ViewType >
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const ViewType & arg_result_view
+                , typename std::enable_if<Kokkos::is_view< ViewType >::value &&
+                                          !Kokkos::is_reducer_type< ReducerType >::value
+                                          , void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result_view.data() )
+    { }
+
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr( reducer.result_view().data() )
+    { }
+};
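+
+// Illustrative usage (a sketch, not part of this header): a range reduction
+// that would be dispatched to the ParallelReduce specialization above when
+// Qthreads is the selected execution space.  The lambda and the names N and
+// sum are hypothetical.
+//
+//   double sum = 0.0;
+//   Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Qthreads >( 0, N ),
+//                            KOKKOS_LAMBDA( const int i, double & update )
+//                              { update += 1.0; }, sum );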
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class ... Properties >
+class ParallelFor< FunctorType
+                 , TeamPolicy< Properties ... >
+                 , Kokkos::Qthreads >
+{
+private:
+
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthreads , Properties ... > Policy ;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::work_tag     WorkTag ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      while ( member ) {
+        functor( member );
+        member.team_barrier();
+        member.next_team();
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      const TagType t{} ;
+      while ( member ) {
+        functor( t , member );
+        member.team_barrier();
+        member.next_team();
+      }
+    }
+
+  static void exec( QthreadsExec & exec , const void * arg )
+  {
+    const ParallelFor & self = * ((const ParallelFor *) arg );
+
+    ParallelFor::template exec_team< WorkTag >
+      ( self.m_functor , Member( exec , self.m_policy ) );
+
+    exec.exec_all_barrier();
+  }
+
+public:
+
+  inline
+  void execute() const
+    {
+      QthreadsExec::resize_worker_scratch
+        ( /* reduction   memory */ 0
+        , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
+      Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelFor::exec , this );
+    }
+
+  ParallelFor( const FunctorType & arg_functor ,
+               const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    { }
+};
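+
+// Illustrative usage (a sketch, not part of this header): a team-based
+// parallel_for dispatched to the ParallelFor specialization above.  The
+// league/team sizes and the body are hypothetical.
+//
+//   typedef Kokkos::TeamPolicy< Kokkos::Qthreads >::member_type member_type;
+//   Kokkos::TeamPolicy< Kokkos::Qthreads > policy( league_size, team_size );
+//   Kokkos::parallel_for( policy, KOKKOS_LAMBDA( const member_type & member )
+//     { /* per-team work, typically nested TeamThreadRange loops (see below) */ } );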
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class ReducerType , class ... Properties >
+class ParallelReduce< FunctorType
+                    , TeamPolicy< Properties... >
+                    , ReducerType
+                    , Kokkos::Qthreads
+                    >
+{
+private:
+
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthreads , Properties ... > Policy ;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::member_type  Member ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, WorkTag, void >::type WorkTagFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTagFwd >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTagFwd >  ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+  const ReducerType  m_reducer ;
+  const pointer_type m_result_ptr ;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_team( const FunctorType & functor , Member member , reference_type update )
+    {
+      while ( member ) {
+        functor( member , update );
+        member.team_barrier();
+        member.next_team();
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_team( const FunctorType & functor , Member member , reference_type update )
+    {
+      const TagType t{} ;
+      while ( member ) {
+        functor( t , member , update );
+        member.team_barrier();
+        member.next_team();
+      }
+    }
+
+  static void exec( QthreadsExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+
+    ParallelReduce::template exec_team< WorkTag >
+      ( self.m_functor
+      , Member( exec , self.m_policy )
+      , ValueInit::init( ReducerConditional::select( self.m_functor , self.m_reducer )
+                       , exec.exec_all_reduce_value() ) );
+
+    exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
+  }
+
+public:
+
+  inline
+  void execute() const
+    {
+      QthreadsExec::resize_worker_scratch
+        ( /* reduction   memory */ ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) )
+        , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
+
+      Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelReduce::exec , this );
+
+      const pointer_type data = (pointer_type) QthreadsExec::exec_all_reduce_result();
+
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer), data );
+
+      if ( m_result_ptr ) {
+        const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+        for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
+      }
+    }
+
+  template< class ViewType >
+  ParallelReduce( const FunctorType & arg_functor
+                , const Policy      & arg_policy
+                , const ViewType    & arg_result
+                , typename std::enable_if<Kokkos::is_view< ViewType >::value &&
+                                          !Kokkos::is_reducer_type< ReducerType >::value
+                                          , void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result.ptr_on_device() )
+    { }
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+  : m_functor( arg_functor )
+  , m_policy( arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr( reducer.result_view().data() )
+  { }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class ... Traits >
+class ParallelScan< FunctorType
+                  , Kokkos::RangePolicy< Traits ... >
+                  , Kokkos::Qthreads
+                  >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... >  Policy ;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update , const bool final )
+    {
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( i , update , final );
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update , const bool final )
+    {
+      const TagType t{} ;
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( t , i , update , final );
+      }
+    }
+
+  static void exec( QthreadsExec & exec , const void * arg )
+  {
+    const ParallelScan & self = * ((const ParallelScan *) arg );
+
+    const WorkRange range( self.m_policy , exec.worker_rank() , exec.worker_size() );
+
+    // Initialize thread-local value
+    reference_type update = ValueInit::init( self.m_functor , exec.exec_all_reduce_value() );
+
+    ParallelScan::template exec_range< WorkTag >( self.m_functor, range.begin() , range.end() , update , false );
+
+    exec.template exec_all_scan< FunctorType , typename Policy::work_tag >( self.m_functor );
+
+    ParallelScan::template exec_range< WorkTag >( self.m_functor , range.begin() , range.end() , update , true );
+
+    exec.exec_all_barrier();
+  }
+
+public:
+
+  inline
+  void execute() const
+    {
+      QthreadsExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
+      Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelScan::exec , this );
+    }
+
+  ParallelScan( const FunctorType & arg_functor
+              , const Policy      & arg_policy
+              )
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    {
+    }
+};
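+
+// Illustrative usage (a sketch, not part of this header): an exclusive prefix
+// sum over a range, dispatched to the ParallelScan specialization above.  The
+// functor signature (i, update, final) matches what exec_range expects; the
+// views counts and offsets are hypothetical.
+//
+//   Kokkos::parallel_scan( Kokkos::RangePolicy< Kokkos::Qthreads >( 0, N ),
+//     KOKKOS_LAMBDA( const int i, int & update, const bool final )
+//       {
+//         if ( final ) { offsets(i) = update; }  // exclusive prefix sum
+//         update += counts(i);
+//       } );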
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< typename iType >
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >
+TeamThreadRange( const Impl::QthreadsTeamPolicyMember& thread, const iType& count )
+{
+  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >( thread, count );
+}
+
+template< typename iType1, typename iType2 >
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
+                                       Impl::QthreadsTeamPolicyMember >
+TeamThreadRange( const Impl::QthreadsTeamPolicyMember& thread, const iType1 & begin, const iType2 & end )
+{
+  typedef typename std::common_type< iType1, iType2 >::type iType;
+  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >( thread, iType(begin), iType(end) );
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >
+  ThreadVectorRange(const Impl::QthreadsTeamPolicyMember& thread, const iType& count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >(thread,count);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember> PerTeam(const Impl::QthreadsTeamPolicyMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember> PerThread(const Impl::QthreadsTeamPolicyMember& thread) {
+  return Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>(thread);
+}
+
+/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team.
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries, const Lambda& lambda) {
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
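+
+// Example (illustrative sketch, not part of this header), assuming 'member'
+// is a team handle and 'x' is a hypothetical view:
+//
+//   Kokkos::parallel_for( Kokkos::TeamThreadRange( member, N ),
+//                         [&]( const int i ) { x(i) = 2 * x(i); } );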
+
+/** \brief  Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries,
+                     const Lambda & lambda, ValueType& result) {
+
+  result = ValueType();
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+
+  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
+}
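+
+// Example (illustrative sketch, not part of this header), assuming 'member'
+// is a team handle and 'x' a hypothetical view:
+//
+//   double team_sum = 0.0;
+//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member, N ),
+//                            [&]( const int i, double & val ) { val += x(i); },
+//                            team_sum );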
+
+
+/** \brief  Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries,
+                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+
+  init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
+}
+
+/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread.
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
+    loop_boundaries, const Lambda& lambda) {
+  #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+  #pragma ivdep
+  #endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
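+
+// Example (illustrative sketch, not part of this header), nested inside a
+// TeamThreadRange lambda; 'member' and 'y' are hypothetical:
+//
+//   Kokkos::parallel_for( Kokkos::ThreadVectorRange( member, M ),
+//                         [&]( const int j ) { y(j) = 0.0; } );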
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
+      loop_boundaries, const Lambda & lambda, ValueType& result) {
+  result = ValueType();
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+}
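+
+// Example (illustrative sketch, not part of this header): a per-thread vector
+// dot-product contribution; 'a', 'b', and 'member' are hypothetical:
+//
+//   double dot = 0.0;
+//   Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( member, M ),
+//                            [&]( const int j, double & val ) { val += a(j) * b(j); },
+//                            dot );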
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+  init_result = result;
+}
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
+ *          for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
+ * Depending on the target execution space the operator might be called twice: once with final=false
+ * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
+ * "i" needs to be added to val no matter whether final==true or not. In a serial execution
+ * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
+ * to the final sum value over all vector lanes.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
+      loop_boundaries, const FunctorType & lambda) {
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
+  typedef typename ValueTraits::value_type value_type ;
+
+  value_type scan_val = value_type();
+
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,scan_val,true);
+  }
+}
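+
+// Example (illustrative sketch, not part of this header): an exclusive prefix
+// sum over vector lanes; 'member', 'counts', and 'offsets' are hypothetical:
+//
+//   Kokkos::parallel_scan( Kokkos::ThreadVectorRange( member, M ),
+//     [&]( const int j, int & update, const bool final )
+//       {
+//         if ( final ) { offsets(j) = update; }
+//         update += counts(j);
+//       } );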
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda) {
+  lambda();
+}
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda) {
+  if(single_struct.team_member.team_rank()==0) lambda();
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  lambda(val);
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  if(single_struct.team_member.team_rank()==0) {
+    lambda(val);
+  }
+  single_struct.team_member.team_broadcast(val,0);
+}
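+
+// Example (illustrative sketch, not part of this header): compute a value
+// once per team and broadcast it to every team member; 'member' and 'x' are
+// hypothetical:
+//
+//   double pivot = 0.0;
+//   Kokkos::single( Kokkos::PerTeam( member ),
+//                   [&]( double & val ) { val = x(0); }, pivot );
+//   // After the call every member of the team holds the broadcast 'pivot'.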
+
+} // namespace Kokkos
+
+#endif
+#endif /* #ifndef KOKKOS_QTHREADS_PARALLEL_HPP */
+
diff --git a/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.cpp b/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..52f57f5e45a205f80fd1effb21c5033b15bc08f0
--- /dev/null
+++ b/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
@@ -0,0 +1,320 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_TaskQueue_impl.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template class TaskQueue< Kokkos::Qthreads > ;
+
+//----------------------------------------------------------------------------
+
+TaskExec< Kokkos::Qthreads >::TaskExec()
+  : m_self_exec( 0 ),
+    m_team_exec( 0 ),
+    m_sync_mask( 0 ),
+    m_sync_value( 0 ),
+    m_sync_step( 0 ),
+    m_group_rank( 0 ),
+    m_team_rank( 0 ),
+    m_team_size( 1 )
+{}
+
+TaskExec< Kokkos::Qthreads >::
+TaskExec( Kokkos::Impl::QthreadsExec & arg_exec, int const arg_team_size )
+  : m_self_exec( & arg_exec ),
+    m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) ),
+    m_sync_mask( 0 ),
+    m_sync_value( 0 ),
+    m_sync_step( 0 ),
+    m_group_rank( arg_exec.pool_rank_rev() / arg_team_size ),
+    m_team_rank( arg_exec.pool_rank_rev() % arg_team_size ),
+    m_team_size( arg_team_size )
+{
+  // This team spans
+  //    m_self_exec->pool_rev( team_size * group_rank )
+  //    m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
+
+  int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
+
+  sync[0] = int64_t(0) ;
+  sync[1] = int64_t(0) ;
+
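+  // Each team member owns one byte of the 64-bit sync word, which is why the
+  // task queue requires team_size <= 8.  The low two bits of each byte carry
+  // the per-member arrival value that team_barrier() toggles between steps.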
+  for ( int i = 0 ; i < m_team_size ; ++i ) {
+    m_sync_value |= int64_t(1) << (8*i);
+    m_sync_mask  |= int64_t(3) << (8*i);
+  }
+
+  Kokkos::memory_fence();
+}
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+void TaskExec< Kokkos::Qthreads >::team_barrier() const
+{
+  if ( 1 < m_team_size ) {
+
+    if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
+      Kokkos::abort("TaskQueue<Qthreads> scratch_reduce memory too small");
+    }
+
+    // Use team shared memory to synchronize.
+    // Alternate memory locations between barriers to avoid a sequence
+    // of barriers overtaking one another.
+
+    int64_t volatile * const sync =
+      ((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
+
+    // This team member sets one byte within the sync variable
+    int8_t volatile * const sync_self =
+     ((int8_t *) sync) + m_team_rank ;
+
+#if 0
+fprintf( stdout,
+         "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n",
+         m_group_rank,
+         m_team_rank,
+         m_sync_step,
+         m_sync_value,
+         *sync
+       );
+fflush(stdout);
+#endif
+
+    *sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
+
+    while ( m_sync_value != *sync ); // wait for team to arrive
+
+#if 0
+fprintf( stdout,
+         "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n",
+         m_group_rank,
+         m_team_rank,
+         m_sync_step,
+         m_sync_value,
+         *sync
+       );
+fflush(stdout);
+#endif
+
+    ++m_sync_step ;
+
+    if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
+      m_sync_value ^= m_sync_mask ;
+      if ( 1000 < m_sync_step ) m_sync_step = 0 ;
+    }
+  }
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+void TaskQueueSpecialization< Kokkos::Qthreads >::execute
+  ( TaskQueue< Kokkos::Qthreads > * const queue )
+{
+  using execution_space = Kokkos::Qthreads ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space, void, void > ;
+  using PoolExec        = Kokkos::Impl::QthreadsExec ;
+  using Member          = TaskExec< execution_space > ;
+
+  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+  // Required:  team_size <= 8
+
+  const int team_size = PoolExec::pool_size(2); // Threads per core
+  // const int team_size = PoolExec::pool_size(1); // Threads per NUMA
+
+  if ( 8 < team_size ) {
+    Kokkos::abort("TaskQueue<Qthreads> unsupported team size");
+  }
+
+#pragma omp parallel
+  {
+    PoolExec & self = *PoolExec::get_thread_omp();
+
+    Member single_exec ;
+    Member team_exec( self, team_size );
+
+    // Team shared memory
+    task_root_type * volatile * const task_shared =
+      (task_root_type **) team_exec.m_team_exec->scratch_thread();
+
+// Barrier across the entire Qthreads thread pool to ensure initialization
+#pragma omp barrier
+
+    // Loop until all queues are empty and no tasks in flight
+
+    do {
+
+      // Each team lead attempts to acquire either a thread team task
+      // or collection of single thread tasks for the team.
+
+      if ( 0 == team_exec.team_rank() ) {
+
+        task_root_type * tmp =
+          0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+
+        // Loop by priority and then type
+        for ( int i = 0 ; i < queue_type::NumQueue && end == tmp ; ++i ) {
+          for ( int j = 0 ; j < 2 && end == tmp ; ++j ) {
+            tmp = queue_type::pop_task( & queue->m_ready[i][j] );
+          }
+        }
+
+        *task_shared = tmp ;
+
+        // Fence to be sure the store to *task_shared is visible
+        Kokkos::memory_fence();
+      }
+
+      // Whole team waits for every team member to reach this statement
+      team_exec.team_barrier();
+
+      Kokkos::memory_fence();
+
+      task_root_type * const task = *task_shared ;
+
+#if 0
+fprintf( stdout,
+         "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n",
+         team_exec.m_group_rank,
+         team_exec.m_team_rank,
+         uintptr_t(task_shared),
+         uintptr_t(task)
+       );
+fflush(stdout);
+#endif
+
+      if ( 0 == task ) break ; // 0 == m_ready_count
+
+      if ( end == task ) {
+        team_exec.team_barrier();
+      }
+      else if ( task_root_type::TaskTeam == task->m_task_type ) {
+        // Thread Team Task
+        (*task->m_apply)( task, & team_exec );
+
+        // The m_apply function performs a barrier
+
+        if ( 0 == team_exec.team_rank() ) {
+          // team member #0 completes the task, which may delete the task
+          queue->complete( task );
+        }
+      }
+      else {
+        // Single Thread Task
+
+        if ( 0 == team_exec.team_rank() ) {
+
+          (*task->m_apply)( task, & single_exec );
+
+          queue->complete( task );
+        }
+
+        // All team members wait for the whole team to reach this statement.
+        // This is not needed to complete the task, but it does prevent
+        // task_shared from being updated before every thread has read it.
+        team_exec.team_barrier();
+      }
+    } while(1);
+  }
+// END #pragma omp parallel
+
+}
+
+void TaskQueueSpecialization< Kokkos::Qthreads >::
+  iff_single_thread_recursive_execute
+    ( TaskQueue< Kokkos::Qthreads > * const queue )
+{
+  using execution_space = Kokkos::Qthreads ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space, void, void > ;
+  using Member          = TaskExec< execution_space > ;
+
+  if ( 1 == omp_get_num_threads() ) {
+
+    task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+    Member single_exec ;
+
+    task_root_type * task = end ;
+
+    do {
+
+      task = end ;
+
+      // Loop by priority and then type
+      for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+        for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+          task = queue_type::pop_task( & queue->m_ready[i][j] );
+        }
+      }
+
+      if ( end == task ) break ;
+
+      (*task->m_apply)( task, & single_exec );
+
+      queue->complete( task );
+
+    } while(1);
+  }
+}
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+#else
+void KOKKOS_SRC_QTHREADS_TASK_PREVENT_LINK_ERROR() {}
+#endif /* #if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
diff --git a/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.hpp b/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..35fc6599779b134e5a8844ac8295f7dd8f7d2a25
--- /dev/null
+++ b/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.hpp
@@ -0,0 +1,157 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP
+#define KOKKOS_IMPL_QTHREADS_TASK_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+class TaskQueueSpecialization< Kokkos::Qthreads >
+{
+public:
+
+  using execution_space = Kokkos::Qthreads ;
+  using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
+  using task_base_type  = Kokkos::Impl::TaskBase< execution_space, void, void > ;
+
+  // Must specify memory space
+  using memory_space = Kokkos::HostSpace ;
+
+  static
+  void iff_single_thread_recursive_execute( queue_type * const );
+
+  // Must provide task queue execution function
+  static void execute( queue_type * const );
+
+  // Must provide mechanism to set function pointer in
+  // execution space from the host process.
+  template< typename FunctorType >
+  static
+  void proc_set_apply( task_base_type::function_type * ptr )
+    {
+      using TaskType = TaskBase< execution_space,
+                                 typename FunctorType::value_type,
+                                 FunctorType
+                               > ;
+       *ptr = TaskType::apply ;
+    }
+};
+
+extern template class TaskQueue< Kokkos::Qthreads > ;
+
+//----------------------------------------------------------------------------
+
+template<>
+class TaskExec< Kokkos::Qthreads >
+{
+private:
+
+  TaskExec( TaskExec && ) = delete ;
+  TaskExec( TaskExec const & ) = delete ;
+  TaskExec & operator = ( TaskExec && ) = delete ;
+  TaskExec & operator = ( TaskExec const & ) = delete ;
+
+
+  using PoolExec = Kokkos::Impl::QthreadsExec ;
+
+  friend class Kokkos::Impl::TaskQueue< Kokkos::Qthreads > ;
+  friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Qthreads > ;
+
+  PoolExec * const m_self_exec ;  ///< This thread's thread pool data structure
+  PoolExec * const m_team_exec ;  ///< Team thread's thread pool data structure
+  int64_t          m_sync_mask ;
+  int64_t mutable  m_sync_value ;
+  int     mutable  m_sync_step ;
+  int              m_group_rank ; ///< Which "team" subset of thread pool
+  int              m_team_rank ;  ///< Which thread within a team
+  int              m_team_size ;
+
+  TaskExec();
+  TaskExec( PoolExec & arg_exec, int arg_team_size );
+
+public:
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  void * team_shared() const
+    { return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
+
+  int team_shared_size() const
+    { return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
+
+  /**\brief  Whole team enters this function call
+   *         before any team member returns from
+   *         this function call.
+   */
+  void team_barrier() const ;
+#else
+  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
+  KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
+#endif
+
+  KOKKOS_INLINE_FUNCTION
+  int team_rank() const { return m_team_rank ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int team_size() const { return m_team_size ; }
+};
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP */
+
diff --git a/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.cpp.old b/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.cpp.old
new file mode 100644
index 0000000000000000000000000000000000000000..a59afb2881e9f6771f8b9808af07ed6f54e25496
--- /dev/null
+++ b/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.cpp.old
@@ -0,0 +1,493 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD.
+
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_QTHREADS )
+
+#include <Kokkos_Core_fwd.hpp>
+
+#include <cstdio>
+#include <cstdlib>
+
+#include <stdexcept>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include <Kokkos_Atomic.hpp>
+#include <Qthreads/Kokkos_Qthreads_TaskPolicy.hpp>
+
+#if defined( KOKKOS_ENABLE_TASKDAG )
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+typedef TaskMember< Kokkos::Qthreads , void , void > Task ;
+
+namespace {
+
+inline
+unsigned padded_sizeof_derived( unsigned sizeof_derived )
+{
+  return sizeof_derived +
+    ( sizeof_derived % sizeof(Task*) ? sizeof(Task*) - sizeof_derived % sizeof(Task*) : 0 );
+}
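+
+// Worked example (illustrative only): with sizeof(Task*) == 8, a derived task
+// of 44 bytes is padded to 48 (44 % 8 == 4, so 8 - 4 == 4 bytes are added),
+// while a 48-byte task is already aligned and stays 48.  The dependence
+// pointer array then starts on a Task* boundary immediately after the task.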
+
+// int lock_alloc_dealloc = 0 ;
+
+} // namespace
+
+void Task::deallocate( void * ptr )
+{
+  // Counting on 'free' thread safety so lock/unlock not required.
+  // However, isolate calls here to mitigate future need to introduce lock/unlock.
+
+  // lock
+
+  // while ( ! Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 0 , 1 ) );
+
+  free( ptr );
+
+  // unlock
+
+  // Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 1 , 0 );
+}
+
+void * Task::allocate( const unsigned arg_sizeof_derived
+                     , const unsigned arg_dependence_capacity )
+{
+  // Counting on 'malloc' thread safety so lock/unlock not required.
+  // However, isolate calls here to mitigate future need to introduce lock/unlock.
+
+  // lock
+
+  // while ( ! Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 0 , 1 ) );
+
+  void * const ptr = malloc( padded_sizeof_derived( arg_sizeof_derived ) + arg_dependence_capacity * sizeof(Task*) );
+
+  // unlock
+
+  // Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 1 , 0 );
+
+  return ptr ;
+}
+
+Task::~TaskMember()
+{
+
+}
+
+
+Task::TaskMember( const function_verify_type   arg_verify
+                , const function_dealloc_type  arg_dealloc
+                , const function_single_type   arg_apply_single
+                , const function_team_type     arg_apply_team
+                , volatile int &               arg_active_count
+                , const unsigned               arg_sizeof_derived
+                , const unsigned               arg_dependence_capacity
+                )
+  : m_dealloc( arg_dealloc )
+  , m_verify(  arg_verify )
+  , m_apply_single( arg_apply_single )
+  , m_apply_team( arg_apply_team )
+  , m_active_count( & arg_active_count )
+  , m_qfeb(0)
+  , m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
+  , m_dep_capacity( arg_dependence_capacity )
+  , m_dep_size( 0 )
+  , m_ref_count( 0 )
+  , m_state( Kokkos::Experimental::TASK_STATE_CONSTRUCTING )
+{
+  qthread_empty( & m_qfeb ); // Set to full when complete
+  for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
+}
+
+Task::TaskMember( const function_dealloc_type  arg_dealloc
+                , const function_single_type   arg_apply_single
+                , const function_team_type     arg_apply_team
+                , volatile int &               arg_active_count
+                , const unsigned               arg_sizeof_derived
+                , const unsigned               arg_dependence_capacity
+                )
+  : m_dealloc( arg_dealloc )
+  , m_verify(  & Task::verify_type<void> )
+  , m_apply_single( arg_apply_single )
+  , m_apply_team( arg_apply_team )
+  , m_active_count( & arg_active_count )
+  , m_qfeb(0)
+  , m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
+  , m_dep_capacity( arg_dependence_capacity )
+  , m_dep_size( 0 )
+  , m_ref_count( 0 )
+  , m_state( Kokkos::Experimental::TASK_STATE_CONSTRUCTING )
+{
+  qthread_empty( & m_qfeb ); // Set to full when complete
+  for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
+}
+
+//----------------------------------------------------------------------------
+
+void Task::throw_error_add_dependence() const
+{
+  std::cerr << "TaskMember< Qthreads >::add_dependence ERROR"
+            << " state(" << m_state << ")"
+            << " dep_size(" << m_dep_size << ")"
+            << std::endl ;
+  throw std::runtime_error("TaskMember< Qthreads >::add_dependence ERROR");
+}
+
+void Task::throw_error_verify_type()
+{
+  throw std::runtime_error("TaskMember< Qthreads >::verify_type ERROR");
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
+{
+  static const char msg_error_header[]      = "Kokkos::Impl::TaskManager<Kokkos::Qthreads>::assign ERROR" ;
+  static const char msg_error_count[]       = ": negative reference count" ;
+  static const char msg_error_complete[]    = ": destroy task that is not complete" ;
+  static const char msg_error_dependences[] = ": destroy task that has dependences" ;
+  static const char msg_error_exception[]   = ": caught internal exception" ;
+
+  if ( rhs ) { Kokkos::atomic_increment( &(*rhs).m_ref_count ); }
+
+  Task * const lhs_val = Kokkos::atomic_exchange( lhs , rhs );
+
+  if ( lhs_val ) {
+
+    const int count = Kokkos::atomic_fetch_add( & (*lhs_val).m_ref_count , -1 );
+
+    const char * msg_error = 0 ;
+
+    try {
+
+      if ( 1 == count ) {
+
+        // Reference count at zero, delete it
+
+        // Should only be deallocating a completed task
+        if ( (*lhs_val).m_state == Kokkos::Experimental::TASK_STATE_COMPLETE ) {
+
+          // A completed task should not have dependences...
+          for ( int i = 0 ; i < (*lhs_val).m_dep_size && 0 == msg_error ; ++i ) {
+            if ( (*lhs_val).m_dep[i] ) msg_error = msg_error_dependences ;
+          }
+        }
+        else {
+          msg_error = msg_error_complete ;
+        }
+
+        if ( 0 == msg_error ) {
+          // Get deletion function and apply it
+          const Task::function_dealloc_type d = (*lhs_val).m_dealloc ;
+
+          (*d)( lhs_val );
+        }
+      }
+      else if ( count <= 0 ) {
+        msg_error = msg_error_count ;
+      }
+    }
+    catch( ... ) {
+      if ( 0 == msg_error ) msg_error = msg_error_exception ;
+    }
+
+    if ( 0 != msg_error ) {
+      if ( no_throw ) {
+        std::cerr << msg_error_header << msg_error << std::endl ;
+        std::cerr.flush();
+      }
+      else {
+        std::string msg(msg_error_header);
+        msg.append(msg_error);
+        throw std::runtime_error( msg );
+      }
+    }
+  }
+}
+#endif
+
+
+//----------------------------------------------------------------------------
+
+void Task::closeout()
+{
+  enum { RESPAWN = int( Kokkos::Experimental::TASK_STATE_WAITING ) |
+                   int( Kokkos::Experimental::TASK_STATE_EXECUTING ) };
+
+#if 0
+fprintf( stdout
+       , "worker(%d.%d) task 0x%.12lx %s\n"
+       , qthread_shep()
+       , qthread_worker_local(NULL)
+       , reinterpret_cast<unsigned long>(this)
+       , ( m_state == RESPAWN ? "respawn" : "complete" )
+       );
+fflush(stdout);
+#endif
+
+  // When dependent tasks run there would be a race
+  // condition between destroying this task and
+  // querying the active count pointer from this task.
+  int volatile * const active_count = m_active_count ;
+
+  if ( m_state == RESPAWN ) {
+    // Task requests respawn, set state to waiting and reschedule the task
+    m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
+    schedule();
+  }
+  else {
+
+    // Task did not respawn, is complete
+    m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;
+
+    // Release dependences before allowing dependent tasks to run.
+    // Otherwise there is a thread race condition for removing dependences.
+    for ( int i = 0 ; i < m_dep_size ; ++i ) {
+      assign( & m_dep[i] , 0 );
+    }
+
+    // Set Qthreads FEB to full so that dependent tasks are allowed to execute.
+    // This 'task' may be deleted immediately following this function call.
+    qthread_fill( & m_qfeb );
+
+    // The dependent task could now complete and destroy 'this' task
+    // before the call to 'qthread_fill' returns.  Therefore, for
+    // thread safety assume that 'this' task has now been destroyed.
+  }
+
+  // Decrement active task count before returning.
+  Kokkos::atomic_decrement( active_count );
+}
+
+aligned_t Task::qthread_func( void * arg )
+{
+  Task * const task = reinterpret_cast< Task * >(arg);
+
+  // First member of the team change state to executing.
+  // Use compare-exchange to avoid race condition with a respawn.
+  Kokkos::atomic_compare_exchange_strong( & task->m_state
+                                        , int(Kokkos::Experimental::TASK_STATE_WAITING)
+                                        , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
+                                        );
+
+  if ( task->m_apply_team && ! task->m_apply_single ) {
+    Kokkos::Impl::QthreadsTeamPolicyMember::TaskTeam task_team_tag ;
+
+    // Initialize team size and rank with shepherd info
+    Kokkos::Impl::QthreadsTeamPolicyMember member( task_team_tag );
+
+    (*task->m_apply_team)( task , member );
+
+#if 0
+fprintf( stdout
+       , "worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n"
+       , qthread_shep()
+       , qthread_worker_local(NULL)
+       , reinterpret_cast<unsigned long>(task)
+       , member.team_rank()
+       , member.team_size()
+       );
+fflush(stdout);
+#endif
+
+    member.team_barrier();
+    if ( member.team_rank() == 0 ) task->closeout();
+    member.team_barrier();
+  }
+  else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
+    // Team hard-wired to one, no cloning
+    Kokkos::Impl::QthreadsTeamPolicyMember member ;
+    (*task->m_apply_team)( task , member );
+    task->closeout();
+  }
+  else {
+    (*task->m_apply_single)( task );
+    task->closeout();
+  }
+
+#if 0
+fprintf( stdout
+       , "worker(%d.%d) task 0x%.12lx return\n"
+       , qthread_shep()
+       , qthread_worker_local(NULL)
+       , reinterpret_cast<unsigned long>(task)
+       );
+fflush(stdout);
+#endif
+
+  return 0 ;
+}
+
+void Task::respawn()
+{
+  // Change state from pure executing to ( waiting | executing )
+  // to avoid confusion with simply waiting.
+  Kokkos::atomic_compare_exchange_strong( & m_state
+                                        , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
+                                        , int(Kokkos::Experimental::TASK_STATE_WAITING |
+                                              Kokkos::Experimental::TASK_STATE_EXECUTING)
+                                        );
+}
+
+void Task::schedule()
+{
+  // Is waiting for execution
+
+  // Increment active task count before spawning.
+  Kokkos::atomic_increment( m_active_count );
+
+  // Spawn in Qthreads.  The precondition array must be malloc'ed and handed
+  // to Qthreads; Qthreads will eventually free the allocation, so the memory
+  // is not leaked.
+
+  // Concern: is malloc thread safe here, or does this call need to be guarded?
+  aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) );
+
+  qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );
+
+  for ( int i = 0 ; i < m_dep_size ; ++i ) {
+    qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthreads precondition flag
+  }
+
+  if ( m_apply_team && ! m_apply_single ) {
+    // If more than one shepherd spawn on a shepherd other than this shepherd
+    const int num_shepherd            = qthread_num_shepherds();
+    const int num_worker_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
+    const int this_shepherd           = qthread_shep();
+
+    int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ;
+
+#if 0
+fprintf( stdout
+       , "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n"
+       , qthread_shep()
+       , qthread_worker_local(NULL)
+       , reinterpret_cast<unsigned long>(this)
+       , spawn_shepherd
+       , num_worker_per_shepherd - 1
+       );
+fflush(stdout);
+#endif
+
+    qthread_spawn_cloneable
+      ( & Task::qthread_func
+      , this
+      , 0
+      , NULL
+      , m_dep_size , qprecon /* dependences */
+      , spawn_shepherd
+      , unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY )
+      , num_worker_per_shepherd - 1
+      );
+  }
+  else {
+    qthread_spawn( & Task::qthread_func /* function */
+                 , this                 /* function argument */
+                 , 0
+                 , NULL
+                 , m_dep_size , qprecon /* dependences */
+                 , NO_SHEPHERD
+                 , QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
+                 );
+  }
+}
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+
+TaskPolicy< Kokkos::Qthreads >::
+TaskPolicy
+  ( const unsigned /* arg_task_max_count */
+  , const unsigned /* arg_task_max_size */
+  , const unsigned arg_task_default_dependence_capacity
+  , const unsigned arg_task_team_size
+  )
+  : m_default_dependence_capacity( arg_task_default_dependence_capacity )
+  , m_team_size( arg_task_team_size != 0 ? arg_task_team_size : unsigned(qthread_num_workers_local(NO_SHEPHERD)) )
+  , m_active_count_root(0)
+  , m_active_count( m_active_count_root )
+{
+  const unsigned num_worker_per_shepherd = unsigned( qthread_num_workers_local(NO_SHEPHERD) );
+
+  if ( m_team_size != 1 && m_team_size != num_worker_per_shepherd ) {
+    std::ostringstream msg ;
+    msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads >( "
+        << "default_depedence = " << arg_task_default_dependence_capacity
+        << " , team_size = " << arg_task_team_size
+        << " ) ERROR, valid team_size arguments are { (omitted) , 1 , " << num_worker_per_shepherd << " }" ;
+    Kokkos::Impl::throw_runtime_exception(msg.str());
+  }
+}
+
+TaskPolicy< Kokkos::Qthreads >::member_type &
+TaskPolicy< Kokkos::Qthreads >::member_single()
+{
+  static member_type s ;
+  return s ;
+}
+
+void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads > & policy )
+{
+  volatile int * const active_task_count = & policy.m_active_count ;
+  while ( *active_task_count ) qthread_yield();
+}
+
+} // namespace Experimental
+} // namespace Kokkos
+
+#else
+void KOKKOS_CORE_SRC_QTHREADS_KOKKOS_QTHREADS_TASKPOLICY_PREVENT_LINK_ERROR() {}
+#endif // #if defined( KOKKOS_ENABLE_TASKDAG )
+#endif // #if defined( KOKKOS_ENABLE_QTHREADS )
+
diff --git a/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.hpp.old b/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.hpp.old
new file mode 100644
index 0000000000000000000000000000000000000000..adb6859763d39fbded63fdf476a6b04f639241cf
--- /dev/null
+++ b/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.hpp.old
@@ -0,0 +1,666 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_QTHREADS_TASKSCHEDULER_HPP
+#define KOKKOS_QTHREADS_TASKSCHEDULER_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_TASKDAG )
+
+#include <string>
+#include <typeinfo>
+#include <stdexcept>
+
+//----------------------------------------------------------------------------
+// Defines to enable experimental Qthreads functionality
+
+#define QTHREAD_LOCAL_PRIORITY
+#define CLONED_TASKS
+
+#include <qthread.h>
+
+#undef QTHREAD_LOCAL_PRIORITY
+#undef CLONED_TASKS
+
+//----------------------------------------------------------------------------
+
+#include <Kokkos_Qthreads.hpp>
+#include <Kokkos_TaskScheduler.hpp>
+#include <Kokkos_View.hpp>
+
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template<>
+class TaskMember< Kokkos::Qthreads , void , void >
+{
+public:
+
+  typedef TaskMember * (* function_verify_type) ( TaskMember * );
+  typedef void         (* function_single_type) ( TaskMember * );
+  typedef void         (* function_team_type)   ( TaskMember * , Kokkos::Impl::QthreadsTeamPolicyMember & );
+  typedef void         (* function_dealloc_type)( TaskMember * );
+
+private:
+
+  const function_dealloc_type  m_dealloc ;       ///< Deallocation
+  const function_verify_type   m_verify ;        ///< Result type verification
+  const function_single_type   m_apply_single ;  ///< Apply function
+  const function_team_type     m_apply_team ;    ///< Apply function
+  int volatile * const         m_active_count ;  ///< Count of active tasks on this policy
+  aligned_t                    m_qfeb ;          ///< Qthreads full/empty bit
+  TaskMember ** const          m_dep ;           ///< Dependences
+  const int                    m_dep_capacity ;  ///< Capacity of dependences
+  int                          m_dep_size ;      ///< Actual count of dependences
+  int                          m_ref_count ;     ///< Reference count
+  int                          m_state ;         ///< State of the task
+
+  TaskMember() /* = delete */ ;
+  TaskMember( const TaskMember & ) /* = delete */ ;
+  TaskMember & operator = ( const TaskMember & ) /* = delete */ ;
+
+  static aligned_t qthread_func( void * arg );
+
+  static void * allocate( const unsigned arg_sizeof_derived , const unsigned arg_dependence_capacity );
+  static void   deallocate( void * );
+
+  void throw_error_add_dependence() const ;
+  static void throw_error_verify_type();
+
+  template < class DerivedTaskType >
+  static
+  void deallocate( TaskMember * t )
+    {
+      DerivedTaskType * ptr = static_cast< DerivedTaskType * >(t);
+      ptr->~DerivedTaskType();
+      deallocate( (void *) ptr );
+    }
+
+  void schedule();
+  void closeout();
+
+protected :
+
+  ~TaskMember();
+
+  // Used by TaskMember< Qthreads , ResultType , void >
+  TaskMember( const function_verify_type   arg_verify
+            , const function_dealloc_type  arg_dealloc
+            , const function_single_type   arg_apply_single
+            , const function_team_type     arg_apply_team
+            , volatile int &               arg_active_count
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            );
+
+  // Used for TaskMember< Qthreads , void , void >
+  TaskMember( const function_dealloc_type  arg_dealloc
+            , const function_single_type   arg_apply_single
+            , const function_team_type     arg_apply_team
+            , volatile int &               arg_active_count
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            );
+
+public:
+
+  template< typename ResultType >
+  KOKKOS_FUNCTION static
+  TaskMember * verify_type( TaskMember * t )
+    {
+      enum { check_type = ! std::is_same< ResultType , void >::value };
+
+      if ( check_type && t != 0 ) {
+
+        // Verify that t->m_verify is this function
+        const function_verify_type self = & TaskMember::template verify_type< ResultType > ;
+
+        if ( t->m_verify != self ) {
+          t = 0 ;
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+          throw_error_verify_type();
+#endif
+        }
+      }
+      return t ;
+    }
+
+  //----------------------------------------
+  /*  Inheritance Requirements on task types:
+   *    typedef  FunctorType::value_type  value_type ;
+   *    class DerivedTaskType
+   *      : public TaskMember< Qthreads , value_type , FunctorType >
+   *      { ... };
+   *    class TaskMember< Qthreads , value_type , FunctorType >
+   *      : public TaskMember< Qthreads , value_type , void >
+   *      , public Functor
+   *      { ... };
+   *  If value_type != void
+   *    class TaskMember< Qthreads , value_type , void >
+   *      : public TaskMember< Qthreads , void , void >
+   *
+   *  Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
+   *
+   */
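+  /*  Illustrative layout of one allocation (a sketch; exact alignment/padding
+   *  and where the dependence array lives are assumptions, not guarantees):
+   *
+   *    [ DerivedTaskType object ][ TaskMember* dep[ dependence_capacity ] ]
+   *    ^ pointer returned by allocate()
+   */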
+
+  /** \brief  Allocate and construct a single-thread task */
+  template< class DerivedTaskType >
+  static
+  TaskMember * create_single( const typename DerivedTaskType::functor_type &  arg_functor
+                            , volatile int &                                  arg_active_count
+                            , const unsigned                                  arg_dependence_capacity )
+    {
+      typedef typename DerivedTaskType::functor_type  functor_type ;
+      typedef typename functor_type::value_type       value_type ;
+
+      DerivedTaskType * const task =
+        new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
+          DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
+                         , & TaskMember::template apply_single< functor_type , value_type >
+                         , 0
+                         , arg_active_count
+                         , sizeof(DerivedTaskType)
+                         , arg_dependence_capacity
+                         , arg_functor );
+
+      return static_cast< TaskMember * >( task );
+    }
+
+  /** \brief  Allocate and construct a team-thread task */
+  template< class DerivedTaskType >
+  static
+  TaskMember * create_team( const typename DerivedTaskType::functor_type &  arg_functor
+                          , volatile int &                                  arg_active_count
+                          , const unsigned                                  arg_dependence_capacity
+                          , const bool                                      arg_is_team )
+    {
+      typedef typename DerivedTaskType::functor_type  functor_type ;
+      typedef typename functor_type::value_type       value_type ;
+
+      const function_single_type flag = reinterpret_cast<function_single_type>( arg_is_team ? 0 : 1 );
+
+      DerivedTaskType * const task =
+        new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
+          DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
+                         , flag
+                         , & TaskMember::template apply_team< functor_type , value_type >
+                         , arg_active_count
+                         , sizeof(DerivedTaskType)
+                         , arg_dependence_capacity
+                         , arg_functor );
+
+      return static_cast< TaskMember * >( task );
+    }
+
+  void respawn();
+  void spawn()
+    {
+       m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
+       schedule();
+    }
+
+  //----------------------------------------
+
+  typedef FutureValueTypeIsVoidError get_result_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const { return get_result_type() ; }
+
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); }
+
+  //----------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  static
+  void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false );
+#else
+  KOKKOS_INLINE_FUNCTION static
+  void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false ) {}
+#endif
+
+  KOKKOS_INLINE_FUNCTION
+  TaskMember * get_dependence( int i ) const
+    { return ( Kokkos::Experimental::TASK_STATE_EXECUTING == m_state && 0 <= i && i < m_dep_size ) ? m_dep[i] : (TaskMember*) 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int get_dependence() const
+    { return m_dep_size ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void clear_dependence()
+    {
+      for ( int i = 0 ; i < m_dep_size ; ++i ) assign( m_dep + i , 0 );
+      m_dep_size = 0 ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( TaskMember * before )
+    {
+      if ( ( Kokkos::Experimental::TASK_STATE_CONSTRUCTING == m_state ||
+             Kokkos::Experimental::TASK_STATE_EXECUTING    == m_state ) &&
+           m_dep_size < m_dep_capacity ) {
+        assign( m_dep + m_dep_size , before );
+        ++m_dep_size ;
+      }
+      else {
+        throw_error_add_dependence();
+      }
+    }
+
+  //----------------------------------------
+
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_single( typename std::enable_if< ! std::is_same< ResultType , void >::value , TaskMember * >::type t )
+    {
+      typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
+
+      // TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
+      //   : public TaskMember< Kokkos::Qthreads , ResultType , void >
+      //   , public FunctorType
+      //   { ... };
+
+      derived_type & m = * static_cast< derived_type * >( t );
+
+      Kokkos::Impl::FunctorApply< FunctorType , void , ResultType & >::apply( (FunctorType &) m , & m.m_result );
+    }
+
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_single( typename std::enable_if< std::is_same< ResultType , void >::value , TaskMember * >::type t )
+    {
+      typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
+
+      // TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
+      //   : public TaskMember< Kokkos::Qthreads , ResultType , void >
+      //   , public FunctorType
+      //   { ... };
+
+      derived_type & m = * static_cast< derived_type * >( t );
+
+      Kokkos::Impl::FunctorApply< FunctorType , void , void >::apply( (FunctorType &) m );
+    }
+
+  //----------------------------------------
+
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_team( typename std::enable_if< ! std::is_same< ResultType , void >::value , TaskMember * >::type t
+                 , Kokkos::Impl::QthreadsTeamPolicyMember & member )
+    {
+      typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
+
+      derived_type & m = * static_cast< derived_type * >( t );
+
+      m.FunctorType::apply( member , m.m_result );
+    }
+
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_team( typename std::enable_if< std::is_same< ResultType , void >::value , TaskMember * >::type t
+                 , Kokkos::Impl::QthreadsTeamPolicyMember & member )
+    {
+      typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
+
+      derived_type & m = * static_cast< derived_type * >( t );
+
+      m.FunctorType::apply( member );
+    }
+};
+
+//----------------------------------------------------------------------------
+/** \brief  Base class for tasks with a result value in the Qthreads execution space.
+ *
+ *  The FunctorType must be void because this class is accessed by the
+ *  Future class for the task and result value.
+ *
+ *  Must be derived from TaskMember<S,void,void> 'root class' so the Future class
+ *  can correctly static_cast from the 'root class' to this class.
+ */
+template < class ResultType >
+class TaskMember< Kokkos::Qthreads , ResultType , void >
+  : public TaskMember< Kokkos::Qthreads , void , void >
+{
+public:
+
+  ResultType  m_result ;
+
+  typedef const ResultType & get_result_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const { return m_result ; }
+
+protected:
+
+  typedef TaskMember< Kokkos::Qthreads , void , void >  task_root_type ;
+  typedef task_root_type::function_dealloc_type        function_dealloc_type ;
+  typedef task_root_type::function_single_type         function_single_type ;
+  typedef task_root_type::function_team_type           function_team_type ;
+
+  inline
+  TaskMember( const function_dealloc_type  arg_dealloc
+            , const function_single_type   arg_apply_single
+            , const function_team_type     arg_apply_team
+            , volatile int &               arg_active_count
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            )
+    : task_root_type( & task_root_type::template verify_type< ResultType >
+                    , arg_dealloc
+                    , arg_apply_single
+                    , arg_apply_team
+                    , arg_active_count
+                    , arg_sizeof_derived
+                    , arg_dependence_capacity )
+    , m_result()
+    {}
+};
+
+template< class ResultType , class FunctorType >
+class TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
+  : public TaskMember< Kokkos::Qthreads , ResultType , void >
+  , public FunctorType
+{
+public:
+
+  typedef FunctorType  functor_type ;
+
+  typedef TaskMember< Kokkos::Qthreads , void , void >        task_root_type ;
+  typedef TaskMember< Kokkos::Qthreads , ResultType , void >  task_base_type ;
+  typedef task_root_type::function_dealloc_type              function_dealloc_type ;
+  typedef task_root_type::function_single_type               function_single_type ;
+  typedef task_root_type::function_team_type                 function_team_type ;
+
+  inline
+  TaskMember( const function_dealloc_type  arg_dealloc
+            , const function_single_type   arg_apply_single
+            , const function_team_type     arg_apply_team
+            , volatile int &               arg_active_count
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            , const functor_type &         arg_functor
+            )
+    : task_base_type( arg_dealloc
+                    , arg_apply_single
+                    , arg_apply_team
+                    , arg_active_count
+                    , arg_sizeof_derived
+                    , arg_dependence_capacity )
+    , functor_type( arg_functor )
+    {}
+};
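+// The hierarchy sketched in the 'Inheritance Requirements' comment above is
+// now complete:
+//   TaskMember< Qthreads , void , void >                        root: scheduling state
+//     <- TaskMember< Qthreads , ResultType , void >             adds m_result storage
+//       <- TaskMember< Qthreads , ResultType , FunctorType >    adds the user functor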
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+void wait( TaskPolicy< Kokkos::Qthreads > & );
+
+template<>
+class TaskPolicy< Kokkos::Qthreads >
+{
+public:
+
+  typedef Kokkos::Qthreads                        execution_space ;
+  typedef TaskPolicy                             execution_policy ;
+  typedef Kokkos::Impl::QthreadsTeamPolicyMember  member_type ;
+
+private:
+
+  typedef Impl::TaskMember< execution_space , void , void > task_root_type ;
+
+  template< class FunctorType >
+  static inline
+  const task_root_type * get_task_root( const FunctorType * f )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
+      return static_cast< const task_root_type * >( static_cast< const task_type * >(f) );
+    }
+
+  template< class FunctorType >
+  static inline
+  task_root_type * get_task_root( FunctorType * f )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
+      return static_cast< task_root_type * >( static_cast< task_type * >(f) );
+    }
+
+  unsigned        m_default_dependence_capacity ;
+  unsigned        m_team_size ;
+  volatile int    m_active_count_root ;
+  volatile int &  m_active_count ;
+
+public:
+
+  TaskPolicy
+    ( const unsigned arg_task_max_count
+    , const unsigned arg_task_max_size
+    , const unsigned arg_task_default_dependence_capacity = 4
+    , const unsigned arg_task_team_size = 0 /* choose default */
+    );
+
+  KOKKOS_FUNCTION TaskPolicy() = default ;
+  KOKKOS_FUNCTION TaskPolicy( TaskPolicy && rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy( const TaskPolicy & rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  int allocated_task_count() const { return m_active_count ; }
+
+  template< class ValueType >
+  const Future< ValueType , execution_space > &
+    spawn( const Future< ValueType , execution_space > & f
+         , const bool priority = false ) const
+      {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        f.m_task->spawn();
+#endif
+        return f ;
+      }
+
+  // Create single-thread task
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  task_create( const FunctorType & functor
+             , const unsigned dependence_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+      return Future< value_type , execution_space >(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        task_root_type::create_single< task_type >
+          ( functor
+          , m_active_count
+          , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity )
+          )
+#endif
+        );
+    }
+
+  template< class FunctorType >
+  Future< typename FunctorType::value_type , execution_space >
+  proc_create( const FunctorType & functor
+             , const unsigned dependence_capacity = ~0u ) const
+    { return task_create( functor , dependence_capacity ); }
+
+  // Create thread-team task
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  task_create_team( const FunctorType & functor
+                  , const unsigned dependence_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type  value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+
+      return Future< value_type , execution_space >(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        task_root_type::create_team< task_type >
+          ( functor
+          , m_active_count
+          , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity )
+          , 1 < m_team_size
+          )
+#endif
+        );
+    }
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  proc_create_team( const FunctorType & functor
+                  , const unsigned dependence_capacity = ~0u ) const
+    { return task_create_team( functor , dependence_capacity ); }
+
+  // Add dependence
+  template< class A1 , class A2 , class A3 , class A4 >
+  void add_dependence( const Future<A1,A2> & after
+                     , const Future<A3,A4> & before
+                     , typename std::enable_if
+                        < std::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
+                          &&
+                          std::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      )
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      after.m_task->add_dependence( before.m_task );
+#endif
+    }
+
+  //----------------------------------------
+  // Functions for an executing task functor to query dependences,
+  // set new dependences, and respawn itself.
+
+  template< class FunctorType >
+  Future< void , execution_space >
+  get_dependence( const FunctorType * task_functor , int i ) const
+    {
+      return Future<void,execution_space>(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        get_task_root(task_functor)->get_dependence(i)
+#endif
+        );
+    }
+
+  template< class FunctorType >
+  int get_dependence( const FunctorType * task_functor ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return get_task_root(task_functor)->get_dependence(); }
+#else
+    { return 0 ; }
+#endif
+
+  template< class FunctorType >
+  void clear_dependence( FunctorType * task_functor ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      get_task_root(task_functor)->clear_dependence();
+#endif
+    }
+
+  template< class FunctorType , class A3 , class A4 >
+  void add_dependence( FunctorType * task_functor
+                     , const Future<A3,A4> & before
+                     , typename std::enable_if
+                        < std::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      )
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      get_task_root(task_functor)->add_dependence( before.m_task );
+#endif
+    }
+
+  template< class FunctorType >
+  void respawn( FunctorType * task_functor
+              , const bool priority = false ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      get_task_root(task_functor)->respawn();
+#endif
+    }
+
+  template< class FunctorType >
+  void respawn_needing_memory( FunctorType * task_functor ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      get_task_root(task_functor)->respawn();
+#endif
+    }
+
+  static member_type & member_single();
+
+  friend void wait( TaskPolicy< Kokkos::Qthreads > & );
+};
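+
+// A minimal usage sketch of this policy.  The functor interface shown is an
+// assumption based on the apply_single adapter above, not a verified example:
+//
+//   struct MyTask {
+//     typedef double value_type ;
+//     void apply( value_type & result ) { result = 1.0 ; }
+//   };
+//
+//   TaskPolicy< Kokkos::Qthreads > policy( /* task_max_count = */ 1024
+//                                        , /* task_max_size  = */ 256 );
+//   Future< double , Kokkos::Qthreads > f =
+//     policy.spawn( policy.task_create( MyTask() ) );
+//   wait( policy );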
+
+} /* namespace Experimental */
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
+#endif /* #ifndef KOKKOS_QTHREADS_TASKSCHEDULER_HPP */
+
diff --git a/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue.hpp b/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..96d88af79f3f9404ca16d50f53feeb980cf1e978
--- /dev/null
+++ b/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue.hpp
@@ -0,0 +1,325 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREADS_TASKQUEUE_HPP
+#define KOKKOS_QTHREADS_TASKQUEUE_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Manage task allocation, deallocation, and scheduling.
+ *
+ *  Task execution is handled here directly for the Qthreads implementation.
+ */
+template<>
+class TaskQueue< Kokkos::Qthreads > {
+private:
+
+  using execution_space = Kokkos::Qthreads ;
+  using memory_space    = Kokkos::HostSpace ;
+  using device_type     = Kokkos::Device< execution_space, memory_space > ;
+  using memory_pool     = Kokkos::MemoryPool< device_type > ;
+  using task_root_type  = Kokkos::Impl::TaskBase< execution_space, void, void > ;
+
+  friend class Kokkos::TaskScheduler< execution_space > ;
+
+  struct Destroy {
+    TaskQueue * m_queue ;
+    void destroy_shared_allocation();
+  };
+
+  //----------------------------------------
+
+  enum : int { TASK_STATE_NULL         =  0,  ///<  Does not exist
+               TASK_STATE_CONSTRUCTING =  1,  ///<  Is under construction
+               TASK_STATE_WAITING      =  2,  ///<  Is waiting for execution
+               TASK_STATE_EXECUTING    =  4,  ///<  Is executing
+               TASK_STATE_RESPAWN      =  8,  ///<  Requested respawn
+               TASK_STATE_COMPLETE     = 16   ///<  Execution is complete
+             };
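+  // State transitions, as a sketch inferred from schedule(), reschedule(),
+  // complete(), and TaskBase::qthread_func(); this is experimental code and
+  // not every transition is exercised yet:
+  //   CONSTRUCTING -> WAITING    on schedule()
+  //   WAITING      -> EXECUTING  in qthread_func() via compare-exchange
+  //   EXECUTING    -> RESPAWN    when a task requests respawn
+  //   EXECUTING    -> COMPLETE   when a task finishes without respawning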
+
+  // Queue is organized as [ priority ][ type ]
+
+  memory_pool  m_memory ;
+  unsigned     m_team_size ;   // Number of threads in a team
+  long         m_accum_alloc ; // Accumulated number of allocations
+  int          m_count_alloc ; // Current number of allocations
+  int          m_max_alloc ;   // Maximum number of allocations
+  int          m_ready_count ; // Number of ready or executing
+
+  //----------------------------------------
+
+  ~TaskQueue();
+  TaskQueue() = delete ;
+  TaskQueue( TaskQueue && ) = delete ;
+  TaskQueue( TaskQueue const & ) = delete ;
+  TaskQueue & operator = ( TaskQueue && ) = delete ;
+  TaskQueue & operator = ( TaskQueue const & ) = delete ;
+
+  TaskQueue
+    ( const memory_space & arg_space,
+      unsigned const arg_memory_pool_capacity,
+      unsigned const arg_memory_pool_superblock_capacity_log2
+    );
+
+  // Schedule a task
+  //   Precondition:
+  //     task is not executing
+  //     task->m_next is the dependence or zero
+  //   Postcondition:
+  //     task->m_next is linked list membership
+  KOKKOS_FUNCTION
+  void schedule( task_root_type * const );
+
+  // Reschedule a task
+  //   Precondition:
+  //     task is in Executing state
+  //     task->m_next == LockTag
+  //   Postcondition:
+  //     task is in Executing-Respawn state
+  //     task->m_next == 0 (no dependence)
+  KOKKOS_FUNCTION
+  void reschedule( task_root_type * );
+
+  // Complete a task
+  //   Precondition:
+  //     task is not executing
+  //     task->m_next == LockTag  =>  task is complete
+  //     task->m_next != LockTag  =>  task is respawn
+  //   Postcondition:
+  //     task->m_wait == LockTag  =>  task is complete
+  //     task->m_wait != LockTag  =>  task is waiting
+  KOKKOS_FUNCTION
+  void complete( task_root_type * );
+
+public:
+
+  // If and only if the execution space is a single thread
+  // then execute ready tasks.
+  KOKKOS_INLINE_FUNCTION
+  void iff_single_thread_recursive_execute()
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      specialization::iff_single_thread_recursive_execute( this );
+#endif
+    }
+
+  void execute() { specialization::execute( this ); }
+
+  template< typename FunctorType >
+  void proc_set_apply( typename task_root_type::function_type * ptr )
+    {
+      specialization::template proc_set_apply< FunctorType >( ptr );
+    }
+
+  // Assign task pointer with reference counting of assigned tasks
+  template< typename LV, typename RV >
+  KOKKOS_FUNCTION static
+  void assign( TaskBase< execution_space, LV, void > ** const lhs,
+               TaskBase< execution_space, RV, void > *  const rhs )
+    {
+      using task_lhs = TaskBase< execution_space, LV, void > ;
+#if 0
+  {
+    printf( "assign( 0x%lx { 0x%lx %d %d }, 0x%lx { 0x%lx %d %d } )\n",
+            uintptr_t( lhs ? *lhs : 0 ),
+            uintptr_t( lhs && *lhs ? (*lhs)->m_next : 0 ),
+            int( lhs && *lhs ? (*lhs)->m_task_type : 0 ),
+            int( lhs && *lhs ? (*lhs)->m_ref_count : 0 ),
+            uintptr_t(rhs),
+            uintptr_t( rhs ? rhs->m_next : 0 ),
+            int( rhs ? rhs->m_task_type : 0 ),
+            int( rhs ? rhs->m_ref_count : 0 )
+          );
+    fflush( stdout );
+  }
+#endif
+
+      if ( *lhs )
+      {
+        const int count = Kokkos::atomic_fetch_add( &((*lhs)->m_ref_count), -1 );
+
+        if ( ( 1 == count ) && ( (*lhs)->m_state == TASK_STATE_COMPLETE ) ) {
+          // Reference count is zero and task is complete, deallocate.
+          (*lhs)->m_queue->deallocate( *lhs, (*lhs)->m_alloc_size );
+        }
+        else if ( count <= 1 ) {
+          Kokkos::abort("TaskScheduler task has negative reference count or is incomplete" );
+        }
+
+        // GEM: Should I check that there are no dependences here?  Can the state
+        //      be set to complete while there are still dependences?
+      }
+
+      if ( rhs ) { Kokkos::atomic_fetch_add( &(rhs->m_ref_count), 1 ); }
+
+      // Force write of *lhs
+
+      *static_cast< task_lhs * volatile * >(lhs) = rhs ;
+
+      Kokkos::memory_fence();
+    }
+
+  KOKKOS_FUNCTION
+  size_t allocate_block_size( size_t n ); ///< Actual block size allocated
+
+  KOKKOS_FUNCTION
+  void * allocate( size_t n ); ///< Allocate from the memory pool
+
+  KOKKOS_FUNCTION
+  void deallocate( void * p, size_t n ); ///< Deallocate to the memory pool
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+class TaskBase< Kokkos::Qthreads, void, void >
+{
+public:
+
+  enum : int16_t   { TaskTeam   = TaskBase< void, void, void >::TaskTeam,
+                     TaskSingle = TaskBase< void, void, void >::TaskSingle,
+                     Aggregate  = TaskBase< void, void, void >::Aggregate };
+
+  enum : uintptr_t { LockTag = TaskBase< void, void, void >::LockTag,
+                     EndTag  = TaskBase< void, void, void >::EndTag };
+
+  using execution_space = Kokkos::Qthreads ;
+  using queue_type      = TaskQueue< execution_space > ;
+
+  template< typename > friend class Kokkos::TaskScheduler ;
+
+  typedef void (* function_type) ( TaskBase *, void * );
+
+  // sizeof(TaskBase) == 48
+
+  function_type  m_apply ;       ///< Apply function pointer
+  queue_type   * m_queue ;       ///< Queue in which this task resides
+  TaskBase     * m_dep ;         ///< Dependence
+  int32_t        m_ref_count ;   ///< Reference count
+  int32_t        m_alloc_size ;  ///< Allocation size
+  int32_t        m_dep_count ;   ///< Aggregate's number of dependences
+  int16_t        m_task_type ;   ///< Type of task
+  int16_t        m_priority ;    ///< Priority of runnable task
+  aligned_t      m_qfeb ;        ///< Qthread full/empty bit
+  int            m_state ;       ///< State of the task
+
+  TaskBase( TaskBase && ) = delete ;
+  TaskBase( const TaskBase & ) = delete ;
+  TaskBase & operator = ( TaskBase && ) = delete ;
+  TaskBase & operator = ( const TaskBase & ) = delete ;
+
+  KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskBase() noexcept
+    : m_apply(0),
+      m_queue(0),
+      m_dep(0),
+      m_ref_count(0),
+      m_alloc_size(0),
+      m_dep_count(0),
+      m_task_type( TaskSingle ),
+      m_priority( 1 /* TaskRegularPriority */ ),
+      m_qfeb(0),
+      m_state( queue_type::TASK_STATE_CONSTRUCTING )
+    {
+      qthread_empty( & m_qfeb ); // Set to full when complete
+    }
+
+  //----------------------------------------
+
+  static aligned_t qthread_func( void * arg );
+
+  KOKKOS_INLINE_FUNCTION
+  TaskBase ** aggregate_dependences()
+    { return reinterpret_cast<TaskBase**>( this + 1 ); }
+
+  KOKKOS_INLINE_FUNCTION
+  bool requested_respawn()
+    { return m_state == queue_type::TASK_STATE_RESPAWN; }
+
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( TaskBase* dep )
+    {
+      // Assign dependence to m_dep.  It will be processed in the subsequent
+      // call to schedule.  Error if the dependence is reset.
+      if ( 0 != Kokkos::atomic_exchange( & m_dep, dep ) ) {
+        Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
+      }
+
+      if ( 0 != dep ) {
+        // The future may be destroyed upon returning from this call
+        // so increment reference count to track this assignment.
+        Kokkos::atomic_fetch_add( &(dep->m_ref_count), 1 );
+      }
+    }
+
+  using get_return_type = void ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_return_type get() const {}
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif // KOKKOS_QTHREADS_TASKQUEUE_HPP
+
diff --git a/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue_impl.hpp b/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..152546fadcf3d97863d1f508a72df1ee8212662f
--- /dev/null
+++ b/packages/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue_impl.hpp
@@ -0,0 +1,441 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREADS_TASKQUEUE_IMPL_HPP
+#define KOKKOS_QTHREADS_TASKQUEUE_IMPL_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+void TaskQueue< ExecSpace >::Destroy::destroy_shared_allocation()
+{
+  m_queue->~TaskQueue();
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+TaskQueue< ExecSpace >::TaskQueue
+  ( const TaskQueue< ExecSpace >::memory_space & arg_space,
+    unsigned const arg_memory_pool_capacity,
+    unsigned const arg_memory_pool_superblock_capacity_log2 )
+  : m_memory( arg_space,
+              arg_memory_pool_capacity,
+              arg_memory_pool_superblock_capacity_log2 ),
+    m_team_size( unsigned( qthread_num_workers_local(NO_SHEPHERD) ) ),
+    m_accum_alloc(0),
+    m_count_alloc(0),
+    m_max_alloc(0),
+    m_ready_count(0)
+{}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+TaskQueue< ExecSpace >::~TaskQueue()
+{
+  // Verify that ready count is zero.
+  if ( 0 != m_ready_count ) {
+    Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready or executing tasks");
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+size_t TaskQueue< ExecSpace >::allocate_block_size( size_t n )
+{
+  return m_memory.allocate_block_size( n );
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void * TaskQueue< ExecSpace >::allocate( size_t n )
+{
+  void * const p = m_memory.allocate(n);
+
+  if ( p ) {
+    Kokkos::atomic_increment( & m_accum_alloc );
+    Kokkos::atomic_increment( & m_count_alloc );
+
+    if ( m_max_alloc < m_count_alloc ) m_max_alloc = m_count_alloc ;
+  }
+
+  return p ;
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::deallocate( void * p, size_t n )
+{
+  m_memory.deallocate( p, n );
+  Kokkos::atomic_decrement( & m_count_alloc );
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::schedule
+  ( TaskQueue< ExecSpace >::task_root_type * const task )
+{
+#if 0
+  printf( "schedule( 0x%lx { %d %d %d }\n",
+          uintptr_t(task),
+          task->m_task_type,
+          task->m_priority,
+          task->m_ref_count );
+#endif
+
+  // The task has been constructed and is waiting to be executed.
+  task->m_state = TASK_STATE_WAITING ;
+
+  if ( task->m_task_type != task_root_type::Aggregate ) {
+    // Scheduling a single or team task.
+
+    // Increment active task count before spawning.
+    Kokkos::atomic_increment( & m_ready_count );
+
+    if ( task->m_dep == 0 ) {
+      // Schedule a task with no dependences.
+
+      if ( task_root_type::TaskTeam == task->m_task_type && m_team_size > 1 ) {
+        // If there is more than one shepherd, spawn on a shepherd other than this one.
+        const int num_shepherd  = qthread_num_shepherds();
+        const int this_shepherd = qthread_shep();
+        int spawn_shepherd      = ( this_shepherd + 1 ) % num_shepherd ;
+
+#if 0
+        fprintf( stdout,
+                 "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n",
+                 qthread_shep(),
+                 qthread_worker_local(NULL),
+                 reinterpret_cast<unsigned long>(this),
+                 spawn_shepherd,
+                 m_team_size - 1
+               );
+        fflush(stdout);
+#endif
+
+        qthread_spawn_cloneable(
+          & task_root_type::qthread_func,
+          task,
+          0,
+          NULL,
+          0, // no dependences
+          0, // dependences array
+          spawn_shepherd,
+          unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY ),
+          m_team_size - 1
+        );
+      }
+      else {
+        qthread_spawn(
+          & task_root_type::qthread_func,
+          task,
+          0,
+          NULL,
+          0, // no dependences
+          0, // dependences array
+          NO_SHEPHERD,
+          QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
+        );
+      }
+    }
+    else if ( task->m_dep->m_task_type != task_root_type::Aggregate ) {
+    // Malloc the precondition array to pass to qthread_spawn().  For
+    // non-aggregate tasks, it is a single pointer since there are no
+    // dependences.  Qthreads will eventually free this allocation so memory will
+    // not be leaked. Is malloc thread-safe?  Should this call be guarded?  The
+    // memory can't be allocated from the pool allocator because Qthreads frees
+    // it using free().
+    aligned_t ** qprecon = (aligned_t **) malloc( sizeof(aligned_t *) );
+
+    *qprecon = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );
+
+    if ( task->m_task_type == task_root_type::TaskTeam && m_team_size > 1) {
+      // If there is more than one shepherd, spawn on a shepherd other than this one.
+      const int num_shepherd  = qthread_num_shepherds();
+      const int this_shepherd = qthread_shep();
+      int spawn_shepherd      = ( this_shepherd + 1 ) % num_shepherd ;
+
+#if 0
+  fprintf( stdout,
+           "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n",
+           qthread_shep(),
+           qthread_worker_local(NULL),
+           reinterpret_cast<unsigned long>(this),
+           spawn_shepherd,
+           m_team_size - 1
+         );
+  fflush(stdout);
+#endif
+
+      qthread_spawn_cloneable(
+        & task_root_type::qthread_func,
+        task,
+        0,
+        NULL,
+        m_dep_size,
+        qprecon, /* dependences */
+        spawn_shepherd,
+        unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY ),
+        m_team_size - 1
+      );
+    }
+    else {
+      qthread_spawn(
+        & task_root_type::qthread_func, /* function */
+        task,                           /* function argument */
+        0,
+        NULL,
+        m_dep_size,
+        qprecon, /* dependences */
+        NO_SHEPHERD,
+        QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
+      );
+    }
+    }
+  }
+  else {
+    // GEM: How do I handle an aggregate (when_all) task?
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::reschedule( task_root_type * task )
+{
+  // Precondition:
+  //   task is in Executing state
+  //   task->m_next == LockTag
+  //
+  // Postcondition:
+  //   task is in Executing-Respawn state
+  //   task->m_next == 0 (no dependence)
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+
+  if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) {
+    Kokkos::abort("TaskScheduler::respawn ERROR: already respawned");
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::complete
+  ( TaskQueue< ExecSpace >::task_root_type * task )
+{
+  // Complete a runnable task that has finished executing
+  // or a when_all task when all of its dependences are complete.
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
+
+#if 0
+  printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n",
+          uintptr_t(task),
+          uintptr_t(task->m_wait),
+          uintptr_t(task->m_next),
+          task->m_task_type,
+          task->m_priority,
+          task->m_ref_count
+        );
+  fflush( stdout );
+#endif
+
+  const bool runnable = task_root_type::Aggregate != task->m_task_type ;
+
+  //----------------------------------------
+
+  if ( runnable && lock != task->m_next ) {
+    // A runnable task has finished executing and has requested a respawn.
+    // Schedule the task for subsequent execution.
+
+    schedule( task );
+  }
+  //----------------------------------------
+  else {
+    // This is either an aggregate or a runnable task that executed
+    // and did not respawn.  Transition this task to complete.
+
+    // If 'task' is an aggregate then any of the runnable tasks that
+    // it depends upon may be attempting to complete this 'task'.
+    // Must only transition a task once to complete status.
+    // This is controlled by atomically locking the wait queue.
+
+    // Stop other tasks from adding themselves to this task's wait queue
+    // by locking the head of this task's wait queue.
+
+    task_root_type * x = Kokkos::atomic_exchange( & task->m_wait, lock );
+
+    if ( x != (task_root_type *) lock ) {
+
+      // This thread has transitioned this 'task' to complete.
+      // 'task' is no longer in a queue and is not executing
+      // so decrement the reference count from 'task's creation.
+      // If no other references to this 'task' then it will be deleted.
+
+      TaskQueue::assign( & task, zero );
+
+      // This thread has exclusive access to the wait list so
+      // the concurrency-safe pop_task function is not needed.
+      // Schedule the tasks that have been waiting on the input 'task',
+      // which may have been deleted.
+
+      while ( x != end ) {
+
+        // Set x->m_next = zero  <=  no dependence
+
+        task_root_type * const next =
+          (task_root_type *) Kokkos::atomic_exchange( & x->m_next, zero );
+
+        schedule( x );
+
+        x = next ;
+      }
+    }
+  }
+
+  if ( runnable ) {
+    // A runnable task was popped from a ready queue and executed.
+    // If respawned into a ready queue then the ready count was incremented
+    // so decrement whether respawned or not.
+    Kokkos::atomic_decrement( & m_ready_count );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template<>
+aligned_t
+TaskBase< Kokkos::Qthreads, void, void >::qthread_func( void * arg )
+{
+  using execution_space = Kokkos::Qthreads ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using Member          = Kokkos::Impl::QthreadsTeamPolicyMember;
+
+  task_root_type * const task = reinterpret_cast< task_root_type * >( arg );
+
+  // First member of the team changes state to executing.
+  // Use compare-exchange to avoid race condition with a respawn.
+  Kokkos::atomic_compare_exchange_strong( & task->m_state,
+                                          queue_type::TASK_STATE_WAITING,
+                                          queue_type::TASK_STATE_EXECUTING
+                                        );
+
+  if ( task_root_type::TaskTeam == task->m_task_type )
+  {
+    if ( 1 < task->m_queue->m_team_size ) {
+      // Team task with team size of more than 1.
+      Member::TaskTeam task_team_tag ;
+
+      // Initialize team size and rank with shepherd info.
+      Member member( task_team_tag );
+
+      (*task->m_apply)( task , & member );
+
+#if 0
+      fprintf( stdout,
+              "worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n",
+              qthread_shep(),
+              qthread_worker_local(NULL),
+              reinterpret_cast<unsigned long>(task),
+              member.team_rank(),
+              member.team_size()
+            );
+      fflush(stdout);
+#endif
+
+      member.team_barrier();
+      if ( member.team_rank() == 0 ) task->closeout();
+      member.team_barrier();
+    }
+    else {
+      // Team task with team size of 1.
+      Member member ;
+      (*task->m_apply)( task , & member );
+      task->closeout();
+    }
+  }
+  else {
+    (*task->m_apply)( task );
+    task->closeout();
+  }
+
+#if 0
+fprintf( stdout
+       , "worker(%d.%d) task 0x%.12lx return\n"
+       , qthread_shep()
+       , qthread_worker_local(NULL)
+       , reinterpret_cast<unsigned long>(task)
+       );
+fflush(stdout);
+#endif
+
+  return 0 ;
+}
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+
+#endif /* #if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif // KOKKOS_QTHREADS_TASKQUEUE_IMPL_HPP
+
diff --git a/packages/kokkos/core/src/Qthreads/README b/packages/kokkos/core/src/Qthreads/README
new file mode 100644
index 0000000000000000000000000000000000000000..e35b1f698ec7ca3e3ee020eeee4445de43023c78
--- /dev/null
+++ b/packages/kokkos/core/src/Qthreads/README
@@ -0,0 +1,24 @@
+
+# This Qthreads back-end uses an experimental branch of the Qthreads repository with special #define options.
+
+# Cloning repository and branch:
+
+git clone git@github.com:Qthreads/qthreads.git qthreads
+
+cd qthreads
+
+# checkout branch with "cloned tasks"
+
+git checkout dev-kokkos
+
+# Configure/autogen
+
+sh autogen.sh
+
+# configure with 'hwloc' installation:
+
+./configure CFLAGS="-DCLONED_TASKS -DQTHREAD_LOCAL_PRIORITY" --with-hwloc=${HWLOCDIR} --prefix=${INSTALLDIR}
+
+# install
+
+make install
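+
+# Point the Kokkos build at the resulting installation.  (The exact flag name
+# below is an assumption; consult generate_makefile.bash --help.)
+
+# ../generate_makefile.bash --with-qthreads=${INSTALLDIR} ...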
diff --git a/packages/kokkos/core/src/ROCm/KokkosExp_ROCm_IterateTile_Refactor.hpp b/packages/kokkos/core/src/ROCm/KokkosExp_ROCm_IterateTile_Refactor.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0d66d016cb62eacba74cc409b91201ba32b82783
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/KokkosExp_ROCm_IterateTile_Refactor.hpp
@@ -0,0 +1,2750 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_ROCM_EXP_ITERATE_TILE_REFACTOR_HPP
+#define KOKKOS_ROCM_EXP_ITERATE_TILE_REFACTOR_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( __HCC__ ) && defined( KOKKOS_ENABLE_ROCM )
+
+#include <iostream>
+#include <algorithm>
+#include <cstdio>
+
+#include <utility>
+
+// #include<ROCm/Kokkos_ROCmExec.hpp>
+// Including the file above leads to following type of errors:
+// /home/ndellin/kokkos/core/src/ROCm/Kokkos_ROCmExec.hpp(84): error: incomplete type is not allowed
+// use existing Kokkos functionality, e.g. max blocks, once resolved
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <typeinfo>
+#endif
+
+
+#define threadIdx_x (hc_get_workitem_id(0))
+#define threadIdx_y (hc_get_workitem_id(1))
+#define threadIdx_z (hc_get_workitem_id(2))
+
+#define blockIdx_x  (hc_get_group_id(0))
+#define blockIdx_y  (hc_get_group_id(1))
+#define blockIdx_z  (hc_get_group_id(2))
+
+#define blockDim_x  (hc_get_group_size(0))
+#define blockDim_y  (hc_get_group_size(1))
+#define blockDim_z  (hc_get_group_size(2))
+
+#define gridDim_x   (hc_get_num_groups(0))
+#define gridDim_y   (hc_get_num_groups(1))
+#define gridDim_z   (hc_get_num_groups(2))
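+// The macros above map the CUDA-style built-in index names used throughout
+// this file onto the corresponding HC work-item/group intrinsics, so the tile
+// iteration code below reads like its CUDA counterpart.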
+
+
+namespace Kokkos { namespace Impl {
+
+namespace Refactor {
+
+// ------------------------------------------------------------------ //
+// ParallelFor iteration pattern
+template< int N , typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile;
+
+//Rank 2
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<2,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  [[hc]]
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id1 = (index_type)blockIdx_y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim_y ) {
+        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx_y + (index_type)m_rp.m_lower[1];
+        if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx_y < m_rp.m_tile[1] ) {
+
+          for ( index_type tile_id0 = (index_type)blockIdx_x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim_x ) {
+            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx_x + (index_type)m_rp.m_lower[0];
+            if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx_x < m_rp.m_tile[0] ) {
+              m_func(offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx_x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim_x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx_x + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx_x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx_y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim_y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx_y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx_y < m_rp.m_tile[1] ) {
+              m_func(offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
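+// Worked example of the index computation above (illustrative numbers only):
+// with m_rp.m_lower[0] == 0, tile extent m_rp.m_tile[0] == 4, tile_id0 == 3,
+// and threadIdx_x == 2, the global index is offset_0 = 3*4 + 2 + 0 = 14.  The
+// two bounds checks drop threads whose offset exceeds m_rp.m_upper and threads
+// whose index exceeds the tile extent in partially filled tiles.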
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<2,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    if (RP::inner_direction == RP::Left) {
+      // Loop over size maxnumblocks until full range covered
+      for ( index_type tile_id1 = (index_type)blockIdx_y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim_y ) {
+        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx_y + (index_type)m_rp.m_lower[1];
+        if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx_y < m_rp.m_tile[1] ) {
+
+          for ( index_type tile_id0 = (index_type)blockIdx_x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim_x ) {
+            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx_x + (index_type)m_rp.m_lower[0];
+            if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx_x < m_rp.m_tile[0] ) {
+              m_func(Tag(), offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx_x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim_x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx_x + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx_x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx_y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim_y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx_y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx_y < m_rp.m_tile[1] ) {
+              m_func(Tag(), offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 3
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<3,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  [[hc]]
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id2 = (index_type)blockIdx_z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim_z ) {
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx_z + (index_type)m_rp.m_lower[2];
+        if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx_z < m_rp.m_tile[2] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx_y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim_y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx_y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx_y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id0 = (index_type)blockIdx_x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim_x ) {
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx_x + (index_type)m_rp.m_lower[0];
+                if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx_x < m_rp.m_tile[0] ) {
+                  m_func(offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx_x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim_x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx_x + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx_x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx_y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim_y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx_y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx_y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx_z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim_z ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx_z + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx_z < m_rp.m_tile[2] ) {
+                  m_func(offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<3,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id2 = (index_type)blockIdx_z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim_z ) {
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx_z + (index_type)m_rp.m_lower[2];
+        if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx_z < m_rp.m_tile[2] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx_y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim_y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx_y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx_y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id0 = (index_type)blockIdx_x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim_x ) {
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx_x + (index_type)m_rp.m_lower[0];
+                if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx_x < m_rp.m_tile[0] ) {
+                  m_func(Tag(), offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx_x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim_x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx_x + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx_x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx_y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim_y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx_y + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx_y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx_z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim_z ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx_z + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx_z < m_rp.m_tile[2] ) {
+                  m_func(Tag(), offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 4
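+// From rank 4 on there are more tile dimensions than block/thread index
+// components, so dims 0 and 1 share blockIdx_x/threadIdx_x: numbl0 and numbl1
+// split the available blocks between those two dimensions (capped by
+// max_blocks) and the flat ids are decomposed with div/mod below.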
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<4,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  [[hc]]
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
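+  // 65535 is used as a conservative per-dimension cap on the number of blocks
+  // assigned to a packed pair of tile dimensions; the commented-out ROCm
+  // queries above could supply the real hardware limit instead.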
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
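+      // blockIdx_x enumerates (tile0, tile1) pairs in mixed radix: e.g. with
+      // numbl0 == 4, blockIdx_x == 9 selects tile_id0 == 1 and tile_id1 == 2.
+      // threadIdx_x is split the same way using the dim-0 tile extent.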
+      const index_type tile_id0 = (index_type)blockIdx_x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx_x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx_x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx_x / m_rp.m_tile[0];
+
+      for ( index_type tile_id3 = (index_type)blockIdx_z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim_z ) {
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx_z + (index_type)m_rp.m_lower[3];
+        if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx_z < m_rp.m_tile[3] ) {
+
+          for ( index_type tile_id2 = (index_type)blockIdx_y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim_y ) {
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx_y + (index_type)m_rp.m_lower[2];
+            if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx_y < m_rp.m_tile[2] ) {
+
+              for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                  for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                    if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                      m_func(offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx_x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx_x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx_x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx_x % m_rp.m_tile[1];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx_y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim_y ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx_y + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx_y < m_rp.m_tile[2] ) {
+
+                  for ( index_type tile_id3 = (index_type)blockIdx_z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim_z ) {
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx_z + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx_z < m_rp.m_tile[3] ) {
+                      m_func(offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<4,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    if (RP::inner_direction == RP::Left) {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx_x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx_x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx_x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx_x / m_rp.m_tile[0];
+
+      for ( index_type tile_id3 = (index_type)blockIdx_z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim_z ) {
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx_z + (index_type)m_rp.m_lower[3];
+        if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx_z < m_rp.m_tile[3] ) {
+
+          for ( index_type tile_id2 = (index_type)blockIdx_y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim_y ) {
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx_y + (index_type)m_rp.m_lower[2];
+            if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx_y < m_rp.m_tile[2] ) {
+
+              for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                  for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                    if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                      m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx_x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx_x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx_x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx_x % m_rp.m_tile[1];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx_y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim_y ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx_y + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx_y < m_rp.m_tile[2] ) {
+
+                  for ( index_type tile_id3 = (index_type)blockIdx_z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim_z ) {
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx_z + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx_z < m_rp.m_tile[3] ) {
+                      m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 5
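+// Rank 5 packs dims 0-1 into blockIdx_x/threadIdx_x and dims 2-3 into
+// blockIdx_y/threadIdx_y, while dim 4 keeps blockIdx_z/threadIdx_z to itself.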
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<5,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  [[hc]]
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx_x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx_x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx_x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx_x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx_y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx_y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx_y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx_y / m_rp.m_tile[2];
+
+      for ( index_type tile_id4 = (index_type)blockIdx_z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim_z ) {
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx_z + (index_type)m_rp.m_lower[4];
+        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx_z < m_rp.m_tile[4] ) {
+
+          for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+            if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                    if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                      for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                          m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx_x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx_x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx_x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx_x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx_y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx_y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx_y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx_y % m_rp.m_tile[3];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type tile_id4 = (index_type)blockIdx_z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim_z ) {
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx_z + (index_type)m_rp.m_lower[4];
+                        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx_z < m_rp.m_tile[4] ) {
+                          m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<5,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx_x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx_x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx_x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx_x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx_y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx_y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx_y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx_y / m_rp.m_tile[2];
+
+      for ( index_type tile_id4 = (index_type)blockIdx_z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim_z ) {
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx_z + (index_type)m_rp.m_lower[4];
+        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx_z < m_rp.m_tile[4] ) {
+
+          for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+            if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                    if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                      for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx_x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx_x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx_x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx_x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx_y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx_y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx_y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx_y % m_rp.m_tile[3];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type tile_id4 = (index_type)blockIdx_z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim_z ) {
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx_z + (index_type)m_rp.m_lower[4];
+                        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx_z < m_rp.m_tile[4] ) {
+                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 6
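+// Rank 6 packs two tile dimensions into each block/thread index component:
+// dims 0-1 -> x, dims 2-3 -> y, dims 4-5 -> z.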
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<6,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  [[hc]]
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx_x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx_x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx_x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx_x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx_y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx_y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx_y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx_y / m_rp.m_tile[2];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx_z % numbl4;
+      const index_type tile_id5 = (index_type)blockIdx_z / numbl4;
+      const index_type thr_id4 = (index_type)threadIdx_z % m_rp.m_tile[4];
+      const index_type thr_id5 = (index_type)threadIdx_z / m_rp.m_tile[4];
+
+      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+
+          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                              m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx_x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx_x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx_x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx_x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx_y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx_y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx_y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx_y % m_rp.m_tile[3];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx_z / numbl5;
+      const index_type tile_id5 = (index_type)blockIdx_z % numbl5;
+      const index_type thr_id4 = (index_type)threadIdx_z / m_rp.m_tile[5];
+      const index_type thr_id5 = (index_type)threadIdx_z % m_rp.m_tile[5];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+                              m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<6,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx_x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx_x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx_x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx_x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx_y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx_y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx_y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx_y / m_rp.m_tile[2];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx_z % numbl4;
+      const index_type tile_id5 = (index_type)blockIdx_z / numbl4;
+      const index_type thr_id4 = (index_type)threadIdx_z % m_rp.m_tile[4];
+      const index_type thr_id5 = (index_type)threadIdx_z / m_rp.m_tile[4];
+
+      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+
+          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx_x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx_x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx_x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx_x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx_y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx_y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx_y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx_y % m_rp.m_tile[3];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx_z / numbl5;
+      const index_type tile_id5 = (index_type)blockIdx_z % numbl5;
+      const index_type thr_id4 = (index_type)threadIdx_z / m_rp.m_tile[5];
+      const index_type thr_id5 = (index_type)threadIdx_z % m_rp.m_tile[5];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+} // Refactor
+
+// ----------------------------------------------------------------------------------
+
+namespace Reduce {
+
+template < typename T >
+using is_void = std::is_same< T, void >;
+
+template < typename T >
+struct is_array_type : std::false_type
+{
+  using value_type = T;
+};
+
+template < typename T >
+struct is_array_type< T* > : std::true_type
+{
+  using value_type = T;
+};
+
+template < typename T >
+struct is_array_type< T[] > : std::true_type
+{
+  using value_type = T;
+};
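+
+// is_array_type distinguishes scalar reductions (ValueType T) from array
+// reductions (ValueType T* or T[]); the DeviceIterateTile specializations
+// below are selected with enable_if on this trait.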
+
+// ------------------------------------------------------------------ //
+template< int N , typename RP , typename Functor , typename Tag , typename ValueType , typename Enable = void >
+struct DeviceIterateTile;
+
+// ParallelReduce iteration pattern
+// Scalar reductions
+
+// num_blocks = min( num_tiles, max_num_blocks ); // i.e. determined by the number of tiles and reduction algorithm constraints
+// extract n-dim tile offsets (i.e. the tile's global starting multi-index) from tileid = blockid using the tile dimensions
+// local indices within a tile are extracted from threadIdx_y using the tile dims, constrained by the blocksize
+// combine tile and local id info for multi-dim global ids
+
+// Pattern:
+// Each block+thread is responsible for a tile+local_id combo (more than one when striding by num_blocks)
+// 1. create offset arrays
+// 2. loop over the number of tiles, striding by griddim (equal to the number of tiles, or max num blocks)
+// 3. temps set for tile_idx and thrd_idx, which will be modified
+// 4. if LL vs LR:
+//      determine tile starting point offsets (multidim)
+//      determine local index offsets (multidim)
+//      concatenate tile offset + local offset for the global multi-dim index
+//    if the offset is within the range bounds AND the local offset is within the tile bounds, call the functor
+
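+// Example of the flat-index decomposition used below (LL order): with tile
+// counts m_tile_end = {3,4}, a flat tile_idx of 7 yields tile coordinates
+// (7 % 3, (7 / 3) % 4) == (1, 2); the flat thrd_idx from threadIdx_y is
+// decomposed the same way using the tile extents m_rp.m_tile[i].
+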
+// ValueType = T
+//Rank 2
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< !is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  [[hc]]
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
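+    // Blocks beyond the tile count and threads beyond the tile volume do no work.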
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            // Deduce this block's tile id in dimension i
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          {
+            m_func( m_offset[0], m_offset[1], m_v );
+          }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_v ); }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 3
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  [[hc]]
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 4
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  [[hc]]
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 5
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  [[hc]]
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  [[hc]]
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 6
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  [[hc]]
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  [[hc]]
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// ValueType = T[], T*
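+// The specializations below handle array-valued reductions (ValueType of the
+// form T[] or T*): the reduction target is carried as a raw value_type pointer
+// rather than a reference to a single value.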
+//Rank 2
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  [[hc]]
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          {
+            m_func( m_offset[0], m_offset[1], m_v );
+          }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+      } //end for loop over num_tiles - product of tiles in each direction
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 3
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 4
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 5
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 6
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount);
+
+  KOKKOS_INLINE_FUNCTION
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::ROCmTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::rocm_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx_x < m_rp.m_num_tiles && (index_type)threadIdx_y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx_x; tileidx < m_rp.m_num_tiles; tileidx += gridDim_x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx_y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx_y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+} // Reduce
+
+// ----------------------------------------------------------------------------------
+
+} } //end namespace Kokkos::Impl
+#undef threadIdx_x
+#undef threadIdx_y
+#undef threadIdx_z
+
+#undef blockIdx_x
+#undef blockIdx_y
+#undef blockIdx_z
+
+#undef blockDim_x
+#undef blockDim_y
+#undef blockDim_z
+
+#undef gridDim_x
+#undef gridDim_y
+#undef gridDim_z
+
+
+#endif
+#endif
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Atomic.hpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Atomic.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d220cfd419df1938b2a9d805e479b573eef1efc
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Atomic.hpp
@@ -0,0 +1,477 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <hc.hpp>
+//#include <hsa_atomic.h>
+
+#ifdef KOKKOS_ENABLE_ROCM_ATOMICS
+namespace Kokkos {
+  // ROCm can do:
+  //   types:    int / unsigned int
+  //   variants: atomic_exchange / compare_exchange / fetch_add / fetch_sub /
+  //             fetch_max / fetch_min / fetch_and / fetch_or / fetch_xor /
+  //             fetch_inc / fetch_dec
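+  // Illustrative only (not part of this header): a device-side call such as
+  //   Kokkos::atomic_fetch_add(&counter, 1);   // counter is an int
+  // is served by the hc::atomic_fetch_add based overload defined further below.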
+
+
+  KOKKOS_INLINE_FUNCTION
+  int atomic_exchange(int* dest, const int& val) {
+    return hc::atomic_exchange_int(dest, val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  unsigned int atomic_exchange(unsigned int* dest, const unsigned int& val) {
+    return hc::atomic_exchange_unsigned(dest, val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int64_t atomic_exchange(int64_t* dest, const int64_t& val) {
+    return (int64_t)hc::atomic_exchange_uint64((uint64_t*)dest, (const uint64_t&)val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  uint64_t atomic_exchange(uint64_t* dest, const uint64_t& val) {
+    return hc::atomic_exchange_uint64(dest, val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  long long atomic_exchange(long long* dest, const long long& val) {
+    return (long long)hc::atomic_exchange_uint64((uint64_t*)dest, (const uint64_t&)val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  unsigned long long atomic_exchange(unsigned long long* dest, const unsigned long long& val) {
+    return (unsigned long long)hc::atomic_exchange_uint64((uint64_t*)dest, (const uint64_t&)val);
+  }
+
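+  // 32-bit float exchange: the bit patterns are shuttled through a union so the
+  // integer hc::atomic_exchange_int can operate on the float's storage; the
+  // returned bits are reinterpreted back to float.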
+  KOKKOS_INLINE_FUNCTION
+  float atomic_exchange(float* dest, const float& val) {
+    union U {
+      int i ;
+      float f ;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } idest,ival;
+    idest.f = *dest;
+    ival.f = val;
+    idest.i = hc::atomic_exchange_int((int*)dest, ival.i);
+    return idest.f;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  double atomic_exchange(double* dest, const double& val) {
+    union U {
+      uint64_t i ;
+      double d ;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } idest,ival;
+    idest.d = *dest;
+    ival.d = val;
+    idest.i = hc::atomic_exchange_uint64((uint64_t*)dest, ival.i);
+    return idest.d;
+  }
+
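+  // Forward declarations: the generic atomic_exchange templates below call
+  // atomic_compare_exchange before its definitions appear later in this header.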
+  KOKKOS_INLINE_FUNCTION
+  int atomic_compare_exchange(int* dest, int compare, const int& val);
+
+  KOKKOS_INLINE_FUNCTION
+  int64_t atomic_compare_exchange(int64_t* dest, int64_t compare, const int64_t& val);
+
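+  // Generic exchange for types the size of an int: *dest is read, both old and
+  // new values are punned to int through a union, a single compare-exchange
+  // installs the new bits, and the previously read value is returned.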
+  template<class T>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_exchange(T* dest, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
+    union U {
+      int i ;
+      T t ;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } assume , oldval , newval ;
+
+    oldval.t = *dest ;
+    assume.i = oldval.i ;
+    newval.t = val ;
+    atomic_compare_exchange( (int*)(dest) , assume.i, newval.i );
+
+    return oldval.t ;    
+  }
+
+  template<class T>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_exchange(T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(int64_t), const T&>::type val) {
+    union U {
+      uint64_t i ;
+      T t ;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } assume , oldval , newval ;
+
+    oldval.t = *dest ;
+
+    assume.i = oldval.i ;
+    newval.t = val ;
+    atomic_compare_exchange( (int64_t*)(dest) , assume.i, newval.i );
+
+    return oldval.t ;    
+  }
+ 
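+  // Fallback for sizes with no matching hardware atomic: nothing is exchanged
+  // and the input value is returned unchanged.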
+  template<class T>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_exchange(T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(int64_t), const T&>::type val) {
+    return val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int atomic_compare_exchange(int* dest, int compare, const int& val) {
+    return hc::atomic_compare_exchange_int(dest, compare, val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  unsigned int atomic_compare_exchange(unsigned int* dest, unsigned int compare, const unsigned int& val) {
+    return hc::atomic_compare_exchange_unsigned(dest, compare, val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int64_t atomic_compare_exchange(int64_t* dest, int64_t compare, const int64_t& val) {
+    return (int64_t) hc::atomic_compare_exchange_uint64((uint64_t*)dest, (uint64_t)compare, (const uint64_t&)val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  uint64_t atomic_compare_exchange(uint64_t* dest, uint64_t compare, const uint64_t& val) {
+    return hc::atomic_compare_exchange_uint64(dest, compare, val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  long long atomic_compare_exchange(long long* dest, long long compare, const long long& val) {
+    return (long long)hc::atomic_compare_exchange_uint64((uint64_t*)(dest), (uint64_t)(compare), (const uint64_t&)(val));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  float atomic_compare_exchange(float* dest, float compare, const float& val) {
+    union U {
+      int i ;
+      float f ;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } idest,icompare,ival;
+    idest.f = *dest;
+    icompare.f = compare;
+    ival.f = val;
+    idest.i = hc::atomic_compare_exchange_int(reinterpret_cast<int*>(dest), icompare.i, ival.i);
+    return idest.f;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  double atomic_compare_exchange(double* dest, double compare, const double& val) {
+    union U {
+      uint64_t i ;
+      double d ;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } idest,icompare,ival;
+    idest.d = *dest;
+    icompare.d = compare;
+    ival.d = val;
+    idest.i = hc::atomic_compare_exchange_uint64(reinterpret_cast<uint64_t*>(dest), icompare.i, ival.i);
+    return idest.d;
+  }
+
+  template<class T>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_compare_exchange(volatile T* dest, T compare, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
+    union U {
+      int i ;
+      T f ;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } idest,icompare,ival;
+    idest.f = *dest;
+    icompare.f = compare;
+    ival.f = val;
+    idest.i = hc::atomic_compare_exchange_int((int*)(dest), icompare.i, ival.i);
+    return idest.f;
+  }
+
+  template<class T>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_compare_exchange(volatile T* dest, T compare, typename std::enable_if<sizeof(T) == sizeof(int64_t), const T&>::type val) {
+    union U {
+      uint64_t i ;
+      T f ;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } idest,icompare,ival;
+    idest.f = *dest;
+    icompare.f = compare;
+    ival.f = val;
+    idest.i = hc::atomic_compare_exchange_uint64((uint64_t*)(dest), icompare.i, ival.i);
+    return idest.f;
+  }
+
+  template<class T>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_compare_exchange(volatile T* dest, T compare, typename std::enable_if<(sizeof(T) != sizeof(int32_t)) && (sizeof(T) != sizeof(int64_t)), const T&>::type val) {
+    return val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int atomic_fetch_add (volatile int * dest, const int& val) {
+    return hc::atomic_fetch_add((int *)dest, val);
+  }
+  
+  KOKKOS_INLINE_FUNCTION
+  unsigned int atomic_fetch_add(unsigned int* dest, const unsigned int& val) {
+    return hc::atomic_fetch_add(dest, val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  unsigned long atomic_fetch_add(volatile unsigned long* dest, const unsigned long& val) {
+    return (unsigned long)hc::atomic_fetch_add((uint64_t *)dest, (const uint64_t)val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int64_t atomic_fetch_add(volatile int64_t* dest, const int64_t& val) {
+    return (int64_t)hc::atomic_fetch_add((uint64_t *)dest, (const uint64_t&)val);
+  }
+
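+  // The char and short overloads are emulated with a compare-exchange loop on
+  // the enclosing 32-bit word, recombining the updated low byte/halfword with
+  // the masked remainder of the word.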
+  KOKKOS_INLINE_FUNCTION
+  char atomic_fetch_add(volatile char * dest, const char& val) {
+    unsigned int oldval,newval,assume;
+    oldval = *(int *)dest ;
+
+    do {
+      assume = oldval ;
+      newval = (assume&0x7fffff00) + (((assume&0xff)+val)&0xff) ;
+      oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
+    } while ( assume != oldval );
+
+    return oldval ;    
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  short atomic_fetch_add(volatile short * dest, const short& val) {
+    unsigned int oldval,newval,assume;
+    oldval = *(int *)dest ;
+
+    do {
+      assume = oldval ;
+      newval = (assume&0x7fff0000) + (((assume&0xffff)+val)&0xffff) ;
+      oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
+    } while ( assume != oldval );
+
+    return oldval ;    
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  long long atomic_fetch_add(volatile long long * dest, const long long& val) {
+    return (long long)hc::atomic_fetch_add((uint64_t*)dest, (const uint64_t&)val);
+  }
+
+
+
+  KOKKOS_INLINE_FUNCTION
+  int atomic_fetch_sub (volatile int * dest, const int& val) {
+    return hc::atomic_fetch_sub((int *)dest, val);
+  }
+  
+  KOKKOS_INLINE_FUNCTION
+  unsigned int atomic_fetch_sub(volatile unsigned int* dest, const unsigned int& val) {
+    return hc::atomic_fetch_sub((unsigned int *)dest, val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int64_t atomic_fetch_sub(int64_t* dest, const int64_t& val) {
+    return (int64_t)hc::atomic_fetch_add((uint64_t *)dest, -(const uint64_t&)val);
+//    return (int64_t)hc::atomic_fetch_sub_uint64((uint64_t*)dest, (const uint64_t&)val);
+  }
+  
+  KOKKOS_INLINE_FUNCTION
+  char atomic_fetch_sub(volatile char * dest, const char& val) {
+    unsigned int oldval,newval,assume;
+    oldval = *(int *)dest ;
+
+    do {
+      assume = oldval ;
+      newval = (assume&0x7fffff00) + (((assume&0xff)-val)&0xff) ;
+      oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
+    } while ( assume != oldval );
+
+    return oldval ;    
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  short atomic_fetch_sub(volatile short * dest, const short& val) {
+    unsigned int oldval,newval,assume;
+    oldval = *(int *)dest ;
+
+    do {
+      assume = oldval ;
+      newval = (assume&0x7fff0000) + (((assume&0xffff)-val)&0xffff);
+      oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
+    } while ( assume != oldval );
+
+    return oldval ;    
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  long long atomic_fetch_sub(volatile long long * dest, const long long& val) {
+    return (long long)hc::atomic_fetch_add((uint64_t*)dest, -(const uint64_t&)val);
+  }
+
+  template<class T>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_fetch_add(volatile T* dest, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
+    union U {
+      unsigned int i ;
+      T t ;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } assume , oldval , newval ;
+
+    oldval.t = *dest ;
+
+    do {
+      assume.i = oldval.i ;
+      newval.t = assume.t + val ;
+      oldval.i = atomic_compare_exchange( (unsigned int*)(dest) , assume.i , newval.i );
+    } while ( assume.i != oldval.i );
+
+    return oldval.t ;    
+  }
+
+  template<class T>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_fetch_add(volatile T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(int64_t), const T&>::type val) {
+    union U {
+      uint64_t i ;
+      T t ;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } assume , oldval , newval ;
+
+    oldval.t = *dest ;
+
+    do {
+      assume.i = oldval.i ;
+      newval.t = assume.t + val ;
+      oldval.i = atomic_compare_exchange( (uint64_t*)dest , assume.i , newval.i );
+    } while ( assume.i != oldval.i );
+
+    return oldval.t ;    
+  }
+
+
+  // WORKAROUND: fallback for element sizes with no native atomic support;
+  // returns val without modifying *dest.
+  template<class T>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_fetch_add(volatile T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(int64_t), const T&>::type val) {
+    return val ;
+  }
+
+  template<class T>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_fetch_sub(volatile T* dest, typename std::enable_if<sizeof(T) == sizeof(int),T>::type val) {
+    union U {
+      int i ;
+      T t ;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } assume , oldval , newval ;
+
+    oldval.t = *dest ;
+
+    do {
+      assume.i = oldval.i ;
+      newval.t = assume.t - val ;
+      oldval.i = Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i );
+    } while ( assume.i != oldval.i );
+
+    return oldval.t ;
+  }
+
+  template<class T>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_fetch_sub(volatile T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(int64_t), const T&>::type val) {
+    union U {
+      int64_t i ;
+      T t ;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } assume , oldval , newval ;
+
+    oldval.t = *dest ;
+
+    do {
+      assume.i = oldval.i ;
+      newval.t = assume.t - val ;
+      oldval.i = atomic_compare_exchange( (int64_t*)dest , assume.i , newval.i );
+    } while ( assume.i != oldval.i );
+
+    return oldval.t ;    
+  }
+//  KOKKOS_INLINE_FUNCTION
+//  char atomic_fetch_sub(volatile char * dest, const char& val) {
+  template<class T>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_fetch_sub(volatile T* dest, typename std::enable_if<sizeof(T) == sizeof(char),T>::type val) {
+
+    unsigned int oldval,newval,assume;
+    oldval = *(int *)dest ;
+
+    do {
+      assume = oldval ;
+      newval = (assume&0x7fffff00) + (((assume&0xff)-val)&0xff) ;
+      oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
+    } while ( assume != oldval );
+
+    return (T)( oldval&0xff );
+  }
+
+//  KOKKOS_INLINE_FUNCTION
+//  short atomic_fetch_sub(volatile short * dest, const short& val) {
+  template<class T>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_fetch_sub(volatile T* dest, typename std::enable_if<sizeof(T) == sizeof(short),T>::type val) {
+
+    unsigned int oldval,newval,assume;
+    oldval = *(int *)dest ;
+
+    do {
+      assume = oldval ;
+      newval = (assume&0x7fff0000) + (((assume&0xffff)-val)&0xffff);
+      oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
+    } while ( assume != oldval );
+
+    return (T)( oldval&0xffff );
+  }
+
+
+
+}
+#endif
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Config.hpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Config.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..71643458b471d45e0cc3743fea1233f3b8c39f54
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Config.hpp
@@ -0,0 +1,51 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef GUARD_CORE_KOKKOS_ROCM_CONFIG_HPP
+#define GUARD_CORE_KOKKOS_ROCM_CONFIG_HPP
+
+#ifndef KOKKOS_ROCM_HAS_WORKAROUNDS
+#define KOKKOS_ROCM_HAS_WORKAROUNDS 1
+#endif
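+
+// Illustrative use (hedged sketch; the consuming code is not part of this file):
+//
+//   #if KOKKOS_ROCM_HAS_WORKAROUNDS
+//     // take the conservative code path for current HCC/ROCm limitations
+//   #endif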
+
+#endif
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.cpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b4be18d03be763bea6aa0c54148bbd07b572e675
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.cpp
@@ -0,0 +1,133 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_ROCMEXEC_HPP
+#define KOKKOS_ROCMEXEC_HPP
+
+#include <algorithm>
+#include <typeinfo>
+#include <Kokkos_Macros.hpp>
+//#include <ROCm/Kokkos_ROCmExec.hpp>
+#include <hc.hpp>
+
+#define ROCM_SPACE_ATOMIC_MASK      0x1FFFF
+#define ROCM_SPACE_ATOMIC_XOR_MASK  0x15A39
+#define ROCM_CONCURRENCY 20480
+//#define ROCM_CONCURRENCY 81920  // for fiji
+
+namespace Kokkos {
+  static int rocm_space_atomic_locks[ROCM_SPACE_ATOMIC_MASK+1];
+  static int rocm_space_scratch_locks[ROCM_CONCURRENCY];
+  static int rocm_space_threadid_locks[ROCM_CONCURRENCY];
+namespace Impl {
+// TODO: mimic the CUDA implementation, add dGPU capability
+
+  void init_rocm_atomic_lock_array() {
+    static int is_initialized = 0;
+    if(!is_initialized)
+    {
+      for(int i = 0; i < ROCM_SPACE_ATOMIC_MASK+1; i++)
+        rocm_space_atomic_locks[i] = 0;
+      is_initialized = 1;
+    }
+  }
+
+  void init_rocm_scratch_lock_array() {
+    static int is_initialized = 0;
+    if(!is_initialized)
+    {
+      for(int i = 0; i < ROCM_CONCURRENCY; i++)
+        rocm_space_scratch_locks[i] = 0;
+      is_initialized = 1;
+    }
+  }
+
+  void init_rocm_threadid_lock_array() {
+    static int is_initialized = 0;
+    if(!is_initialized)
+    {
+      for(int i = 0; i < ROCM_CONCURRENCY; i++)
+        rocm_space_threadid_locks[i] = 0;
+      is_initialized = 1;
+    }
+  }
+
+  void init_lock_arrays_rocm_space() {
+     init_rocm_atomic_lock_array();
+//     init_rocm_scratch_lock_array();
+//     init_rocm_threadid_lock_array();
+  }
+}
+
+} // namespace Kokkos
+#if 0
+namespace Kokkos {
+namespace Impl {
+KOKKOS_INLINE_FUNCTION
+bool lock_address_rocm_space(void* ptr) {
+#if 0
+return(Kokkos::Impl::lock_address_host_space(ptr));
+#else
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & ROCM_SPACE_ATOMIC_MASK;
+  return (0 == hc::atomic_compare_exchange(&rocm_space_atomic_locks[offset],0,1));
+#endif
+}
+
+KOKKOS_INLINE_FUNCTION
+void unlock_address_rocm_space(void* ptr) {
+#if 0
+Kokkos::Impl::unlock_address_host_space(ptr) ;
+#else
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & ROCM_SPACE_ATOMIC_MASK;
+  hc::atomic_exchange( &rocm_space_atomic_locks[ offset ], 0);
+#endif
+}
+
+}
+} // namespace Kokkos
+#endif
+
+#endif /* #ifndef KOKKOS_ROCMEXEC_HPP */
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.hpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c0d5b9004ca532cface778738bef87934afcf692
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.hpp
@@ -0,0 +1,258 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_ROCMEXEC_HPP
+#define KOKKOS_ROCMEXEC_HPP
+
+#include <algorithm>
+#include <typeinfo>
+
+#if defined(__HCC_ACCELERATOR__)
+#define printf(...)
+#endif
+
+namespace Kokkos {
+namespace Impl {
+
+struct ROCmTraits {
+// TODO: determine if needed
+  enum { WavefrontSize       = 64 /* 64  */ };
+  enum { WorkgroupSize       = 64 /* 64  */ };
+  enum { WavefrontIndexMask  = 0x001f  /* Mask for warpindex */ };
+  enum { WavefrontIndexShift = 5       /* WarpSize == 1 << WarpShift */ };
+
+  enum { SharedMemoryBanks    = 32      /* Compute device 2.0 */ };
+  enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
+  enum { SharedMemoryUsage    = 0x04000 /* 16k shared / 48k L1 Cache */ };
+
+  enum { UpperBoundExtentCount    = 4294967295 /* Hard upper bound */ };
+#if 0
+  KOKKOS_INLINE_FUNCTION static
+  ROCmSpace::size_type wavefront_count( ROCmSpace::size_type i )
+    { return ( i +  WavefrontIndexMask ) >>  WavefrontIndexShift ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  ROCmSpace::size_type wavefront_align( ROCmSpace::size_type i )
+    {
+      enum { Mask = ~ROCmSpace::size_type(  WavefrontIndexMask ) };
+      return ( i +  WavefrontIndexMask ) & Mask ;
+    }
+#endif
+};
+size_t rocm_internal_cu_count();
+size_t rocm_internal_maximum_workgroup_count();
+
+size_t * rocm_internal_scratch_flags( const size_t size );
+size_t * rocm_internal_scratch_space( const size_t size );
+
+}
+} // namespace Kokkos
+#define ROCM_SPACE_ATOMIC_MASK      0x1FFFF
+#define ROCM_SPACE_ATOMIC_XOR_MASK  0x15A39
+//int rocm_space_atomic_locks[ROCM_SPACE_ATOMIC_MASK+1];
+extern int
+   *rocm_space_atomic_locks;
+
+namespace Kokkos {
+namespace Impl {
+  void init_lock_arrays_rocm_space();
+
+  void* rocm_resize_scratch_space(size_t bytes, bool force_shrink = false);
+
+// TODO: determine if needed
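+// The two inline helpers below back atomics on "arbitrarily sized" types with
+// a table of per-address locks: the pointer is hashed (shifted right by 2 and
+// masked with ROCM_SPACE_ATOMIC_MASK) into rocm_space_atomic_locks, a
+// compare-exchange from 0 to 1 acquires the slot, and storing 0 releases it.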
+KOKKOS_INLINE_FUNCTION
+bool lock_address_rocm_space(void* ptr) {
+#if 0
+return(Kokkos::Impl::lock_address_host_space(ptr));
+#else
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & ROCM_SPACE_ATOMIC_MASK;
+  return (0 == hc::atomic_compare_exchange(&rocm_space_atomic_locks[offset],0,1));
+#endif
+}
+KOKKOS_INLINE_FUNCTION
+void unlock_address_rocm_space(void* ptr) {
+#if 0
+Kokkos::Impl::unlock_address_host_space(ptr) ;
+#else
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & ROCM_SPACE_ATOMIC_MASK;
+  hc::atomic_exchange( &rocm_space_atomic_locks[ offset ], 0);
+#endif
+}
+
+}
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Impl {
+//extern 
+//KOKKOS_INLINE_FUNCTION
+//void init_lock_arrays_rocm_space(); 
+
+
+}
+} // namespace Kokkos
+//#if defined(__HCC_ACCELERATOR__)
+namespace Kokkos {
+namespace Impl {
+/*
+template< class DriverType>
+__global__
+static void rocm_parallel_launch_constant_memory()
+{
+  const DriverType & driver =
+    *((const DriverType *) kokkos_impl_rocm_constant_memory_buffer );
+
+  driver();
+}
+
+template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
+__global__
+__launch_bounds__(maxTperB, minBperSM)
+static void rocm_parallel_launch_constant_memory()
+{
+  const DriverType & driver =
+    *((const DriverType *) kokkos_impl_rocm_constant_memory_buffer );
+
+  driver();
+}
+
+template< class DriverType>
+__global__
+static void rocm_parallel_launch_local_memory( const DriverType driver )
+{
+  driver();
+}
+
+template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
+__global__
+__launch_bounds__(maxTperB, minBperSM)
+static void rocm_parallel_launch_local_memory( const DriverType driver )
+{
+  driver();
+}
+*/
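+// ROCmParallelLaunch drives a kernel launch in three steps: the functor is
+// copied into a buffer obtained from rocm_device_allocate(), an
+// hc::parallel_for_each is issued over a tiled 3-D extent built from the
+// grid/block dimensions (with 'shmem' bytes of dynamic group memory), and the
+// buffer is freed once the completion future has been waited on.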
+template < class DriverType
+         , class LaunchBounds = Kokkos::LaunchBounds<> >
+struct ROCmParallelLaunch ;
+
+template < class DriverType
+         , unsigned int MaxThreadsPerBlock
+         , unsigned int MinBlocksPerSM >
+struct ROCmParallelLaunch< DriverType
+                         , Kokkos::LaunchBounds< MaxThreadsPerBlock
+                                               , MinBlocksPerSM >>
+{
+  inline
+  ROCmParallelLaunch( const DriverType & driver
+                    , const dim3       & grid
+                    , const dim3       & block
+                    , const int      shmem )
+  {
+    if ( grid.x && ( block.x * block.y * block.z ) ) {
+      if ( ROCmTraits::SharedMemoryCapacity < shmem ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("ROCmParallelLaunch FAILED: shared memory request is too large") );
+      }
+      DriverType * rocm_memory_buffer = (DriverType *)
+                                     rocm_device_allocate(sizeof(DriverType));
+      // Copy the functor into the device-side buffer
+      Kokkos::Impl::DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace>
+              ( rocm_memory_buffer , (void *)&driver , sizeof(DriverType) );
+
+//      KOKKOS_ENSURE_ROCM_LOCK_ARRAYS_ON_DEVICE();
+
+      // Invoke the driver function on the device
+      auto ext = hc::extent<3>(grid.x,grid.y,grid.z);
+ 
+      hc::parallel_for_each(ext.tile_with_dynamic(block.x,block.y,block.z,shmem), [=](const hc::index<3> & idx) [[hc]]
+      { rocm_memory_buffer->operator()();
+      }).wait();
+      rocm_device_free(rocm_memory_buffer);
+
+//#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+//      ROCM_SAFE_CALL( rocmGetLastError() );
+//      Kokkos::ROCm::fence();
+//#endif
+    }
+  }
+};
+template < class DriverType >
+struct ROCmParallelLaunch< DriverType
+                         , Kokkos::LaunchBounds<>>
+{
+  inline
+  ROCmParallelLaunch( const DriverType & driver
+                    , const dim3       & grid
+                    , const dim3       & block
+                    , const int          shmem )
+  {
+    if ( grid.x && ( block.x * block.y * block.z ) ) {
+      if ( ROCmTraits::SharedMemoryCapacity < shmem ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("ROCmParallelLaunch FAILED: shared memory request is too large") );
+      }
+
+      DriverType * rocm_memory_buffer = (DriverType *)
+                                     rocm_device_allocate(sizeof(DriverType));
+      // Copy the functor into the device-side buffer
+      Kokkos::Impl::DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace>
+              ( rocm_memory_buffer , (void *)&driver , sizeof(DriverType) );
+
+//      KOKKOS_ENSURE_ROCM_LOCK_ARRAYS_ON_DEVICE();
+      // Invoke the driver function on the device
+      auto ext = hc::extent<3>(grid.x,grid.y,grid.z);
+ 
+      hc::parallel_for_each(ext.tile_with_dynamic(block.x,block.y,block.z,shmem), [=](const hc::index<3> & idx) [[hc]]
+ 
+      { rocm_memory_buffer->operator()();
+      }).wait();
+      rocm_device_free(rocm_memory_buffer);
+    }
+  }
+};
+} // namespace Impl
+} // namespace Kokkos
+
+
+#endif /* #ifndef KOKKOS_ROCMEXEC_HPP */
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Impl.cpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Impl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3ae312647e6d038cf561860ac7c81bf3fbce7bf8
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Impl.cpp
@@ -0,0 +1,754 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/*--------------------------------------------------------------------------*/
+/* Kokkos interfaces */
+
+#include <Kokkos_Core.hpp>
+
+/* only compile this file if ROCM is enabled for Kokkos */
+#ifdef KOKKOS_ENABLE_ROCM
+
+//#include <ROCm/Kokkos_ROCm_Internal.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <Kokkos_ROCmSpace.hpp>
+#include <ROCm/Kokkos_ROCm_Exec.hpp>
+
+/*--------------------------------------------------------------------------*/
+/* Standard 'C' libraries */
+#include <stdlib.h>
+
+/* Standard 'C++' libraries */
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+
+
+//KOKKOS_INLINE_FUNCTION
+// Kokkos::Impl::ROCmLockArraysStruct kokkos_impl_rocm_lock_arrays ;
+
+
+/*--------------------------------------------------------------------------*/
+namespace Kokkos {
+namespace Impl {
+
+#if 0
+namespace {
+__global__
+void query_rocm_kernel_arch( int * d_arch )
+{
+#if defined( __HCC_ACCELERATOR__ )
+  *d_arch = OCM_ARCH__ ;
+#else
+  *d_arch = 0 ;
+#endif
+}
+
+/** Query what compute capability is actually launched to the device: */
+int rocm_kernel_arch()
+{
+  int * d_arch = 0 ;
+  rocmMalloc( (void **) & d_arch , sizeof(int) );
+  query_rocm_kernel_arch<<<1,1>>>( d_arch );
+  int arch = 0 ;
+  rocmMemcpy( & arch , d_arch , sizeof(int) , rocmMemcpyDefault );
+  rocmFree( d_arch );
+  return arch ;
+}
+bool rocm_launch_blocking()
+{
+  const char * env = getenv("ROCM_LAUNCH_BLOCKING");
+
+  if (env == 0) return false;
+
+  return atoi(env);
+}
+
+}
+#endif
+
+// true device memory allocation, not visible from host
+void * rocm_device_allocate(int size)
+{
+  void * ptr;
+  hc::accelerator acc;
+  ptr = hc::am_alloc(size,acc,0);
+  return ptr;
+}
+
+// host pinned allocation
+// flag = 1, non-coherent, host resident, but with gpu address space pointer
+// flag = 2, coherent, host resident, but with host address space pointer
+void * rocm_hostpinned_allocate(int size)
+{
+  void * ptr;
+  hc::accelerator acc;
+  ptr = hc::am_alloc(size,acc,2);
+  return ptr;
+}
+// same free used by all rocm memory allocations
+void rocm_device_free(void * ptr)
+{
+  hc::am_free(ptr);
+}
+
+
+KOKKOS_INLINE_FUNCTION
+void rocm_device_synchronize()
+{
+   hc::accelerator_view av = hc::accelerator().get_default_view();
+   hc::completion_future fut = av.create_marker();
+   fut.wait();
+}
+
+void rocm_internal_error_throw( const char * name, const char * file, const int line )
+{
+#if 0
+  std::ostringstream out ;
+  out << name << " error( " << rocmGetErrorName(e) << "): " << rocmGetErrorString(e);
+  if (file) {
+    out << " " << file << ":" << line;
+  }
+  throw_runtime_exception( out.str() );
+#endif
+}
+
+//----------------------------------------------------------------------------
+// Some significant rocm device properties:
+//
+// rocmDeviceProp::name                : Text label for device
+// rocmDeviceProp::major               : Device major number
+// rocmDeviceProp::minor               : Device minor number
+// rocmDeviceProp::workgroupSize       : number of threads per workgroup
+// rocmDeviceProp::multiProcessorCount : number of multiprocessors
+// rocmDeviceProp::sharedMemPerWavefront : capacity of shared memory per wavefront
+// rocmDeviceProp::totalConstMem       : capacity of constant memory
+// rocmDeviceProp::totalGlobalMem      : capacity of global memory
+// rocmDeviceProp::maxGridSize[3]      : maximum grid size
+
+//
+//
+// the data we have available from a ROCm accelerator
+// std::wstring get_device_path()
+// std::wstring get_description()
+// unsigned int get_version()
+// bool get_has_display()
+// size_t get_dedicated_memory()
+// bool get_supports_double_precision()
+// bool get_supports_limited_double_precision()
+// bool get_is_debug()
+// bool get_supports_cpu_shared_memory()
+// size_t get_max_tile_static_size()
+// unsigned int get_cu_count()
+// bool has_cpu_accessible_am() 
+struct rocmDeviceProp {
+   char name[256];
+   char description[256];
+   unsigned int version;
+   int device_type;
+   int device_ordinal;
+   int major;
+   int minor;
+   size_t totalGlobalMem;
+   size_t sharedMemPerWavefront;
+   int WavefrontSize;
+   int WorkgroupSize;
+   int MaxTileCount;
+   int maxThreadsPerWorkgroup;
+   int multiProcessorCount;
+   int canMapHostMemory;
+   bool APU;
+};
+
+
+
+void rocmGetDeviceProperties(struct rocmDeviceProp* devProp, int device)
+{
+   std::wstring s;
+   int i,n;
+   hc::accelerator acc;
+   std::vector<hc::accelerator> accv = acc.get_all() ;
+
+   hc::accelerator a = accv[device];
+
+   s=a.get_device_path();
+   i = 0;
+   for(wchar_t c: s)
+      if((n=std::wctomb(&devProp->name[i],c))>0)
+         i+=n;
+   devProp->name[i] = '\0';  // ensure the narrow string is terminated
+
+   /* assume a CPU */
+   devProp->version = a.get_version();
+   devProp->major = a.get_version()>>16; // for CPU, these are meaningless 
+   devProp->minor = a.get_version()&0xff;
+   devProp->device_ordinal = 0;
+
+   /* is this an AMD graphics card */
+   if((devProp->name[0]=='g') && (devProp->name[1]=='f') 
+                              && (devProp->name[2]=='x')) {
+   /* for AMD cards, the name has the format gfxMmmO */
+     
+      devProp->device_type    = ((devProp->name[3]-0x30)<<16)
+                              + ((devProp->name[4]-0x30)<<8)
+                              +  (devProp->name[5]-0x30);
+      devProp->device_ordinal = devProp->name[6]-0x30;
+      devProp->major = devProp->name[3]-0x30;
+      devProp->minor = devProp->name[5]-0x30;
+   }
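+   // Worked example: a device path of the documented form "gfx8030" parses to
+   // device_type 0x080003, device_ordinal 0, major 8, minor 3.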
+
+   s=a.get_description();
+   i = 0;
+   for(wchar_t c: s)
+      if((n=std::wctomb(&devProp->description[i],c))>0)
+         i+=n;
+   devProp->description[i] = '\0';  // ensure the narrow string is terminated
+   devProp->totalGlobalMem = a.get_dedicated_memory();
+   devProp->sharedMemPerWavefront = a.get_max_tile_static_size();
+   devProp->WavefrontSize = 64;
+   devProp->WorkgroupSize = 256; // preferred
+   devProp->MaxTileCount  = 409600; // as defined in /opt/rocm/hcc-lc/include/hsa_new.h
+   devProp->maxThreadsPerWorkgroup = 1024;
+   devProp->multiProcessorCount = a.get_cu_count();
+   devProp->canMapHostMemory = a.get_supports_cpu_shared_memory();
+// Kaveri has 64KB L2 per CU, 16KB L1, 64KB Vector Regs/SIMD, or 128 regs/thread
+// GCN has 64KB LDS per CU
+
+//Kaveri APU is 7:0:0
+//Carrizo APU is 8:0:1
+   devProp->APU = (((devProp->major==7)&&(devProp->minor==0))|
+                   ((devProp->major==8)&&(devProp->minor==1)))?true:false;
+}
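+
+// Usage sketch (illustrative only): query device 0 and branch on the result.
+//
+//   rocmDeviceProp props;
+//   rocmGetDeviceProperties(&props, 0);
+//   if (props.APU) { /* host and device share physical memory */ }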
+
+namespace {
+
+
+
+class ROCmInternalDevices {
+public:
+  enum { MAXIMUM_DEVICE_COUNT = 64 };
+  struct rocmDeviceProp  m_rocmProp[ MAXIMUM_DEVICE_COUNT ] ;
+  int                    m_rocmDevCount ;
+
+  ROCmInternalDevices();
+
+  static const ROCmInternalDevices & singleton();
+};
+
+ROCmInternalDevices::ROCmInternalDevices()
+{
+   hc::accelerator acc;
+   std::vector<hc::accelerator> accv = acc.get_all() ;
+   m_rocmDevCount = accv.size();
+
+  if(m_rocmDevCount > MAXIMUM_DEVICE_COUNT) {
+    Kokkos::abort("Sorry, you have more GPUs per node than we thought anybody would ever have. Please report this to github.com/kokkos/kokkos.");
+  }
+  for ( int i = 0 ; i < m_rocmDevCount ; ++i ) {
+    rocmGetDeviceProperties( m_rocmProp + i , i );
+  }
+}
+
+const ROCmInternalDevices & ROCmInternalDevices::singleton()
+{
+  static ROCmInternalDevices* self = nullptr;
+  if (!self) {
+    self = new ROCmInternalDevices();
+  }
+  return *self;
+
+}
+
+}
+
+//----------------------------------------------------------------------------
+
+class ROCmInternal {
+private:
+
+  ROCmInternal( const ROCmInternal & );
+  ROCmInternal & operator = ( const ROCmInternal & );
+
+
+public:
+
+  typedef Kokkos::Experimental::ROCm::size_type size_type ;
+
+  int         m_rocmDev ;
+  int         m_rocmArch ;
+  unsigned    m_multiProcCount ;
+  unsigned    m_maxWorkgroup ;
+  unsigned    m_maxSharedWords ;
+  size_type   m_scratchSpaceCount ;
+  size_type   m_scratchFlagsCount ;
+  size_type * m_scratchSpace ;
+  size_type * m_scratchFlags ;
+
+  static int was_finalized;
+
+  static ROCmInternal & singleton();
+
+  int verify_is_initialized( const char * const label ) const ;
+
+  int is_initialized() const
+    { return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
+
+  void initialize( int rocm_device_id );
+  void finalize();
+
+  void print_configuration( std::ostream & ) const ;
+
+
+  ~ROCmInternal();
+
+  ROCmInternal()
+    : m_rocmDev( -1 )
+    , m_rocmArch( -1 )
+    , m_multiProcCount( 0 )
+    , m_maxWorkgroup( 0 )
+    , m_maxSharedWords( 0 )
+    , m_scratchSpaceCount( 0 )
+    , m_scratchFlagsCount( 0 )
+    , m_scratchSpace( 0 )
+    , m_scratchFlags( 0 )
+    {}
+
+  size_type * scratch_space( const size_type size );
+  size_type * scratch_flags( const size_type size );
+};
+
+int ROCmInternal::was_finalized = 0;
+//----------------------------------------------------------------------------
+
+
+void ROCmInternal::print_configuration( std::ostream & s ) const
+{
+  const ROCmInternalDevices & dev_info = ROCmInternalDevices::singleton();
+
+#if defined( KOKKOS_ENABLE_ROCM )
+    s << "macro  KOKKOS_ENABLE_ROCM      : defined" << std::endl ;
+#endif
+#if defined( __hcc_version__ )
+    s << "macro  __hcc_version__          = " << __hcc_version__
+      << std::endl ;
+#endif
+
+  for ( int i = 0 ; i < dev_info.m_rocmDevCount ; ++i ) {
+    s << "Kokkos::Experimental::ROCm[ " << i << " ] "
+      << dev_info.m_rocmProp[i].name
+      << " version " << (dev_info.m_rocmProp[i].major) << "." << dev_info.m_rocmProp[i].minor
+      << ", Total Global Memory: " << human_memory_size(dev_info.m_rocmProp[i].totalGlobalMem)
+      << ", Shared Memory per Wavefront: " << human_memory_size(dev_info.m_rocmProp[i].sharedMemPerWavefront);
+    if ( m_rocmDev == i ) s << " : Selected" ;
+    s << std::endl ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+ROCmInternal::~ROCmInternal()
+{
+  if ( m_scratchSpace ||
+       m_scratchFlags ) {
+    std::cerr << "Kokkos::Experimental::ROCm ERROR: Failed to call Kokkos::Experimental::ROCm::finalize()"
+              << std::endl ;
+    std::cerr.flush();
+  }
+
+  m_rocmDev                 = -1 ;
+  m_rocmArch                = -1 ;
+  m_multiProcCount          = 0 ;
+  m_maxWorkgroup            = 0 ;
+  m_maxSharedWords          = 0 ;
+  m_scratchSpaceCount       = 0 ;
+  m_scratchFlagsCount       = 0 ;
+  m_scratchSpace            = 0 ;
+  m_scratchFlags            = 0 ;
+}
+
+int ROCmInternal::verify_is_initialized( const char * const label ) const
+{
+  if ( m_rocmDev < 0 ) {
+    std::cerr << "Kokkos::Experimental::ROCm::" << label << " : ERROR device not initialized" << std::endl ;
+  }
+  return 0 <= m_rocmDev ;
+}
+
+ROCmInternal & ROCmInternal::singleton()
+{
+  static ROCmInternal* self = nullptr ;
+  if (!self) {
+    self = new ROCmInternal();
+  }
+  return *self ;
+
+}
+
+void ROCmInternal::initialize( int rocm_device_id  )
+{
+  if ( was_finalized ) Kokkos::abort("Calling ROCm::initialize after ROCm::finalize is illegal\n");
+
+  if ( is_initialized() ) return;
+
+  enum { WordSize = sizeof(size_type) };
+
+  if ( ! HostSpace::execution_space::is_initialized() ) {
+    const std::string msg("ROCm::initialize ERROR : HostSpace::execution_space is not initialized");
+    throw_runtime_exception( msg );
+  }
+
+  const ROCmInternalDevices & dev_info = ROCmInternalDevices::singleton();
+
+  const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ;
+
+  const bool ok_id   = 1 <= rocm_device_id &&
+                            rocm_device_id < dev_info.m_rocmDevCount ;
+
+  // Need at least a GPU device
+
+  const bool ok_dev = ok_id &&
+    ( 1 <= dev_info.m_rocmProp[ rocm_device_id ].major &&
+      0 <= dev_info.m_rocmProp[ rocm_device_id ].minor );
+  if ( ok_init && ok_dev ) {
+
+    const struct rocmDeviceProp & rocmProp =
+      dev_info.m_rocmProp[ rocm_device_id ];
+
+    m_rocmDev = rocm_device_id ;
+
+//  rocmSetDevice( m_rocmDev ) );
+    Kokkos::Impl::rocm_device_synchronize();
+
+/*
+    // Query what compute capability architecture a kernel executes:
+    m_rocmArch = rocm_kernel_arch();
+    if ( m_rocmArch != rocmProp.major * 100 + rocmProp.minor * 10 ) {
+      std::cerr << "Kokkos::Experimental::ROCm::initialize WARNING: running kernels compiled for compute capability "
+                << ( m_rocmArch / 100 ) << "." << ( ( m_rocmArch % 100 ) / 10 )
+                << " on device with compute capability "
+                << rocmProp.major << "." << rocmProp.minor
+                << " , this will likely reduce potential performance."
+                << std::endl ;
+    }
+*/
+    // number of multiprocessors
+
+    m_multiProcCount = rocmProp.multiProcessorCount ;
+
+    //----------------------------------
+    // Maximum number of wavefronts,
+    // at most one workgroup per thread in a workgroup for reduction.
+
+
+    m_maxSharedWords = rocmProp.sharedMemPerWavefront/ WordSize ;
+
+    //----------------------------------
+    // Maximum number of Workgroups:
+
+    m_maxWorkgroup = 5*rocmProp.multiProcessorCount;  //TODO: confirm usage and value
+
+    //----------------------------------
+    // Multiblock reduction uses scratch flags for counters
+    // and scratch space for partial reduction values.
+    // Allocate some initial space.  This will grow as needed.
+
+    {
+      const unsigned reduce_block_count = m_maxWorkgroup * Impl::ROCmTraits::WorkgroupSize ;
+
+      (void) scratch_flags( reduce_block_count * 2  * sizeof(size_type) );
+      (void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
+    }
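+    // Worked example (hedged; the real numbers depend on the device): with 40
+    // compute units, m_maxWorkgroup = 5*40 = 200 and WorkgroupSize = 64, so
+    // reduce_block_count = 12800 and the initial requests are
+    // 12800*2*sizeof(size_type) and 12800*16*sizeof(size_type) bytes.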
+    //----------------------------------
+
+  }
+  else {
+
+    std::ostringstream msg ;
+    msg << "Kokkos::Experimental::ROCm::initialize(" << rocm_device_id << ") FAILED" ;
+
+    if ( ! ok_init ) {
+      msg << " : Already initialized" ;
+    }
+    if ( ! ok_id ) {
+      msg << " : Device identifier out of range "
+          << "[0.." << (dev_info.m_rocmDevCount-1) << "]" ;
+    }
+    else if ( ! ok_dev ) {
+      msg << " : Device " ;
+      msg << dev_info.m_rocmProp[ rocm_device_id ].major ;
+      msg << "." ;
+      msg << dev_info.m_rocmProp[ rocm_device_id ].minor ;
+      msg << " Need at least a GPU" ;
+      msg << std::endl;
+    }
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+
+  // Init the array for used for arbitrarily sized atomics
+  Kokkos::Impl::init_lock_arrays_rocm_space();
+
+//  Kokkos::Impl::ROCmLockArraysStruct locks;
+//  locks.atomic = atomic_lock_array_rocm_space_ptr(false);
+//  locks.scratch = scratch_lock_array_rocm_space_ptr(false);
+//  locks.threadid = threadid_lock_array_rocm_space_ptr(false);
+//  rocmMemcpyToSymbol( kokkos_impl_rocm_lock_arrays , & locks , sizeof(ROCmLockArraysStruct) );
+}
+
+//----------------------------------------------------------------------------
+
+typedef Kokkos::Experimental::ROCm::size_type ScratchGrain[ Impl::ROCmTraits::WorkgroupSize ] ;
+enum { sizeScratchGrain = sizeof(ScratchGrain) };
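+// A "grain" is one size_type slot per thread of a workgroup, so (assuming a
+// 4-byte size_type and WorkgroupSize = 64) sizeScratchGrain would be 256
+// bytes; scratch_flags()/scratch_space() below round requests up to grains.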
+
+void rocmMemset(  Kokkos::Experimental::ROCm::size_type * ptr ,  Kokkos::Experimental::ROCm::size_type value , Kokkos::Experimental::ROCm::size_type size)
+{
+char * mptr = (char * ) ptr;
+#if 0
+   parallel_for_each(hc::extent<1>(size),
+                    [=, &ptr]
+                    (hc::index<1> idx) __HC__
+   {
+      int i = idx[0];
+      ptr[i] = value;
+   }).wait();
+#else
+   for (int i= 0; i<size ; i++)
+   {
+     mptr[i] = (char) value;
+   }
+#endif
+}
+
+Kokkos::Experimental::ROCm::size_type *
+ROCmInternal::scratch_flags( const Kokkos::Experimental::ROCm::size_type size )
+{
+  if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) {
+
+
+    m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+
+    typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
+
+    Record * const r = Record::allocate( Kokkos::HostSpace()
+                                       , "InternalScratchFlags"
+                                       , ( sizeScratchGrain  * m_scratchFlagsCount ) );
+
+    Record::increment( r );
+
+    m_scratchFlags = reinterpret_cast<size_type *>( r->data() );
+
+    rocmMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain );
+  }
+
+  return m_scratchFlags ;
+}
+
+Kokkos::Experimental::ROCm::size_type *
+ROCmInternal::scratch_space( const Kokkos::Experimental::ROCm::size_type size )
+{
+  if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) {
+
+    m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+
+     typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
+
+     Record * const r = Record::allocate( Kokkos::HostSpace()
+                                        , "InternalScratchSpace"
+                                        , ( sizeScratchGrain  * m_scratchSpaceCount ) );
+
+     Record::increment( r );
+
+     m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
+  }
+
+  return m_scratchSpace ;
+}
+
+//----------------------------------------------------------------------------
+
+void ROCmInternal::finalize()
+{
+  Kokkos::Impl::rocm_device_synchronize();
+  was_finalized = 1;
+  if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
+
+//    atomic_lock_array_rocm_space_ptr(false);
+//    scratch_lock_array_rocm_space_ptr(false);
+//    threadid_lock_array_rocm_space_ptr(false);
+
+    typedef Kokkos::Impl::SharedAllocationRecord< HostSpace > RecordROCm ;
+    typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace > RecordHost ;
+
+    RecordROCm::decrement( RecordROCm::get_record( m_scratchFlags ) );
+    RecordROCm::decrement( RecordROCm::get_record( m_scratchSpace ) );
+
+    m_rocmDev             = -1 ;
+    m_multiProcCount      = 0 ;
+    m_maxWorkgroup        = 0 ;
+    m_maxSharedWords      = 0 ;
+    m_scratchSpaceCount   = 0 ;
+    m_scratchFlagsCount   = 0 ;
+    m_scratchSpace        = 0 ;
+    m_scratchFlags        = 0 ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+Kokkos::Experimental::ROCm::size_type rocm_internal_cu_count()
+{ return ROCmInternal::singleton().m_multiProcCount ; }
+
+Kokkos::Experimental::ROCm::size_type rocm_internal_maximum_extent_size()
+{ return ROCmInternal::singleton().m_maxWorkgroup ; }
+
+Kokkos::Experimental::ROCm::size_type rocm_internal_maximum_shared_words()
+{ return ROCmInternal::singleton().m_maxSharedWords ; }
+
+Kokkos::Experimental::ROCm::size_type * rocm_internal_scratch_space( const Kokkos::Experimental::ROCm::size_type size )
+{ return ROCmInternal::singleton().scratch_space( size ); }
+
+Kokkos::Experimental::ROCm::size_type * rocm_internal_scratch_flags( const Kokkos::Experimental::ROCm::size_type size )
+{ return ROCmInternal::singleton().scratch_flags( size ); }
+
+
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+//ROCm::size_type ROCm::detect_device_count()
+//{ return Impl::ROCmInternalDevices::singleton().m_rocmDevCount ; }
+
+int ROCm::concurrency() {
+#if defined(KOKKOS_ARCH_KAVERI) 
+  return 8*64*40;  // 20480 kaveri
+#else
+  return 32*8*40;  // fiji and hawaii
+#endif
+}
+int ROCm::is_initialized()
+{ return Kokkos::Impl::ROCmInternal::singleton().is_initialized(); }
+
+void ROCm::initialize( const ROCm::SelectDevice config )
+{
+  Kokkos::Impl::ROCmInternal::singleton().initialize( config.rocm_device_id );
+
+  #if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+  #endif
+}
+
+#if 0
+std::vector<unsigned>
+ROCm::detect_device_arch()
+{
+  const Impl::ROCmInternalDevices & s = Impl::ROCmInternalDevices::singleton();
+
+  std::vector<unsigned> output( s.m_rocmDevCount );
+
+  for ( int i = 0 ; i < s.m_rocmDevCount ; ++i ) {
+    output[i] = s.m_rocmProp[i].major * 100 + s.m_rocmProp[i].minor ;
+  }
+
+  return output ;
+}
+
+ROCm::size_type ROCm::device_arch()
+{
+  return 1 ;
+}
+#endif
+
+void ROCm::finalize()
+{
+  Kokkos::Impl::ROCmInternal::singleton().finalize();
+
+  #if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+  #endif
+}
+
+ROCm::ROCm()
+  : m_device( Kokkos::Impl::ROCmInternal::singleton().m_rocmDev )
+{
+  Kokkos::Impl::ROCmInternal::singleton().verify_is_initialized( "ROCm instance constructor" );
+}
+
+bool ROCm::isAPU(int device) {
+  const Kokkos::Impl::ROCmInternalDevices & dev_info = 
+              Kokkos::Impl::ROCmInternalDevices::singleton();
+  return (dev_info.m_rocmProp[device].APU);  
+}
+
+bool ROCm::isAPU() {
+  return ROCm::isAPU(rocm_device());  
+}
+
+//ROCm::ROCm( const int instance_id )
+//  : m_device( Impl::ROCmInternal::singleton().m_rocmDev )
+//{}
+
+void ROCm::print_configuration( std::ostream & s , const bool )
+{ Kokkos::Impl::ROCmInternal::singleton().print_configuration( s ); }
+
+bool ROCm::sleep() { return false ; }
+
+bool ROCm::wake() { return true ; }
+
+void ROCm::fence()
+{
+  Kokkos::Impl::rocm_device_synchronize();
+}
+
+const char* ROCm::name() { return "ROCm"; }
+
+} // namespace Experimental
+} // namespace Kokkos
+
+#endif // KOKKOS_ENABLE_ROCM
+//----------------------------------------------------------------------------
+
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Invoke.hpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Invoke.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b18e5f61f17f8cc62610b6eff11e35e8a460960d
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Invoke.hpp
@@ -0,0 +1,138 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <type_traits>
+#include <Kokkos_Macros.hpp>
+
+#if !defined( KOKKOS_ROCM_INVOKE_H )
+#define KOKKOS_ROCM_INVOKE_H
+
+namespace Kokkos {
+namespace Impl {
+
+template<class Tag, class F, class... Ts, typename std::enable_if<(!std::is_void<Tag>()), int>::type = 0>
+KOKKOS_INLINE_FUNCTION void rocm_invoke(F&& f, Ts&&... xs)
+{
+  f(Tag(), static_cast<Ts&&>(xs)...);
+}
+
+template<class Tag, class F, class... Ts, typename std::enable_if<(std::is_void<Tag>()), int>::type = 0>
+KOKKOS_INLINE_FUNCTION void rocm_invoke(F&& f, Ts&&... xs)
+{
+  f(static_cast<Ts&&>(xs)...);
+}
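+
+// Example (hedged sketch): the two overloads implement tag dispatch.
+//
+//   struct MyTag {};                                  // hypothetical tag
+//   struct F {
+//     KOKKOS_INLINE_FUNCTION void operator()(MyTag, int i) const { (void)i; }
+//   };
+//   F f;
+//   rocm_invoke<MyTag>(f, 0);           // calls f(MyTag(), 0)
+//   rocm_invoke<void>([](int){}, 0);    // calls the callable without a tag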
+
+
+template<class F, class Tag=void>
+struct rocm_invoke_fn
+{
+    F* f;
+    rocm_invoke_fn(F& f_) : f(&f_)
+    {}
+
+    template<class... Ts>
+    KOKKOS_INLINE_FUNCTION void operator()(Ts&&... xs) const
+    {
+        rocm_invoke<Tag>(*f, static_cast<Ts&&>(xs)...);
+    }
+};
+
+template<class Tag, class F>
+KOKKOS_INLINE_FUNCTION rocm_invoke_fn<F, Tag> make_rocm_invoke_fn(F& f)
+{
+    return {f};
+}
+
+template<class T>
+KOKKOS_INLINE_FUNCTION T& rocm_unwrap(T& x)
+{
+    return x;
+}
+
+template<class T>
+KOKKOS_INLINE_FUNCTION T& rocm_unwrap(std::reference_wrapper<T> x)
+{
+    return x;
+}
+
+template<class F, class T>
+struct rocm_capture_fn
+{
+    F f;
+    T data;
+
+    KOKKOS_INLINE_FUNCTION rocm_capture_fn(F f_, T x) 
+    : f(f_), data(x)
+    {}
+
+    template<class... Ts>
+    KOKKOS_INLINE_FUNCTION void operator()(Ts&&... xs) const
+    {
+        f(rocm_unwrap(data), static_cast<Ts&&>(xs)...);
+    }
+};
+
+template<class F, class T>
+KOKKOS_INLINE_FUNCTION rocm_capture_fn<F, T> rocm_capture(F f, T x)
+{
+    return {f, x};
+}
+
+template<class F, class T, class U, class... Ts>
+KOKKOS_INLINE_FUNCTION auto rocm_capture(F f, T x, U y, Ts... xs) -> decltype(rocm_capture(rocm_capture(f, x), y, xs...))
+{
+    return rocm_capture(rocm_capture(f, x), y, xs...);
+}
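+
+// Example (hedged): captured arguments are prepended, in order, before the
+// call-site arguments, so with hypothetical f, a, b:
+//
+//   auto g = rocm_capture(f, a, b);
+//   g(i);                             // invokes f(a, b, i)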
+
+struct rocm_apply_op
+{
+    template<class F, class... Ts>
+    KOKKOS_INLINE_FUNCTION void operator()(F&& f, Ts&&... xs) const
+    {
+        f(static_cast<Ts&&>(xs)...);
+    }
+};
+
+}}
+
+#endif
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Join.hpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Join.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..eccba5119a2aec0917d12d8d94234993b480f19c
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Join.hpp
@@ -0,0 +1,72 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if !defined( KOKKOS_ROCM_JOIN_H )
+#define KOKKOS_ROCM_JOIN_H
+
+namespace Kokkos {
+namespace Impl {
+
+
+// Adaptor to use ValueJoin with standard algorithms
+template<class Joiner, class F>
+struct join_operator
+{
+  const F* fp;
+  template<class T, class U>
+  T operator()(T x, const U& y) const
+  {
+    Joiner::join(*fp, &x, &y);
+    return x;
+  }
+};
+
+template<class Joiner, class F>
+join_operator<Joiner, F> make_join_operator(const F& f)
+{
+  return join_operator<Joiner, F>{&f};
+}
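+
+// Illustrative sketch (assumption, not original code): make_join_operator adapts a
+// functor's ValueJoin so the join can be used with standard algorithms, e.g. with
+// Kokkos::Impl::FunctorValueJoin (names here are assumptions about the surrounding
+// Kokkos implementation):
+//
+//   auto op  = make_join_operator< FunctorValueJoin<F, void> >(f);
+//   auto sum = std::accumulate(first, last, init, op);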
+
+}}
+
+#endif
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Parallel.hpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Parallel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7ca9b149a61da248163f8b420c53f64a6840e80f
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Parallel.hpp
@@ -0,0 +1,1544 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <algorithm>
+#include <typeinfo>
+#include <ROCm/Kokkos_ROCm_Reduce.hpp>
+#include <ROCm/Kokkos_ROCm_Scan.hpp>
+#include <ROCm/Kokkos_ROCm_Exec.hpp>
+#include <ROCm/Kokkos_ROCm_Vectorization.hpp>
+#include <ROCm/KokkosExp_ROCm_IterateTile_Refactor.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+
+namespace Kokkos {
+namespace Impl {
+
+struct ROCmTeamMember ;
+
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::Experimental::ROCm, Properties ... >: public PolicyTraits<Properties ...> {
+private:
+  int m_league_size ;
+  int m_team_size ;
+  int m_vector_length ;
+  int m_team_scratch_size[2] ;
+  int m_thread_scratch_size[2] ;
+  int m_chunk_size ;
+
+
+public:
+
+  using execution_policy = TeamPolicyInternal ;
+  using execution_space  = Kokkos::Experimental::ROCm ;
+  typedef PolicyTraits<Properties ... > traits;
+
+  TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
+    m_league_size = p.m_league_size;
+    m_team_size = p.m_team_size;
+    m_vector_length = p.m_vector_length;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
+    m_chunk_size = p.m_chunk_size;
+    return *this;
+  }
+
+  TeamPolicyInternal()
+    : m_league_size( 0 )
+    , m_team_size( 0 )
+    , m_vector_length( 0 )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 64 )
+   {}
+
+  TeamPolicyInternal( const int arg_league_size
+            , const int arg_team_size )
+    : m_league_size( arg_league_size ),
+      m_team_size( arg_team_size ),
+      m_vector_length( 1 )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 64 )
+    {}
+
+  TeamPolicyInternal( const int arg_league_size
+            , const int arg_team_size
+            , const int vector_length_request=1)
+    : m_league_size( arg_league_size ),
+      m_team_size( arg_team_size ),
+      m_vector_length (vector_length_request)
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 64 )
+    {}
+
+  TeamPolicyInternal( const int arg_league_size
+            , const Kokkos::AUTO_t )
+    : m_league_size( arg_league_size ), m_team_size( -1 )
+    , m_vector_length( 1 )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 64 )
+    {}
+
+  TeamPolicyInternal( const int arg_league_size
+            , const Kokkos::AUTO_t
+            , const int vector_length_request)
+    : m_league_size( arg_league_size ),
+      m_team_size( -1 ),
+      m_vector_length (vector_length_request)
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 64 )
+    {}
+
+  inline int chunk_size() const { return m_chunk_size ; }
+
+  /** \brief set chunk_size to a discrete value*/
+  KOKKOS_INLINE_FUNCTION TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
+    TeamPolicyInternal p = *this;
+    p.m_chunk_size = chunk_size_;
+    return p;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
+    TeamPolicyInternal p = *this;
+    p.m_team_scratch_size[level] = per_team.value;
+    return p;
+  };
+
+  /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal p = *this;
+    p.m_thread_scratch_size[level] = per_thread.value;
+    return p;
+  };
+
+  /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal p = *this;
+    p.m_team_scratch_size[level] = per_team.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
+    return p;
+  };
+
+protected:
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal internal_set_chunk_size(typename traits::index_type chunk_size_) {
+    m_chunk_size = chunk_size_;
+    return *this;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerTeamValue& per_team) {
+    m_team_scratch_size[level] = per_team.value;
+    return *this;
+  };
+
+  /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerThreadValue& per_thread) {
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  };
+
+  /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) {
+    m_team_scratch_size[level] = per_team.value;
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  };
+
+public:
+// TODO:  evaluate proper team_size_max requirements
+  template< class Functor_Type>
+  KOKKOS_INLINE_FUNCTION static
+  int team_size_max( const Functor_Type & functor)
+  {
+    typedef typename Kokkos::Impl::FunctorValueTraits<Functor_Type, void>::value_type value_type;
+    return team_size_recommended(functor);
+    // return std::min(Kokkos::Impl::get_max_tile_size() / sizeof(value_type), Kokkos::Impl::get_max_tile_thread());
+  }
+
+  template< class Functor_Type>
+  KOKKOS_INLINE_FUNCTION static int team_size_recommended(const Functor_Type & functor)
+  { return Kokkos::Impl::get_tile_size<typename Kokkos::Impl::FunctorValueTraits<Functor_Type, void>::value_type>(); }
+
+  template< class Functor_Type >
+  KOKKOS_INLINE_FUNCTION static int team_size_recommended(const Functor_Type &functor, const int vector_length)
+ {
+   int max = team_size_recommended( functor )/vector_length;
+   if(max < 1) max = 1;
+   return(max);
+ }
+
+  template<class F>
+  KOKKOS_INLINE_FUNCTION int team_size(const F& f) const { return (m_team_size > 0) ? m_team_size : team_size_recommended(f); }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return (m_team_size > 0) ? m_team_size : Impl::get_max_tile_thread(); }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+
+
+  inline int vector_length()   const { return m_vector_length ; }
+  inline int scratch_size(int level, int team_size_ = -1) const {
+    if(team_size_<0) team_size_ = m_team_size;
+    return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level];
+  }
+  inline size_t team_scratch_size(int level) const {
+    return m_team_scratch_size[level];
+  }
+  inline size_t thread_scratch_size(int level) const {
+    return m_thread_scratch_size[level];
+  }
+
+  typedef Impl::ROCmTeamMember member_type;
+};
+
+  struct ROCmTeamMember {
+    typedef Kokkos::Experimental::ROCm                             execution_space ;
+    typedef Kokkos::ScratchMemorySpace<Kokkos::Experimental::ROCm> scratch_memory_space ;
+
+    KOKKOS_INLINE_FUNCTION
+    const scratch_memory_space & team_shmem() const 
+      { return m_team_shared.set_team_thread_mode(0,1,0); }
+    KOKKOS_INLINE_FUNCTION
+    const execution_space::scratch_memory_space & team_scratch(const int& level) const
+      { return m_team_shared.set_team_thread_mode(level,1,0) ; }
+    KOKKOS_INLINE_FUNCTION
+    const execution_space::scratch_memory_space & thread_scratch(const int& level) const
+      { return m_team_shared.set_team_thread_mode(level,
+                                             team_size(),
+                                             team_rank()) ; }
+
+
+    /* Rank of this team within the league of teams */
+    KOKKOS_INLINE_FUNCTION int league_rank() const { return m_idx.tile[0]; }
+    /* Number of teams in the league */
+    KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; }
+    /* Rank of this thread within this team */
+    KOKKOS_INLINE_FUNCTION int team_rank() const { return m_idx.local[0] / m_vector_length; }
+    /* Rank of this vector lane within this thread */
+    KOKKOS_INLINE_FUNCTION int vector_rank() const { return m_idx.local[0] % m_vector_length; }
+    KOKKOS_INLINE_FUNCTION int lindex() const { return m_idx.local[0]; }
+    KOKKOS_INLINE_FUNCTION int gindex() const { return m_idx.global[0]; }
+    KOKKOS_INLINE_FUNCTION int tindex() const { return m_idx.tile[0]; }
+    KOKKOS_INLINE_FUNCTION int tile_dim() const { return m_idx.tile_dim[0]; }
+    KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; }
+    KOKKOS_INLINE_FUNCTION int vector_length() const { return m_vector_length; }
+
+
+    KOKKOS_INLINE_FUNCTION
+    ROCmTeamMember( const hc::tiled_index< 1 > & arg_idx, int league_size_,int team_size_ )
+      : m_league_size( league_size_ )
+      , m_team_size( team_size_ )
+      , m_team_shared( nullptr, 0 )
+      , m_vector_length( 1 )
+      , m_idx( arg_idx )
+      {}
+
+    KOKKOS_INLINE_FUNCTION
+    ROCmTeamMember( const hc::tiled_index< 1 > & arg_idx, int league_size_,int team_size_, char * shared,  std::size_t shsize, std::size_t scratch_size0, char * scratch_ptr, std::size_t scratch_size1, std::size_t vector_length)
+      : m_league_size( league_size_ )
+      , m_team_size( team_size_ )
+      , m_team_shared( shared +  
+                          arg_idx.tile[0]*(shsize+scratch_size0), 
+                       (shsize+scratch_size0)*league_size_, 
+                       scratch_ptr + arg_idx.tile[0]*scratch_size1, 
+                       scratch_size1*league_size_)
+      , m_vector_length( vector_length )
+      , m_idx( arg_idx )
+      {}
+
+    KOKKOS_INLINE_FUNCTION
+    void team_barrier() const {
+      m_idx.barrier.wait();
+    }
+
+    template<class ValueType>
+    KOKKOS_INLINE_FUNCTION
+    void team_broadcast(ValueType& value, const int& thread_id ) const
+    {
+      static_assert(std::is_trivially_default_constructible<ValueType>(), "Only trivially constructible types can be broadcast");
+      tile_static ValueType local_value;
+      zero_init(local_value);
+      if (this->team_rank() == thread_id) {
+        local_value = value;
+      }
+      this->team_barrier();
+      value = local_value;
+    }
+// Reduce across a team of threads.
+//
+// Each thread has vector_length elements.
+// This reduction is for TeamThreadRange operations, where the range
+// is spread across threads.  Effectively, there are vector_length
+// independent reduction operations.
+// This is different from a reduction across the elements of a thread,
+// which reduces every vector element.
+
+    template< class ValueType, class JoinOp >
+    KOKKOS_INLINE_FUNCTION
+    ValueType team_reduce( const ValueType & value , const JoinOp & op_in) const
+    {
+      typedef JoinLambdaAdapter<ValueType,JoinOp> JoinOpFunctor ;
+      const JoinOpFunctor op(op_in);
+
+      tile_static ValueType buffer[512];
+      const auto local = lindex();
+      const auto team  = team_rank();
+      auto vector_rank = local%m_vector_length;
+      auto thread_base = team*m_vector_length;
+
+      const std::size_t size = next_pow_2(m_team_size+1)/2;
+#if defined(ROCM15)
+      buffer[local] = value;
+#else
+        // ROCM 1.5 handles address spaces better, previous version didn't
+      lds_for(buffer[local], [&](ValueType& x)
+      {
+          x = value;
+      });
+#endif
+      m_idx.barrier.wait();
+
+      for(std::size_t s = 1; s < size; s *= 2)
+      {
+          const std::size_t index = 2 * s * team;
+          if (index < size)
+          {
+#if defined(ROCM15)
+                op.join(buffer[vector_rank+index*m_vector_length],
+                        buffer[vector_rank+(index+s)*m_vector_length]);
+#else
+              lds_for(buffer[vector_rank+index*m_vector_length], [&](ValueType& x)
+              {
+                  lds_for(buffer[vector_rank+(index+s)*m_vector_length],
+                                [&](ValueType& y)
+                  {
+                      op.join(x, y);
+                  });
+              });
+#endif
+          }
+          m_idx.barrier.wait();
+      }
+
+      if (local == 0)
+      {
+          for(int i=size*m_vector_length; i<m_team_size*m_vector_length; i+=m_vector_length)
+#if defined(ROCM15)
+              op.join(buffer[vector_rank], buffer[vector_rank+i]);
+#else
+              lds_for(buffer[vector_rank], [&](ValueType& x)
+              {
+                  lds_for(buffer[vector_rank+i],
+                                [&](ValueType& y)
+                  {
+                      op.join(x, y);
+                  });
+              });
+#endif
+      }
+      m_idx.barrier.wait();
+
+      return buffer[0];
+    }
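+
+    // Illustrative sketch (assumption, not original code): inside a team
+    // functor, reduce one value per thread across the team with an additive
+    // join (compute_contribution is a hypothetical placeholder):
+    //
+    //   double local = compute_contribution(member.team_rank());
+    //   double total = member.team_reduce(local, Impl::JoinAdd<double>());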
+
+// Reduce across a team of threads, with a reducer data type
+//
+// Each thread has vector_length elements.
+// This reduction is for TeamThreadRange operations, where the range
+// is spread across threads.  Effectively, there are vector_length
+// independent reduction operations.
+// This is different from a reduction across the elements of a thread,
+// which reduces every vector element.
+
+    template< class ReducerType >
+    KOKKOS_INLINE_FUNCTION
+    typename std::enable_if< is_reducer< ReducerType >::value >::type
+    team_reduce( const ReducerType & reducer) const
+    {
+      typedef typename ReducerType::value_type value_type ;
+
+      tile_static value_type buffer[512];
+      const auto local = lindex();
+      const auto team  = team_rank();
+      auto vector_rank = local%m_vector_length;
+      auto thread_base = team*m_vector_length;
+
+      const std::size_t size = next_pow_2(m_team_size+1)/2;
+#if defined(ROCM15)
+      buffer[local] = reducer.reference();
+#else
+        // ROCM 1.5 handles address spaces better, previous version didn't
+      lds_for(buffer[local], [&](value_type& x)
+      {
+          x = reducer.reference();
+      });
+#endif
+      m_idx.barrier.wait();
+
+      for(std::size_t s = 1; s < size; s *= 2)
+      {
+          const std::size_t index = 2 * s * team;
+          if (index < size)
+          {
+#if defined(ROCM15)
+                reducer.join(buffer[vector_rank+index*m_vector_length],
+                        buffer[vector_rank+(index+s)*m_vector_length]);
+#else
+              lds_for(buffer[vector_rank+index*m_vector_length], [&](value_type& x)
+              {
+                  lds_for(buffer[vector_rank+(index+s)*m_vector_length],
+                                [&](value_type& y)
+                  {
+                      reducer.join(x, y);
+                  });
+              });
+#endif
+          }
+          m_idx.barrier.wait();
+      }
+
+      if (local == 0)
+      {
+          for(int i=size*m_vector_length; i<m_team_size*m_vector_length; i+=m_vector_length)
+#if defined(ROCM15)
+              reducer.join(buffer[vector_rank], buffer[vector_rank+i]);
+#else
+              lds_for(buffer[vector_rank], [&](value_type& x)
+              {
+                  lds_for(buffer[vector_rank+i],
+                                [&](value_type& y)
+                  {
+                      reducer.join(x, y);
+                  });
+              });
+#endif
+      }
+      m_idx.barrier.wait();
+    }
+
+    /** \brief  Intra-team vector reduce 
+     *          with intra-team non-deterministic ordering accumulation.
+     *
+     *  The intra-team accumulation value will, at the end of the
+     *  league's parallel execution, be the reduction's total.
+     *  Parallel execution ordering of the league's teams is non-deterministic.
+     *  As such the base value for each team's vector reduce operation is
+     *  similarly non-deterministic.
+     */
+    template< class ValueType, class JoinOp >
+    KOKKOS_INLINE_FUNCTION
+    ValueType thread_reduce( const ValueType & value , const JoinOp & op_in) const
+    {
+      typedef JoinLambdaAdapter<ValueType,JoinOp> JoinOpFunctor ;
+      const JoinOpFunctor op(op_in);
+
+      const auto local = m_idx.local[0];
+      tile_static ValueType buffer[512];
+      const std::size_t size = m_vector_length; //vector length must be power of 2
+      auto vector_rank = local%m_vector_length;
+      auto thread_base = team_rank()*m_vector_length;
+      lds_for(buffer[local], [&](ValueType& x)
+      {
+          x = value;
+      });
+      m_idx.barrier.wait();
+      for(std::size_t s = 1; s < size; s *= 2)
+      {
+          const std::size_t index = 2 * s * vector_rank;
+          if (index < size)
+          {
+#if defined(ROCM15)
+              op.join(buffer[thread_base+index], buffer[thread_base+index+s]);
+#else
+
+              lds_for(buffer[thread_base+index], [&](ValueType& x)
+              {
+                  lds_for(buffer[thread_base+index+s], [&](ValueType& y)
+                  {
+                      op.join(x, y);
+                  });
+              });
+#endif
+          }
+          m_idx.barrier.wait();
+      }
+
+      m_idx.barrier.wait();
+      return buffer[thread_base];
+    }
+
+  template< typename ReducerType >
+  KOKKOS_INLINE_FUNCTION static
+  typename std::enable_if< is_reducer< ReducerType >::value >::type
+  vector_reduce( ReducerType const & reducer )
+    {
+      #ifdef __HCC_ACCELERATOR__
+      if(blockDim_x == 1) return;
+
+      // Intra vector lane shuffle reduction:
+      typename ReducerType::value_type tmp ( reducer.reference() );
+
+      for ( int i = blockDim_x ; ( i >>= 1 ) ; ) {
+        shfl_down( reducer.reference() , i , blockDim_x );
+        if ( (int)threadIdx_x < i ) { reducer.join( tmp , reducer.reference() ); }
+      }
+
+      // Broadcast from root lane to all other lanes.
+      // Cannot use "butterfly" algorithm to avoid the broadcast
+      // because floating point summation is not associative
+      // and thus different threads could have different results.
+
+      shfl( reducer.reference() , 0 , blockDim_x );
+      #endif
+    }
+
+
+
+    /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+     *          with intra-team non-deterministic ordering accumulation.
+     *
+     *  The global inter-team accumulation value will, at the end of the
+     *  league's parallel execution, be the scan's total.
+     *  Parallel execution ordering of the league's teams is non-deterministic.
+     *  As such the base value for each team's scan operation is similarly
+     *  non-deterministic.
+     */
+    template< typename Type >
+    KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum = nullptr ) const
+    {
+  #if 0
+      const auto local = m_idx.local[0];
+      const auto last = m_team_size - 1;
+      const auto init = 0;
+      tile_static Type buffer[256];
+
+      if (local == last) buffer[0] = init;
+      else buffer[local] = value;
+
+      m_idx.barrier.wait();
+
+      for(std::size_t s = 1; s < m_team_size; s *= 2)
+      {
+          if (local >= s) buffer[local] += buffer[local - s];
+          m_idx.barrier.wait();
+      }
+
+      if ( global_accum )
+      { 
+         if(local == last)
+         {
+            atomic_fetch_add(global_accum, buffer[local] + value);
+         }
+         m_idx.barrier.wait();
+         buffer[local] += *global_accum;
+      }
+      m_idx.barrier.wait();
+      return buffer[local];
+#else
+      tile_static Type sarray[2][256+1];
+      int lid = m_idx.local[0];
+      int lp1 = lid+1;
+
+      int toggle = 1;
+      int _toggle = 0;
+      m_idx.barrier.wait();
+
+      if(lid == 0) 
+      {
+         sarray[1][0] = 0;
+         sarray[0][0] = 0;
+      }
+      sarray[1][lp1] = value;
+
+      m_idx.barrier.wait();
+      for(int stride = 1; stride < m_team_size; stride*=2)
+      {
+         if(lid >= stride)
+         {
+            sarray[_toggle][lp1] =
+                          sarray[toggle][lp1]+sarray[toggle][lp1-stride];
+         }
+         else
+         {
+            sarray[_toggle][lp1] = sarray[toggle][lp1];
+         }
+         toggle = _toggle;
+         _toggle = 1-toggle;
+         m_idx.barrier.wait();
+      }
+
+      if ( global_accum )
+      { 
+         if(m_team_size == lp1)
+         {
+            sarray[toggle][m_team_size] = atomic_fetch_add(global_accum,sarray[toggle][m_team_size]);
+         }
+         m_idx.barrier.wait();
+         sarray[toggle][lid] += sarray[toggle][m_team_size];
+      }
+      m_idx.barrier.wait();
+      return sarray[toggle][lid];
+#endif
+    }
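+
+    // Illustrative sketch (assumption, not original code): exclusive prefix
+    // sum of one value per thread, e.g. turning per-thread counts into
+    // per-thread offsets (thread 0 receives 0, thread r receives the sum of
+    // counts from threads 0..r-1):
+    //
+    //   int offset = member.team_scan(my_count);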
+
+  private:
+    int m_league_size ;
+    int m_team_size ;
+    const scratch_memory_space  m_team_shared;
+
+  public:
+    int m_vector_length;
+    hc::tiled_index<1> m_idx;
+  };
+}
+} // namespace Kokkos
+#include <ROCm/Kokkos_ROCm_ReduceScan.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Traits... >, Kokkos::Experimental::ROCm >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits... > Policy ;
+
+public:
+
+  inline
+  ParallelFor( const FunctorType & f
+             , const Policy      & policy )
+    {
+
+
+      const auto len = policy.end()-policy.begin();
+      const auto offset = policy.begin();
+      if(len == 0) return;
+      // Define a lambda to work around a compiler issue: the compiler does not
+      // properly dereference f inside the parallel_for_each.
+      auto foo = [=](size_t i){ rocm_invoke<typename Policy::work_tag>(f, i); };
+
+#if __hcc_workweek__ > 16600
+      hc::parallel_for_each(hc::extent<1>(len) , [=](const hc::index<1> & idx) [[hc]]  [[hc_max_workgroup_dim(1024,1,1)]]
+#else
+      hc::parallel_for_each(hc::extent<1>(len).tile(256) , [=](const hc::index<1> & idx) [[hc]]
+#endif
+      {
+        if(idx[0]<len)  // workaround for Carrizo (and Fiji?)
+          foo(idx[0] + offset);
+      }).wait();
+
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void execute() const {}
+
+};
+
+// MDRangePolicy impl
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::MDRangePolicy< Traits ... >
+                 , Kokkos::Experimental::ROCm
+                 >
+{
+private:
+  typedef Kokkos::MDRangePolicy< Traits ...  > Policy ;
+  using RP = Policy;
+  typedef typename Policy::array_index_type array_index_type;
+  typedef typename Policy::index_type index_type;
+  typedef typename Policy::launch_bounds LaunchBounds;
+
+
+  const FunctorType m_functor ;
+  const Policy      m_rp ;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION 
+  void operator()(void) const
+    {
+       Kokkos::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
+    }
+
+
+  inline
+  void execute() const
+  {
+    const array_index_type maxblocks = static_cast<array_index_type>(Kokkos::Impl::ROCmTraits::UpperBoundExtentCount);
+    if ( RP::rank == 2 )
+    {
+      const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , 1);
+      const dim3 grid(
+            std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
+          , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
+          , 1 );
+      ROCmParallelLaunch< ParallelFor, LaunchBounds >( *this, grid, block, 0);
+    }
+    else if ( RP::rank == 3 )
+    {
+      const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , m_rp.m_tile[2] );
+      const dim3 grid(
+          std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
+        , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
+        , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1 ) / block.z , maxblocks ));
+      ROCmParallelLaunch< ParallelFor, LaunchBounds >( *this, grid, block, 0);
+    }
+    else if ( RP::rank == 4 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2] , m_rp.m_tile[3] );
+      const dim3 grid(
+          std::min(  m_rp.m_tile_end[0] * m_rp.m_tile_end[1] , maxblocks )
+        , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1 ) / block.y , maxblocks )
+        , std::min( ( m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1 ) / block.z , maxblocks ));
+      ROCmParallelLaunch< ParallelFor, LaunchBounds >( *this, grid, block, 0);
+    }
+    else if ( RP::rank == 5 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4] );
+      const dim3 grid(
+          std::min( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] , maxblocks )
+        , std::min( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] , maxblocks )
+        , std::min( ( m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1 ) / block.z , maxblocks ));
+      ROCmParallelLaunch< ParallelFor, LaunchBounds >( *this, grid, block, 0);
+    }
+    else if ( RP::rank == 6 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4]*m_rp.m_tile[5] );
+      const dim3 grid(
+          std::min( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] , maxblocks )
+        ,  std::min( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] , maxblocks )
+        , std::min( m_rp.m_tile_end[4] * m_rp.m_tile_end[5] , maxblocks ));
+      ROCmParallelLaunch< ParallelFor, LaunchBounds >( *this, grid, block, 0);
+    }
+    else
+    {
+      printf("Kokkos::MDRange Error: Exceeded rank bounds with ROCm\n");
+      Kokkos::abort("Aborting");
+    }
+
+  } //end execute
+
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , Policy arg_policy )
+    : m_functor( arg_functor )
+    , m_rp( arg_policy )
+    {}
+};
+
+//----------------------------------------------------------------------------
+
+template< class F , class... Traits >
+class ParallelFor< F
+                 , Kokkos::TeamPolicy< Traits... >
+                 , Kokkos::Experimental::ROCm >
+{
+  using Policy = Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::ROCm, Traits... >;
+  typedef Kokkos::Impl::FunctorValueTraits<F, typename Policy::work_tag> ValueTraits;
+
+public:
+  inline
+  ParallelFor( const F & f
+             , const Policy      & policy )
+    {
+      const auto league_size  = policy.league_size();
+      const auto team_size    = policy.team_size();
+      const int vector_length = policy.vector_length();
+      const auto total_size   = league_size * team_size * vector_length;
+      const int scratch_size0 = policy.scratch_size(0,team_size);
+      const int scratch_size1 = policy.scratch_size(1,team_size);
+
+      if(total_size == 0) return;
+
+      const auto shared_size = FunctorTeamShmemSize< F >::value( f , team_size );
+      char * scratch = NULL;
+      char * shared = (char *)rocm_device_allocate(shared_size * league_size +
+                                                   scratch_size0*league_size);
+      if(0<scratch_size1)
+        scratch = (char *)rocm_device_allocate(scratch_size1*league_size);
+
+      hc::extent< 1 > flat_extent( total_size );
+
+      hc::tiled_extent< 1 > team_extent = flat_extent.tile(team_size*vector_length);
+      hc::parallel_for_each( team_extent , [=](hc::tiled_index<1> idx) [[hc]]
+      {
+        rocm_invoke<typename Policy::work_tag>(f, typename Policy::member_type(idx, league_size, team_size, shared, shared_size, scratch_size0, scratch, scratch_size1,vector_length));
+      }).wait();
+
+      if(0<scratch_size1)
+        rocm_device_free(scratch);
+      rocm_device_free(shared);
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void execute() const {}
+
+};
+
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class ReducerType, class... Traits >
+class ParallelReduce<
+  FunctorType , Kokkos::RangePolicy< Traits... >, ReducerType, Kokkos::Experimental::ROCm >
+{
+public:
+
+  typedef Kokkos::RangePolicy< Traits... > Policy ;
+
+  // TODO: Use generic lambdas instead
+  struct invoke_fn
+  {
+    template<class F, class... Ts>
+    KOKKOS_INLINE_FUNCTION void operator()(std::size_t size, F&& f, hc::tiled_index<1> idx, tile_desc td, Ts&&... xs) const
+    {
+      auto global = idx.global[0];
+      if (global < size) f(idx.global[0], static_cast<Ts&&>(xs)...);
+    }
+  };
+
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType  & f,
+                  const Policy       & policy,
+                  const ViewType & result_view,
+                  typename std::enable_if<
+                               Kokkos::is_view< ViewType >::value &&
+                              !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    {
+      typedef typename Policy::work_tag Tag;
+      typedef Kokkos::Impl::FunctorValueTraits< FunctorType , Tag > ValueTraits;
+      typedef Kokkos::Impl::FunctorValueInit< FunctorType , Tag > ValueInit;
+      typedef typename ValueTraits::reference_type reference_type;
+
+      const auto total_size = policy.end() - policy.begin();
+
+      if(total_size==0) {
+        if (result_view.data()) {
+           ValueInit::init( f , result_view.data() );
+        }
+        return;
+      }
+
+      Kokkos::Impl::reduce_enqueue< Tag >
+        ( total_size 
+        , f
+        , InvalidType{}
+        , rocm_capture(invoke_fn{}, total_size)
+        , result_view.data()
+        , result_view.extent(0)
+        );
+    }
+
+  inline
+  ParallelReduce( const FunctorType & f,
+                  Policy       policy,
+                  const ReducerType& reducer )
+  {
+      typedef typename Policy::work_tag Tag;
+
+      typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value,
+                                  FunctorType, ReducerType> ReducerConditional;
+      typedef typename ReducerConditional::type ReducerTypeFwd;
+      typedef Kokkos::Impl::FunctorValueTraits< FunctorType , Tag > ValueTraits;
+      typedef Kokkos::Impl::FunctorValueInit< ReducerType, Tag > ValueInit ;
+
+      typedef typename ValueTraits::reference_type reference_type;
+
+      const auto total_size = policy.end() - policy.begin();
+
+      if(total_size==0) {
+        if (reducer.view().data()) {
+           ValueInit::init( ReducerConditional::select(f,reducer), 
+                            reducer.view().data() );
+        }
+        return;
+      }
+
+      Kokkos::Impl::reduce_enqueue< Tag >
+        ( total_size 
+        , f
+        , reducer
+        , rocm_capture(invoke_fn{}, total_size)
+        , reducer.view().data()
+        , reducer.view().extent(0)
+        );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void execute() const {}
+
+};
+
+template< class FunctorType, class ReducerType, class... Traits >
+class ParallelReduce<
+   FunctorType , Kokkos::TeamPolicy< Traits... >, ReducerType, Kokkos::Experimental::ROCm >
+{
+  using Policy = Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::ROCm, Traits... >;
+  typedef Kokkos::Impl::FunctorValueTraits<FunctorType, typename Policy::work_tag> ValueTraits;
+
+public:
+
+  struct invoke_fn
+  {
+    template<class Create, class F, class... Ts>
+    KOKKOS_INLINE_FUNCTION void operator()(Create&& create, F&& f, hc::tiled_index<1> idx, tile_desc td, Ts&&... xs) const
+    {
+      f(create(idx, td), static_cast<Ts&&>(xs)...);
+    }
+  };
+
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType  & f,
+                  const Policy       & policy,
+                  const ViewType     & result_view,
+                typename std::enable_if<
+                  Kokkos::is_view< ViewType >::value &&
+                  !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    {
+      const int league_size = policy.league_size();
+      const int team_size = policy.team_size(f);
+      const int vector_length = policy.vector_length();
+      const int scratch_size0 = policy.scratch_size(0,team_size);
+      const int scratch_size1 = policy.scratch_size(1,team_size);
+      const int total_size = league_size * team_size ;
+
+      if(total_size == 0) return;
+
+      const int reduce_size = ValueTraits::value_size( f );
+      const int shared_size = FunctorTeamShmemSize< FunctorType >::value( f , team_size );
+
+      char * shared;
+      char * scratch = NULL;
+
+      shared = (char *)rocm_device_allocate(league_size *
+                             (shared_size + scratch_size0));
+      if(0<scratch_size1)
+        scratch = (char *)rocm_device_allocate(scratch_size1 * league_size);
+
+      auto create_team_member = [=](hc::tiled_index<1> idx, tile_desc td) 
+      { 
+
+        return typename Policy::member_type(idx, league_size, td.team_size, 
+                                          shared, shared_size, scratch_size0,
+                                          scratch, scratch_size1, 
+                                          vector_length); 
+      };
+
+      Kokkos::Impl::reduce_enqueue< typename Policy::work_tag >
+      ( total_size*vector_length
+        , f
+        , InvalidType{}
+        , rocm_capture(invoke_fn{}, create_team_member)
+        , result_view.ptr_on_device()
+        , result_view.dimension_0()
+        , team_size 
+        , vector_length 
+        , shared_size
+      );
+
+      if(0<scratch_size1)
+        rocm_device_free(scratch);
+      rocm_device_free(shared);
+    }
+
+  inline
+  ParallelReduce( const FunctorType & f,
+                  Policy       policy,
+                  const ReducerType& reducer )
+  {
+      const int league_size = policy.league_size();
+      const int team_size = policy.team_size(f);
+      const int vector_length = policy.vector_length();
+      const int total_size = league_size * team_size;
+
+      if(total_size == 0) return;
+
+      const int reduce_size = ValueTraits::value_size( f );
+      const int shared_size = FunctorTeamShmemSize< FunctorType >::value( f , team_size );
+      const int scratch_size0 = policy.scratch_size(0,team_size);
+      const int scratch_size1 = policy.scratch_size(1,team_size);
+
+      char * shared;
+      char * scratch = NULL;
+      shared = (char *)rocm_device_allocate((shared_size + scratch_size0) *
+                                            league_size);
+      if(0<scratch_size1)
+        scratch = (char *)rocm_device_allocate(scratch_size1 * league_size);
+
+      auto create_team_member = [=](hc::tiled_index<1> idx, tile_desc td) 
+      { 
+        return typename Policy::member_type(idx, league_size, td.tile_size, shared, shared_size, scratch_size0, scratch, scratch_size1, vector_length); 
+      };
+
+      Kokkos::Impl::reduce_enqueue< typename Policy::work_tag >
+      ( league_size
+        , f
+        , reducer
+        , rocm_capture(invoke_fn{}, create_team_member)
+        , reducer.view().data()
+        , reducer.view().extent(0),team_size,vector_length
+        , shared_size
+     );
+
+      if(0<scratch_size1)
+        rocm_device_free(scratch);
+      rocm_device_free(shared);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void execute() const {}
+
+};
+
+
+template< class FunctorType , class... Traits >
+class ParallelScan< FunctorType , Kokkos::RangePolicy< Traits... >, Kokkos::Experimental::ROCm >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits... > Policy;
+  typedef typename Policy::work_tag Tag;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, Tag>  ValueTraits;
+
+public:
+
+  //----------------------------------------
+
+  inline
+  ParallelScan( const FunctorType & f
+              , const Policy      & policy )
+  {
+    const auto len = policy.end()-policy.begin();
+
+
+    if(len==0) return;
+
+    scan_enqueue<Tag>(len, f, [](hc::tiled_index<1> idx, int, int) { return idx.global[0]; });
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void execute() const {}
+
+  //----------------------------------------
+};
+
+template< class FunctorType , class... Traits>
+class ParallelScan< FunctorType , Kokkos::TeamPolicy< Traits... >, Kokkos::Experimental::ROCm >
+{
+private:
+
+  using Policy = Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::ROCm, Traits... >;
+  typedef typename Policy::work_tag Tag;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, Tag>  ValueTraits;
+
+public:
+
+  //----------------------------------------
+
+  inline
+  ParallelScan( const FunctorType & f
+              , const Policy      & policy )
+  {
+    const auto league_size = policy.league_size();
+    const auto team_size = policy.team_size(f);
+    const auto len  = league_size * team_size;
+      
+    if(len == 0) return;
+
+    scan_enqueue<Tag>(len, f, [&](hc::tiled_index<1> idx, int n_teams, int n_leagues) { return typename Policy::member_type(idx,n_leagues,n_teams); });
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void execute() const {}
+
+  //----------------------------------------
+};
+
+}
+}
+
+namespace Kokkos {
+namespace Impl {
+  template<typename iType>
+  struct TeamThreadRangeBoundariesStruct<iType,ROCmTeamMember> {
+    typedef iType index_type;
+    const iType start;
+    const iType end;
+    const iType increment;
+    const ROCmTeamMember& thread;
+
+#if defined( __HCC_ACCELERATOR__ )
+    KOKKOS_INLINE_FUNCTION
+    TeamThreadRangeBoundariesStruct (const ROCmTeamMember& thread_, const iType& count):
+      start( thread_.team_rank() ),
+      end( count ),
+      increment( thread_.team_size() ),
+      thread(thread_)
+    {}
+    KOKKOS_INLINE_FUNCTION
+    TeamThreadRangeBoundariesStruct (const ROCmTeamMember& thread_,  const iType& begin_, const iType& end_):
+      start( begin_ + thread_.team_rank() ),
+      end( end_ ),
+      increment( thread_.team_size() ),
+      thread(thread_)
+    {}
+#else
+    KOKKOS_INLINE_FUNCTION
+    TeamThreadRangeBoundariesStruct (const ROCmTeamMember& thread_, const iType& count):
+      start( 0 ),
+      end( count ),
+      increment( 1 ),
+      thread(thread_)
+    {}
+    KOKKOS_INLINE_FUNCTION
+    TeamThreadRangeBoundariesStruct (const ROCmTeamMember& thread_,  const iType& begin_, const iType& end_):
+      start( begin_ ),
+      end( end_ ),
+      increment( 1 ),
+      thread(thread_)
+    {}
+#endif
+  };
+  template<typename iType>
+  struct ThreadVectorRangeBoundariesStruct<iType,ROCmTeamMember> {
+    typedef iType index_type;
+    const iType start;
+    const iType end;
+    const iType increment;
+    const ROCmTeamMember& thread;
+
+#if defined( __HCC_ACCELERATOR__ )
+    KOKKOS_INLINE_FUNCTION
+    ThreadVectorRangeBoundariesStruct (const ROCmTeamMember& thread_, const iType& count):
+      start( thread_.lindex()%thread_.vector_length() ),
+      end( count ),
+      increment( thread_.vector_length() ),
+      thread(thread_)
+    {}
+
+//    KOKKOS_INLINE_FUNCTION
+//    ThreadVectorRangeBoundariesStruct (const iType& count):
+//      start( 0 ),
+//      end( count ),
+//      increment( 1 )
+//    {}
+#else
+    KOKKOS_INLINE_FUNCTION
+    ThreadVectorRangeBoundariesStruct (const ROCmTeamMember& thread_, const iType& count):
+      start( 0 ),
+      end( count ),
+      increment( 1 ),
+      thread(thread_)
+    {}
+    KOKKOS_INLINE_FUNCTION
+    ThreadVectorRangeBoundariesStruct (const iType& count):
+      start( 0 ),
+      end( count ),
+      increment( 1 )
+    {}
+#endif
+  };
+
+}
+}
+
+namespace Kokkos {
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROCmTeamMember>
+  TeamThreadRange(const Impl::ROCmTeamMember& thread, const iType& count) {
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROCmTeamMember>(thread,count);
+}
+
+template<typename iType1,typename iType2>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<typename std::common_type< iType1, iType2 >::type,Impl::ROCmTeamMember>
+  TeamThreadRange(const Impl::ROCmTeamMember& thread, const iType1& begin, const iType2& end) {
+  typedef typename std::common_type< iType1, iType2 >::type iType;
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROCmTeamMember>(thread,begin,end);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >
+  ThreadVectorRange(const Impl::ROCmTeamMember& thread, const iType& count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >(thread,count);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::ROCmTeamMember> PerTeam(const Impl::ROCmTeamMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::ROCmTeamMember>(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::ROCmTeamMember> PerThread(const Impl::ROCmTeamMember& thread) {
+  return Impl::VectorSingleStruct<Impl::ROCmTeamMember>(thread);
+}
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::ROCmTeamMember>& single_struct, const FunctorType& lambda) {
+  if(single_struct.team_member.vector_rank()==0) lambda();
+}
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::ROCmTeamMember>& single_struct, const FunctorType& lambda) {
+  if((single_struct.team_member.lindex()==0)) lambda();
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::ROCmTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+#if defined(ROCM15)
+  // 1.5 needs this more proper restriction on which work units run
+  if( single_struct.team_member.vector_rank()==0) lambda(val);
+  val = shfl(val,0,single_struct.team_member.vector_length());
+#else
+  // but older compilers are fine with this (TestTeamVector::Test< Kokkos::Experimental::ROCm >(4))
+  lambda(val);
+#endif
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::ROCmTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  if(single_struct.team_member.lindex()==0) lambda(val);
+  single_struct.team_member.team_broadcast(val,0);
+}
+
+}
+
+namespace Kokkos {
+
+  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+   *
+   * The range i=0..N-1 is mapped to all threads of the calling thread team.
+   * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROCmTeamMember>& loop_boundaries, const Lambda& lambda) {
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
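+
+// Illustrative usage sketch (assumption, not original code): a team-level
+// loop inside a TeamPolicy functor, where `member` is the TeamPolicy's
+// member_type and M is a hypothetical row length:
+//
+//   Kokkos::parallel_for(Kokkos::TeamThreadRange(member, M), [&] (const int i) {
+//     // work for index i, distributed over the threads of the team
+//   });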
+
+/** \brief  Inter-thread thread range parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROCmTeamMember>& loop_boundaries,
+                     const Lambda & lambda, ValueType& result) {
+
+  result = ValueType();
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+  result = loop_boundaries.thread.team_reduce(result,
+                                              Impl::JoinAdd<ValueType>());
+//  Impl::rocm_intra_workgroup_reduction( loop_boundaries.thread, result,
+//               Impl::JoinAdd<ValueType>());
+//  Impl::rocm_inter_workgroup_reduction( loop_boundaries.thread, result,
+//               Impl::JoinAdd<ValueType>());
+}
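+
+// Illustrative sketch (assumption, not original code): a team-level row sum,
+// where A, x and row are hypothetical views/indices captured by the functor:
+//
+//   double row_sum = 0;
+//   Kokkos::parallel_reduce(Kokkos::TeamThreadRange(member, M),
+//     [&] (const int i, double& partial) { partial += A(row, i) * x(i); },
+//     row_sum);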
+
+/** \brief  Inter-thread thread range parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ReducerType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROCmTeamMember>& loop_boundaries,
+                     const Lambda & lambda, ReducerType const & reducer) {
+  reducer.init( reducer.reference() );
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,reducer.reference());
+  }
+  loop_boundaries.thread.team_reduce(reducer);
+}
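+
+// Illustrative sketch (assumption, not original code): the reducer overload,
+// e.g. taking a minimum over a row with Kokkos::Min (A and row are hypothetical):
+//
+//   double row_min;
+//   Kokkos::parallel_reduce(Kokkos::TeamThreadRange(member, M),
+//     [&] (const int i, double& m) { if (A(row, i) < m) m = A(row, i); },
+//     Kokkos::Min<double>(row_min));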
+
+/** \brief  Inter-thread thread range parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into result.
+ * The input value of result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROCmTeamMember>& loop_boundaries,
+                     const Lambda & lambda, const JoinType& join, ValueType& result) {
+
+#if defined(ROCM15)
+  ValueType tmp = result;
+  //  Simpler code works with ROCM1.5
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,tmp);
+  }
+  result = loop_boundaries.thread.team_reduce(tmp,join);
+#else
+  // this workaround freezes up with ROCM1.5, but needed for earlier compilers
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+  result = loop_boundaries.thread.team_reduce(result,join);
+#endif
+//  Impl::rocm_intra_workgroup_reduction( loop_boundaries.thread, result,join);
+//  Impl::rocm_inter_workgroup_reduction( loop_boundaries.thread, result,join);
+}
+
+} //namespace Kokkos
+
+
+namespace Kokkos {
+/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread.
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
+    loop_boundaries, const Lambda& lambda) {
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
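+
+// Illustrative usage sketch (assumption, not original code): a vector-level
+// loop nested inside a TeamThreadRange, distributing K inner iterations over
+// the thread's vector lanes:
+//
+//   Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, K), [&] (const int k) {
+//     // work for index k
+//   });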
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
+      loop_boundaries, const Lambda & lambda, ValueType& result) {
+  result = ValueType();
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+  result = loop_boundaries.thread.thread_reduce(result,Impl::JoinAdd<ValueType>());
+}
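+
+// Illustrative sketch (assumption, not original code): a vector-lane partial
+// dot product, where a and b are hypothetical views captured by the functor:
+//
+//   double partial = 0;
+//   Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(member, K),
+//     [&] (const int k, double& v) { v += a(k) * b(k); }, partial);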
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into result.
+ * The input value of result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& result) {
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);  
+    loop_boundaries.thread.team_barrier();
+  }
+  result = loop_boundaries.thread.thread_reduce(result,join);
+}
+
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread and the reduction
+ * defined by the reducer is performed and stored in reducer.reference(). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ReducerType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
+      loop_boundaries, const Lambda & lambda, ReducerType const & reducer) {
+  reducer.init( reducer.reference() );
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,reducer.reference());
+  }
+  loop_boundaries.thread.vector_reduce(reducer);
+}
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into the reducer's
+ * reference. The input value is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ReducerType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ReducerType const & reducer) {
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,reducer.reference());  
+    loop_boundaries.thread.team_barrier();
+  }
+  reducer.reference() = loop_boundaries.thread.thread_reduce(reducer.reference(),join);
+}
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
+ *          for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
+ * Depending on the target execution space the operator might be called twice: once with final=false
+ * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
+ * "i" needs to be added to val no matter whether final==true or not. In a serial execution
+ * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
+ * to the final sum value over all vector lanes.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
+      loop_boundaries, const FunctorType & lambda) {
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
+  typedef typename ValueTraits::value_type value_type ;
+
+  value_type scan_val = value_type();
+#if (__ROCM_ARCH__ >= 800)
+// adopt the cuda vector shuffle method
+  const int VectorLength = loop_boundaries.increment;
+  int lid = loop_boundaries.thread.lindex();
+  int vector_rank = lid%VectorLength;
+
+  iType loop_bound = ((loop_boundaries.end+VectorLength-1)/VectorLength) * VectorLength;
+  value_type val ;
+  for(int _i = vector_rank; _i < loop_bound; _i += VectorLength) {
+    val = value_type();
+    if(_i<loop_boundaries.end)
+      lambda(_i , val , false);
+
+    value_type tmp = val;
+    value_type result_i;
+
+    if(vector_rank == 0)
+      result_i = tmp;
+    if (VectorLength > 1) {
+      const value_type tmp2 = shfl_up(tmp, 1,VectorLength);
+      if(vector_rank > 0)
+        tmp+=tmp2;
+    }
+    if(vector_rank == 1)
+      result_i = tmp;
+    if (VectorLength > 3) {
+      const value_type tmp2 = shfl_up(tmp, 2,VectorLength);
+      if(vector_rank > 1)
+        tmp+=tmp2;
+    }
+    if ((vector_rank >= 2) &&
+        (vector_rank < 4))
+      result_i = tmp;
+    if (VectorLength > 7) {
+      const value_type tmp2 = shfl_up(tmp, 4,VectorLength);
+      if(vector_rank > 3)
+        tmp+=tmp2;
+    }
+    if ((vector_rank >= 4) &&
+        (vector_rank < 8))
+      result_i = tmp;
+    if (VectorLength > 15) {
+      const value_type tmp2 = shfl_up(tmp, 8,VectorLength);
+      if(vector_rank > 7)
+        tmp+=tmp2;
+    }
+    if ((vector_rank >= 8) &&
+        (vector_rank < 16))
+      result_i = tmp;
+    if (VectorLength > 31) {
+      const value_type tmp2 = shfl_up(tmp, 16,VectorLength);
+      if(vector_rank > 15)
+        tmp+=tmp2;
+    }
+    if ((vector_rank >=16) &&
+        (vector_rank < 32))
+      result_i = tmp;
+    if (VectorLength > 63) {
+      const value_type tmp2 = shfl_up(tmp, 32,VectorLength);
+      if(vector_rank > 31)
+        tmp+=tmp2;
+    }
+
+    if (vector_rank >= 32)
+      result_i = tmp;
+
+    val = scan_val + result_i - val;
+    scan_val += shfl(tmp,VectorLength-1,VectorLength);
+    if(_i<loop_boundaries.end)
+      lambda(_i , val , true);
+  }
+#else
+// For Kaveri (__ROCM_ARCH__ < 800), fall back to the LDS-based team_scan routine.
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,scan_val,true);
+  }
+  scan_val = loop_boundaries.thread.team_scan(scan_val);
+
+#endif
+}
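+
+// A minimal usage sketch for the scan above (illustrative only; `team`, `in`, `out`,
+// and the extent N are assumptions, not part of this header):
+//
+//   parallel_scan(ThreadVectorRange(team, N),
+//                 [=](const int i, double& partial, const bool final) {
+//                     if (final) out(i) = partial;  // exclusive prefix for index i
+//                     partial += in(i);             // contribute regardless of `final`
+//                 });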
+
+} // namespace Kokkos
+
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..293fca2ad76809f0453ddbde7e326f9b418477d8
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp
@@ -0,0 +1,196 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+///////////////////////////////////////////////////////////////////////////////
+// AMP REDUCE
+//////////////////////////////////////////////////////////////////////////////
+
+#if !defined( KOKKOS_ROCM_AMP_REDUCE_INL )
+#define KOKKOS_ROCM_AMP_REDUCE_INL
+
+#include <iostream>
+
+#include <algorithm>
+#include <numeric>
+#include <cmath>
+#include <type_traits>
+#include <ROCm/Kokkos_ROCm_Tile.hpp>
+#include <ROCm/Kokkos_ROCm_Invoke.hpp>
+#include <ROCm/Kokkos_ROCm_Join.hpp>
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace Kokkos {
+namespace Impl {
+
+template<class T>
+T* reduce_value(T* x, std::true_type) [[hc]]
+{
+  return x;
+}
+
+template<class T>
+T& reduce_value(T* x, std::false_type) [[hc]]
+{
+  return *x;
+}
+
+#if KOKKOS_ROCM_HAS_WORKAROUNDS
+struct always_true
+{
+    template<class... Ts>
+    bool operator()(Ts&&...) const
+    {
+        return true;
+    }
+};
+#endif
+
+template< class Tag, class F, class ReducerType, class Invoker, class T >
+void reduce_enqueue(
+  const int szElements,  // size of the extent
+  const F & f,
+  const ReducerType& reducer,
+  Invoker invoke,
+  T * const output_result,
+  int const output_length,
+  const int team_size=64,
+  const int vector_size=1,
+  int const shared_size=0)
+{
+  using namespace hc ;
+
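+  // Overview: each tile initializes a per-lane value, invokes the functor, tree-reduces
+  // within the tile through the LDS-backed tile_buffer, and writes its partial result to
+  // a global array; the host then joins all tile results and applies the functor's final().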
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, F, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, Tag, void >::type TagFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , TagFwd > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , TagFwd >   ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , TagFwd >   ValueJoin ;
+  typedef Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagFwd >       ValueFinal ;
+
+  typedef typename ValueTraits::pointer_type   pointer_type ;
+  typedef typename ValueTraits::reference_type reference_type ;
+
+  if (output_length < 1) return;
+
+  const auto td = get_tile_desc<T>(szElements,output_length,team_size,vector_size, shared_size);
+
+  // allocate host and device memory for the results from each team
+  std::vector<T> result_cpu(td.num_tiles*output_length);
+  hc::array<T> result(td.num_tiles*output_length);
+
+  auto fut = tile_for<T[]>(td, [=,&result](hc::tiled_index<1> t_idx, tile_buffer<T[]> buffer) [[hc]] 
+  {
+      const auto local = t_idx.local[0];
+      const auto global = t_idx.global[0];
+      const auto tile = t_idx.tile[0];
+
+      buffer.action_at(local, [&](T* state)
+      {
+          ValueInit::init(ReducerConditional::select(f, reducer), state);
+          invoke(make_rocm_invoke_fn<Tag>(f), t_idx, td, reduce_value(state, std::is_pointer<reference_type>()));
+      });
+      t_idx.barrier.wait();
+
+      // Reduce within a tile using multiple threads.
+// even though buffer.size() is always 64, the value 64 must be hard-coded below
+// to work around a compiler bug
+//      for(std::size_t s = 1; s < buffer.size(); s *= 2)
+      for(std::size_t s = 1; s < 64; s *= 2)
+      {
+          const std::size_t index = 2 * s * local;
+//          if (index < buffer.size())
+          if (index < 64)
+          {
+              buffer.action_at(index, index + s, [&](T* x, T* y)
+              {
+                  ValueJoin::join(ReducerConditional::select(f, reducer), x, y);
+              });
+          }
+          t_idx.barrier.wait();
+      }
+
+      // Store the tile result in the global memory.
+      if (local == 0)
+      {
+#if KOKKOS_ROCM_HAS_WORKAROUNDS
+          // Workaround for assigning from LDS memory: std::copy should work
+          // directly
+          buffer.action_at(0, [&](T* x)
+          {
+#if ROCM15
+// new ROCM 15 address space changes aren't implemented in std algorithms yet
+              auto * src = reinterpret_cast<char *>(x);
+              auto * dest = reinterpret_cast<char *>(result.data()+tile*output_length);
+              for(int i=0; i<sizeof(T);i++) dest[i] = src[i];
+#else
+              // Workaround: copy_if used to avoid memmove
+              std::copy_if(x, x+output_length, result.data()+tile*output_length, always_true{} );
+#endif
+          });
+#else
+          std::copy(buffer, buffer+output_length, result.data()+tile*output_length);
+
+#endif
+      }
+      
+  });
+  if (output_result != nullptr)
+     ValueInit::init(ReducerConditional::select(f, reducer), output_result);
+  fut.wait();
+
+  copy(result,result_cpu.data());
+  if (output_result != nullptr) {
+    for(std::size_t i=0;i<td.num_tiles;i++)
+       ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);
+
+    ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
+  }
+
+}
+
+}} //end of namespace Kokkos::Impl
+
+#endif /* #if !defined( KOKKOS_ROCM_AMP_REDUCE_INL ) */
+
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_ReduceScan.hpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_ReduceScan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3f67089b9204a3396e6d829a1c82e17dc2a04cc6
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_ReduceScan.hpp
@@ -0,0 +1,605 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_ROCM_REDUCESCAN_HPP
+#define KOKKOS_ROCM_REDUCESCAN_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if ROCm is enabled for Kokkos */
+#if defined( __HCC__ ) && defined( KOKKOS_ENABLE_ROCM )
+
+//#include <utility>
+
+#include <Kokkos_Parallel.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <ROCm/Kokkos_ROCm_Vectorization.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
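+// The rocm_shfl* helpers below move values across lanes: types the size of an int are
+// shuffled directly, while larger (int-multiple) types are shuffled word by word through
+// their int-sized pieces.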
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void rocm_shfl( T & out , T const & in , int lane ,
+  typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
+{
+  *reinterpret_cast<int*>(&out) =
+    __shfl( *reinterpret_cast<int const *>(&in) , lane , width );
+}
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void rocm_shfl( T & out , T const & in , int lane ,
+  typename std::enable_if
+    < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
+    , int >::type width )
+{
+  enum : int { N = sizeof(T) / sizeof(int) };
+
+  for ( int i = 0 ; i < N ; ++i ) {
+    reinterpret_cast<int*>(&out)[i] =
+      __shfl( reinterpret_cast<int const *>(&in)[i] , lane , width );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void rocm_shfl_down( T & out , T const & in , int delta ,
+  typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
+{
+  *reinterpret_cast<int*>(&out) =
+    __shfl_down( *reinterpret_cast<int const *>(&in) , delta , width );
+}
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void rocm_shfl_down( T & out , T const & in , int delta ,
+  typename std::enable_if
+    < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
+    , int >::type width )
+{
+  enum : int { N = sizeof(T) / sizeof(int) };
+
+  for ( int i = 0 ; i < N ; ++i ) {
+    reinterpret_cast<int*>(&out)[i] =
+      __shfl_down( reinterpret_cast<int const *>(&in)[i] , delta , width );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void rocm_shfl_up( T & out , T const & in , int delta ,
+  typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
+{
+  *reinterpret_cast<int*>(&out) =
+    __shfl_up( *reinterpret_cast<int const *>(&in) , delta , width );
+}
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void rocm_shfl_up( T & out , T const & in , int delta ,
+  typename std::enable_if
+    < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
+    , int >::type width )
+{
+  enum : int { N = sizeof(T) / sizeof(int) };
+
+  for ( int i = 0 ; i < N ; ++i ) {
+    reinterpret_cast<int*>(&out)[i] =
+      __shfl_up( reinterpret_cast<int const *>(&in)[i] , delta , width );
+  }
+}
+#if 0
+//----------------------------------------------------------------------------
+/** \brief  Reduce within a workgroup over team.vector_length(), the "vector" dimension.
+ *
+ *  This will be called within a nested, intra-team parallel operation.
+ *  Use shuffle operations to avoid conflicts with shared memory usage.
+ *
+ *  Requires:
+ *    team.vector_length() is power of 2
+ *    team.vector_length() <= 32 (one workgroup)
+ *
+ *  Cannot use "butterfly" pattern because floating point
+ *  addition is non-associative.  Therefore, must broadcast
+ *  the final result.
+ */
+template< class Reducer >
+KOKKOS_INLINE_FUNCTION
+void rocm_intra_workgroup_vector_reduce( Reducer const & reducer )
+{
+  static_assert(
+    std::is_reference< typename Reducer::reference_type >::value , "" );
+
+  if ( 1 < team.vector_length() ) {
+
+    typename Reducer::value_type tmp ;
+
+    for ( int i = team.vector_length() ; ( i >>= 1 ) ; ) {
+
+      rocm_shfl_down( tmp , reducer.reference() , i , team.vector_length() );
+
+      if ( team.vector_rank() < i ) { reducer.join( reducer.data() , & tmp ); }
+    }
+
+    // Broadcast from root "lane" to all other "lanes"
+
+    rocm_shfl( reducer.reference() , reducer.reference() , 0 , team.vector_length() );
+  }
+}
+
+/** \brief  Inclusive scan over team.vector_length(), the "vector" dimension.
+ *
+ *  This will be called within a nested, intra-team parallel operation.
+ *  Use shuffle operations to avoid conflicts with shared memory usage.
+ *
+ *  Algorithm is concurrent bottom-up reductions in triangular pattern
+ *  where each ROCM thread is the root of a reduction tree from the
+ *  zeroth ROCM thread to itself.
+ *
+ *  Requires:
+ *    team.vector_length() is power of 2
+ *    team.vector_length() <= 32 (one workgroup)
+ */
+template< typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void rocm_intra_workgroup_vector_inclusive_scan( ValueType & local )
+{
+  ValueType tmp ;
+
+  // Bottom up:
+  //   [t] += [t-1] if t >= 1
+  //   [t] += [t-2] if t >= 2
+  //   [t] += [t-4] if t >= 4
+  // ...
+
+  for ( int i = 1 ; i < team.vector_length() ; i <<= 1 ) {
+
+    rocm_shfl_up( tmp , local , i , team.vector_length() );
+
+    if ( i <= team.vector_rank() ) { local += tmp ; }
+  }
+}
+#endif
+
+//----------------------------------------------------------------------------
+/*
+ *  Algorithmic constraints:
+ *   (a) threads with same team.team_rank() have same value
+ *   (b) team.vector_length() == power of two
+ *   (c) blockDim.z == 1
+ */
+
+template< class ValueType , class JoinOp>
+KOKKOS_INLINE_FUNCTION
+void rocm_intra_workgroup_reduction( const ROCmTeamMember& team, 
+                                       ValueType& result,
+                                       const JoinOp& join) {
+
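+  // Tree-reduce across team ranks: repeatedly shuffle values down by a growing stride
+  // and join them, then broadcast lane 0's result to every lane.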
+  unsigned int shift = 1;
+  int max_active_thread = team.team_size();
+
+  //Reduce over values from threads with different team.team_rank()
+  while(team.vector_length() * shift < 32 ) {
+    const ValueType tmp = shfl_down(result, team.vector_length()*shift,32u);
+    //Only join if the upper thread is active (this allows a non-power-of-two team.team_size())
+    if(team.team_rank() + shift < max_active_thread)
+      join(result , tmp);
+    shift*=2;
+  }
+
+  result = shfl(result,0,32);
+}
+
+template< class ValueType , class JoinOp>
+KOKKOS_INLINE_FUNCTION
+void rocm_inter_workgroup_reduction( const ROCmTeamMember& team,
+                                       ValueType& value,
+                                       const JoinOp& join) {
+
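+  // Stage one value per STEP_WIDTH slot in tile_static memory, join the remaining ranks
+  // into those slots in strided passes, then combine the slots into the final value.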
+  #define STEP_WIDTH 4
+  
+  tile_static ValueType sh_result[256];
+  int max_active_thread = team.team_size();
+  ValueType* result = (ValueType*) & sh_result;
+  const unsigned step = 256 / team.vector_length();
+  unsigned shift = STEP_WIDTH;
+  const int id = team.team_rank()%step==0?team.team_rank()/step:65000;
+  if(id < STEP_WIDTH ) {
+    result[id] = value;
+  }
+  team.team_barrier();
+
+  while (shift<=max_active_thread/step) {
+    if(shift<=id && shift+STEP_WIDTH>id && team.vector_rank()==0) {
+      join(result[id%STEP_WIDTH],value);
+    }
+    team.team_barrier();
+    shift+=STEP_WIDTH;
+  }
+
+
+  value = result[0];
+  for(int i = 1; (i*step<max_active_thread) && i<STEP_WIDTH; i++)
+    join(value,result[i]);
+}
+
+#if 0
+template< class ValueType , class JoinOp>
+KOKKOS_INLINE_FUNCTION
+void rocm_intra_block_reduction( ROCmTeamMember& team,
+                                        ValueType& value,
+                                        const JoinOp& join,
+                                        const int max_active_thread) {
+  rocm_intra_workgroup_reduction(team,value,join,max_active_thread);
+  rocm_inter_workgroup_reduction(team,value,join,max_active_thread);
+}
+
+template< class FunctorType , class JoinOp , class ArgTag = void >
+KOKKOS_INLINE_FUNCTION
+bool rocm_inter_block_reduction( ROCmTeamMember& team,
+                                 typename FunctorValueTraits< FunctorType , ArgTag >::reference_type  value,
+                                 typename FunctorValueTraits< FunctorType , ArgTag >::reference_type  neutral,
+                                 const JoinOp& join,
+                                 ROCm::size_type * const m_scratch_space,
+                                 typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type const result,
+                                 ROCm::size_type * const m_scratch_flags,
+                                 const int max_active_thread) {
+#ifdef __ROCM_ARCH__
+  typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type;
+  typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type;
+
+  //Do the intra-block reduction with shfl operations and static shared memory
+  rocm_intra_block_reduction(value,join,max_active_thread);
+
+  const unsigned id = team.team_rank()*team.vector_length() + team.vector_rank();
+
+  //One thread in the block writes block result to global scratch_memory
+  if(id == 0 ) {
+    pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x;
+    *global = value;
+  }
+
+  //One workgroup of the last block performs the inter-block reduction by loading the block values from global scratch memory
+  bool last_block = false;
+
+  team.team_barrier();
+  if ( id < 32 ) {
+    ROCm::size_type count;
+
+    //Figure out whether this is the last block
+    if(id == 0)
+      count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
+    count = Kokkos::shfl(count,0,32);
+
+    //Last block does the inter block reduction
+    if( count == gridDim.x - 1) {
+      //set flag back to zero
+      if(id == 0)
+        *m_scratch_flags = 0;
+      last_block = true;
+      value = neutral;
+
+      pointer_type const volatile global = (pointer_type) m_scratch_space ;
+
+      //Reduce all global values with splitting work over threads in one workgroup
+      const int step_size = team.vector_length()*team.team_size() < 32 ? team.vector_length()*team.team_size() : 32;
+      for(int i=id; i<gridDim.x; i+=step_size) {
+        value_type tmp = global[i];
+        join(value, tmp);
+      }
+
+      //Perform shfl reductions within the workgroup; only join if the contribution is valid (allows gridDim.x to be a non-power-of-two and < 32)
+      if (team.vector_length()*team.team_size() > 1) {
+        value_type tmp = Kokkos::shfl_down(value, 1,32);
+        if( id + 1 < gridDim.x )
+          join(value, tmp);
+      }
+      if (team.vector_length()*team.team_size() > 2) {
+        value_type tmp = Kokkos::shfl_down(value, 2,32);
+        if( id + 2 < gridDim.x )
+          join(value, tmp);
+      }
+      if (team.vector_length()*team.team_size() > 4) {
+        value_type tmp = Kokkos::shfl_down(value, 4,32);
+        if( id + 4 < gridDim.x )
+          join(value, tmp);
+      }
+      if (team.vector_length()*team.team_size() > 8) {
+        value_type tmp = Kokkos::shfl_down(value, 8,32);
+        if( id + 8 < gridDim.x )
+          join(value, tmp);
+      }
+      if (team.vector_length()*team.team_size() > 16) {
+        value_type tmp = Kokkos::shfl_down(value, 16,32);
+        if( id + 16 < gridDim.x )
+          join(value, tmp);
+      }
+    }
+  }
+
+  //Thread 0 of the last block holds the global reduction result in "value"
+  return last_block;
+#else
+  return true;
+#endif
+}
+#endif
+#if 0
+
+//----------------------------------------------------------------------------
+// See section B.17 of ROCm C Programming Guide Version 3.2
+// for discussion of
+//   __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
+// function qualifier which could be used to improve performance.
+//----------------------------------------------------------------------------
+// Maximize shared memory and minimize L1 cache:
+//   rocmFuncSetCacheConfig(MyKernel, rocmFuncCachePreferShared );
+// For 2.0 capability: 48 KB shared and 16 KB L1
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/*
+ *  Algorithmic constraints:
+ *   (a) team.team_size() is a power of two
+ *   (b) team.team_size() <= 512
+ *   (c) team.vector_length() == blockDim.z == 1
+ */
+
+template< bool DoScan , class FunctorType , class ArgTag >
+KOKKOS_INLINE_FUNCTION
+void rocm_intra_block_reduce_scan( const FunctorType & functor ,
+                                   const typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type base_data )
+{
+  typedef FunctorValueTraits< FunctorType , ArgTag >  ValueTraits ;
+  typedef FunctorValueJoin<   FunctorType , ArgTag >  ValueJoin ;
+
+  typedef typename ValueTraits::pointer_type  pointer_type ;
+
+  const unsigned value_count   = ValueTraits::value_count( functor );
+  const unsigned BlockSizeMask = team.team_size() - 1 ;
+
+  // Must have power of two thread count
+
+  if ( BlockSizeMask & team.team_size() ) { Kokkos::abort("ROCm::rocm_intra_block_scan requires power-of-two blockDim"); }
+
+#define BLOCK_REDUCE_STEP( R , TD , S )  \
+  if ( ! ( R & ((1<<(S+1))-1) ) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S)) ); }
+
+#define BLOCK_SCAN_STEP( TD , N , S )  \
+  if ( N == (1<<S) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S))); }
+
+  const unsigned     rtid_intra = team.team_rank() ^ BlockSizeMask ;
+  const pointer_type tdata_intra = base_data + value_count * team.team_rank() ;
+
+  { // Intra-workgroup reduction:
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,0)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,1)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,2)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,3)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,4)
+  }
+
+  team.team_barrier(); // Wait for all workgroups to reduce
+
+  { // Inter-workgroup reduce-scan by a single workgroup to avoid extra synchronizations
+    const unsigned rtid_inter = ( team.team_rank() ^ BlockSizeMask ) << ROCmTraits::WarpIndexShift ;
+
+    if ( rtid_inter < team.team_size() ) {
+
+      const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask );
+
+      if ( (1<<5) < BlockSizeMask ) {                        BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
+      if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
+      if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
+      if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
+
+      if ( DoScan ) {
+
+        int n = ( rtid_inter &  32 ) ?  32 : (
+                ( rtid_inter &  64 ) ?  64 : (
+                ( rtid_inter & 128 ) ? 128 : (
+                ( rtid_inter & 256 ) ? 256 : 0 )));
+
+        if ( ! ( rtid_inter + n < team.team_size() ) ) n = 0 ;
+
+        __threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,8)
+        __threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,7)
+        __threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,6)
+        __threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,5)
+      }
+    }
+  }
+
+  team.team_barrier(); // Wait for inter-workgroup reduce-scan to complete
+
+  if ( DoScan ) {
+    int n = ( rtid_intra &  1 ) ?  1 : (
+            ( rtid_intra &  2 ) ?  2 : (
+            ( rtid_intra &  4 ) ?  4 : (
+            ( rtid_intra &  8 ) ?  8 : (
+            ( rtid_intra & 16 ) ? 16 : 0 ))));
+
+    if ( ! ( rtid_intra + n < team.team_size() ) ) n = 0 ;
+    #ifdef KOKKOS_IMPL_ROCM_CLANG_WORKAROUND
+    BLOCK_SCAN_STEP(tdata_intra,n,4) team.team_barrier();//__threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,3) team.team_barrier();//__threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,2) team.team_barrier();//__threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,1) team.team_barrier();//__threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,0) team.team_barrier();
+    #else
+    BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,0) __threadfence_block();
+    #endif
+  }
+
+#undef BLOCK_SCAN_STEP
+#undef BLOCK_REDUCE_STEP
+}
+
+//----------------------------------------------------------------------------
+/**\brief  Input value-per-thread starting at 'shared_data'.
+ *         Reduction value at last thread's location.
+ *
+ *  If 'DoScan' then write blocks' scan values and block-groups' scan values.
+ *
+ *  Global reduce result is in the last threads' 'shared_data' location.
+ */
+template< bool DoScan , class FunctorType , class ArgTag >
+KOKKOS_INLINE_FUNCTION
+bool rocm_single_inter_block_reduce_scan( const FunctorType     & functor ,
+                                          const ROCm::size_type   block_id ,
+                                          const ROCm::size_type   block_count ,
+                                          ROCm::size_type * const shared_data ,
+                                          ROCm::size_type * const global_data ,
+                                          ROCm::size_type * const global_flags )
+{
+  typedef ROCm::size_type                  size_type ;
+  typedef FunctorValueTraits< FunctorType , ArgTag >  ValueTraits ;
+  typedef FunctorValueJoin<   FunctorType , ArgTag >  ValueJoin ;
+  typedef FunctorValueInit<   FunctorType , ArgTag >  ValueInit ;
+  typedef FunctorValueOps<    FunctorType , ArgTag >  ValueOps ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename ValueTraits::value_type      value_type ;
+
+  // '__ffs' = position of the least significant bit set to 1.
+  // 'team.team_size()' is guaranteed to be a power of two so this
+  // is the integral shift value that can replace an integral divide.
+  const unsigned BlockSizeShift = __ffs( team.team_size() ) - 1 ;
+  const unsigned BlockSizeMask  = team.team_size() - 1 ;
+
+  // Must have power of two thread count
+  if ( BlockSizeMask & team.team_size() ) { Kokkos::abort("ROCm::rocm_single_inter_block_reduce_scan requires power-of-two blockDim"); }
+
+  const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+    word_count( ValueTraits::value_size( functor ) / sizeof(size_type) );
+
+  // Reduce the accumulation for the entire block.
+  rocm_intra_block_reduce_scan<false,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
+
+  {
+    // Write accumulation total to global scratch space.
+    // Accumulation total is the last thread's data.
+    size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
+    size_type * const global = global_data + word_count.value * block_id ;
+
+#if (__ROCM_ARCH__ < 500)
+    for ( size_type i = team.team_rank() ; i < word_count.value ; i += team.team_size() ) { global[i] = shared[i] ; }
+#else
+    for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
+#endif
+
+  }
+
+  // Each contributing block notes that its contribution is complete via an atomic-increment flag.
+  // If this block is not the last block to contribute to this group then the block is done.
+    team.team_barrier();
+  const bool is_last_block =
+    ! team.team_reduce( team.team_rank() ? 0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) ,Impl::JoinAdd<ValueType>());
+
+  if ( is_last_block ) {
+
+    const size_type b = ( long(block_count) * long(team.team_rank()) ) >> BlockSizeShift ;
+    const size_type e = ( long(block_count) * long( team.team_rank() + 1 ) ) >> BlockSizeShift ;
+
+    {
+      void * const shared_ptr = shared_data + word_count.value * team.team_rank() ;
+      reference_type shared_value = ValueInit::init( functor , shared_ptr );
+
+      for ( size_type i = b ; i < e ; ++i ) {
+        ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );
+      }
+    }
+
+    rocm_intra_block_reduce_scan<DoScan,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
+
+    if ( DoScan ) {
+
+      size_type * const shared_value = shared_data + word_count.value * ( team.team_rank() ? team.team_rank() - 1 : team.team_size() );
+
+      if ( ! team.team_rank() ) { ValueInit::init( functor , shared_value ); }
+
+      // Join previous inclusive scan value to each member
+      for ( size_type i = b ; i < e ; ++i ) {
+        size_type * const global_value = global_data + word_count.value * i ;
+        ValueJoin::join( functor , shared_value , global_value );
+        ValueOps ::copy( functor , global_value , shared_value );
+      }
+    }
+  }
+
+  return is_last_block ;
+}
+
+// Size in bytes required for inter block reduce or scan
+template< bool DoScan , class FunctorType , class ArgTag >
+inline
+unsigned rocm_single_inter_block_reduce_scan_shmem( const FunctorType & functor , const unsigned BlockSize )
+{
+  return ( BlockSize + 2 ) * Impl::FunctorValueTraits< FunctorType , ArgTag >::value_size( functor );
+}
+#endif 
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( __HCC__ ) && defined( KOKKOS_ENABLE_ROCM ) */
+#endif /* KOKKOS_ROCM_REDUCESCAN_HPP */
+
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9890598bc980fcda3e2f1446d46689cc2d1332d2
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp
@@ -0,0 +1,157 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <ROCm/Kokkos_ROCm_Invoke.hpp>
+#include <ROCm/Kokkos_ROCm_Join.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template< class Tag, class F, class TransformIndex>
+void scan_enqueue(
+  const int len,
+  const F & f,
+  TransformIndex transform_index)
+{
+    typedef Kokkos::Impl::FunctorValueTraits< F, Tag>  ValueTraits;
+    typedef Kokkos::Impl::FunctorValueInit<   F, Tag>  ValueInit;
+    typedef Kokkos::Impl::FunctorValueJoin<   F, Tag>  ValueJoin;
+    typedef Kokkos::Impl::FunctorValueOps<    F, Tag>  ValueOps;
+
+    typedef typename ValueTraits::value_type    value_type;
+    typedef typename ValueTraits::pointer_type    pointer_type;
+    typedef typename ValueTraits::reference_type  reference_type;
+
+    const auto td = get_tile_desc<value_type>(len);
+    std::vector<value_type> result_cpu(td.num_tiles);
+    hc::array<value_type> result(td.num_tiles);
+    hc::array<value_type> scratch(len);
+
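+    // The scan proceeds in three steps: each tile performs a work-efficient (Blelloch)
+    // up-sweep/down-sweep to produce per-element exclusive prefixes and a per-tile total,
+    // the host prefixes the tile totals, and a second kernel adds each tile's offset
+    // before invoking the functor with final=true.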
+    tile_for<value_type>(td, [&,f,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]] 
+    {
+        const auto local = t_idx.local[0];
+        const auto global = t_idx.global[0];
+        const auto tile = t_idx.tile[0];
+
+        // Join tile buffer elements
+        const auto join = [&](std::size_t i, std::size_t j)
+        {
+            buffer.action_at(i, j, [&](value_type& x, const value_type& y)
+            {
+                ValueJoin::join(f, &x, &y);
+            });
+        };
+
+        // Copy into tile
+        buffer.action_at(local, [&](value_type& state)
+        {
+            ValueInit::init(f, &state);
+            if (global < len) rocm_invoke<Tag>(f, transform_index(t_idx, td.tile_size, td.num_tiles), state, false);
+        });
+        t_idx.barrier.wait();
+        // Up sweep phase
+        for(std::size_t d=1;d<buffer.size();d*=2)
+        {
+            auto d2 = 2*d;
+            auto i = local*d2;
+            if(i<len)
+            {
+               auto j = i + d - 1;
+               auto k = i + d2 - 1;
+//               join(k, j);  // no longer needed with ROCm 1.6
+               ValueJoin::join(f, &buffer[k], &buffer[j]);
+            }
+        }
+        t_idx.barrier.wait();
+
+        result[tile] = buffer[buffer.size()-1];
+        buffer[buffer.size()-1] = 0;
+        // Down sweep phase
+        for(std::size_t d=buffer.size()/2;d>0;d/=2)
+        {
+            auto d2 = 2*d;
+            auto i = local*d2;
+            if(i<len)
+            {
+               auto j = i + d - 1;
+               auto k = i + d2 - 1;
+               auto t = buffer[k];
+//               join(k, j);  // no longer needed with ROCm 1.6
+               ValueJoin::join(f, &buffer[k], &buffer[j]);
+               buffer[j] = t;
+            }
+            t_idx.barrier.wait();
+        }
+        // Copy tiles into global memory
+        if (global < len) scratch[global] = buffer[local];
+    }).wait();
+    copy(result,result_cpu.data());
+
+//  The std::partial_sum call was segfaulting, even though this is CPU code:
+//   if(td.num_tiles>1)
+//      std::partial_sum(result_cpu.data(), result_cpu.data()+(td.num_tiles-1)*sizeof(value_type), result_cpu.data(), make_join_operator<ValueJoin>(f));
+// Use this implementation instead.
+   for(int i=1; i<td.num_tiles; i++)
+      ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]);
+
+    copy(result_cpu.data(),result);
+    hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,f,len,td](hc::tiled_index<1> t_idx) [[hc]] 
+    {
+//        const auto local = t_idx.local[0];
+        const auto global = t_idx.global[0];
+        const auto tile = t_idx.tile[0];
+
+        if (global < len) 
+        {
+            auto final_state = scratch[global];
+
+// ValueJoin::join locks up, at least with ROCm 1.6, so use += instead
+            if (tile != 0) final_state += result[tile-1];
+//            if (tile != 0) ValueJoin::join(f, &final_state, &result[tile-1]);
+            rocm_invoke<Tag>(f, transform_index(t_idx, td.tile_size, td.num_tiles), final_state, true);
+        }
+    }).wait();
+}
+
+} // namespace Impl
+} // namespace Kokkos
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Space.cpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Space.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ed6c517e9b6d4e3d5766234c19a4922a06c9735d
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Space.cpp
@@ -0,0 +1,726 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdlib.h>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <algorithm>
+#include <atomic>
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if ROCM is enabled for Kokkos */
+#ifdef KOKKOS_ENABLE_ROCM
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_ROCm.hpp>
+#include <Kokkos_ROCmSpace.hpp>
+
+#include <impl/Kokkos_Error.hpp>
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#endif
+
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+#define ROCM_SAFE_CALL
+namespace Kokkos {
+namespace Impl {
+using namespace hc;
+
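+// All DeepCopy specializations below funnel through a synchronous copy on the default
+// accelerator_view; the execution-space instance argument is currently ignored.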
+DeepCopy<Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
+{
+   hc::accelerator acc;
+   hc::accelerator_view av = acc.get_default_view();
+   av.copy( src , dst , n);
+}
+
+
+DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
+{
+   hc::accelerator acc;
+   hc::accelerator_view av = acc.get_default_view();
+   av.copy( src , dst , n);
+}
+
+DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
+{
+   hc::accelerator acc;
+   hc::accelerator_view av = acc.get_default_view();
+   av.copy( src , dst , n);
+}
+
+DeepCopy<Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
+{
+   hc::accelerator acc;
+   hc::accelerator_view av = acc.get_default_view();
+   av.copy( src , dst , n);
+}
+
+DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
+{
+   hc::accelerator acc;
+   hc::accelerator_view av = acc.get_default_view();
+   av.copy( src , dst , n);
+}
+
+DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
+{
+   hc::accelerator acc;
+   hc::accelerator_view av = acc.get_default_view();
+   av.copy( src , dst , n);
+}
+
+
+
+DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
+{
+   hc::accelerator acc;
+   hc::accelerator_view av = acc.get_default_view();
+   av.copy( src , dst , n);
+}
+
+
+DeepCopy<HostSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
+{
+   hc::accelerator acc;
+   hc::accelerator_view av = acc.get_default_view();
+   av.copy( src , dst , n);
+}
+
+DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
+{
+   hc::accelerator acc;
+   hc::accelerator_view av = acc.get_default_view();
+   av.copy( src , dst , n);
+}
+
+DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
+{
+   hc::accelerator acc;
+   hc::accelerator_view av = acc.get_default_view();
+   av.copy( src , dst , n);
+}
+
+DeepCopy<HostSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
+{
+   hc::accelerator acc;
+   hc::accelerator_view av = acc.get_default_view();
+   av.copy( src , dst , n);
+}
+
+DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
+{
+   hc::accelerator acc;
+   hc::accelerator_view av = acc.get_default_view();
+   av.copy( src , dst , n);
+}
+
+
+hc::completion_future DeepCopyAsyncROCm( void * dst , const void * src , size_t n) {
+   hc::accelerator acc;
+   hc::accelerator_view av = acc.get_default_view();
+   return(av.copy_async( src , dst , n));
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+
+namespace Kokkos {
+
+void Experimental::ROCmSpace::access_error()
+{
+  const std::string msg("Kokkos::Experimental::ROCmSpace::access_error attempt to execute Experimental::ROCm function from non-ROCm space" );
+  Kokkos::Impl::throw_runtime_exception( msg );
+}
+
+void Experimental::ROCmSpace::access_error( const void * const )
+{
+  const std::string msg("Kokkos::Experimental::ROCmSpace::access_error attempt to execute Experimental::ROCm function from non-ROCm space" );
+  Kokkos::Impl::throw_runtime_exception( msg );
+}
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Experimental {
+
+ROCmSpace::ROCmSpace()
+  : m_device( ROCm().rocm_device() )
+{
+}
+
+ROCmHostPinnedSpace::ROCmHostPinnedSpace()
+{
+}
+
+void * ROCmSpace::allocate( const size_t arg_alloc_size ) const
+{
+  void * ptr =  Kokkos::Impl::rocm_device_allocate( arg_alloc_size );
+  return ptr ;
+}
+
+void * Experimental::ROCmHostPinnedSpace::allocate( const size_t arg_alloc_size ) const
+{
+  void * ptr =  Kokkos::Impl::rocm_hostpinned_allocate( arg_alloc_size );
+  return ptr ;
+}
+
+void ROCmSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
+{
+  Kokkos::Impl::rocm_device_free(arg_alloc_ptr);
+}
+
+void Experimental::ROCmHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
+{
+  Kokkos::Impl::rocm_device_free(arg_alloc_ptr);
+}
+
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::s_root_record ;
+
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::s_root_record ;
+
+
+std::string
+SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_label() const
+{
+  SharedAllocationHeader header ;
+
+  Kokkos::Impl::DeepCopy< Kokkos::HostSpace , Kokkos::Experimental::ROCmSpace >( & header , RecordBase::head() , sizeof(SharedAllocationHeader) );
+
+  return std::string( header.m_label );
+}
+
+std::string
+SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::get_label() const
+{
+  return std::string( RecordBase::head()->m_label );
+}
+
+SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > *
+SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
+allocate( const Kokkos::Experimental::ROCmSpace &  arg_space
+        , const std::string       &  arg_label
+        , const size_t               arg_alloc_size
+        )
+{
+  return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+}
+
+SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void > *
+SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
+allocate( const Kokkos::Experimental::ROCmHostPinnedSpace &  arg_space
+        , const std::string                 &  arg_label
+        , const size_t                         arg_alloc_size
+        )
+{
+  return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+}
+
+void
+SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  delete static_cast<SharedAllocationRecord*>(arg_rec);
+}
+
+void
+SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  delete static_cast<SharedAllocationRecord*>(arg_rec);
+}
+
+SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
+~SharedAllocationRecord()
+{
+  #if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+
+    SharedAllocationHeader header ;
+    Kokkos::Impl::DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace>( & header , RecordBase::m_alloc_ptr , sizeof(SharedAllocationHeader) );
+
+    Kokkos::Profiling::deallocateData(
+      Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::ROCmSpace::name()),header.m_label,
+      data(),size());
+  }
+  #endif
+
+  m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
+                    , SharedAllocationRecord< void , void >::m_alloc_size
+                    );
+}
+
+SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
+~SharedAllocationRecord()
+{
+  #if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::deallocateData(
+      Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::ROCmHostPinnedSpace::name()),RecordBase::m_alloc_ptr->m_label,
+      data(),size());
+  }
+  #endif
+
+  m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
+                    , SharedAllocationRecord< void , void >::m_alloc_size
+                    );
+}
+
+SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
+SharedAllocationRecord( const Kokkos::Experimental::ROCmSpace & arg_space
+                      , const std::string       & arg_label
+                      , const size_t              arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_space( arg_space )
+{
+  #if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
+  }
+  #endif
+
+  SharedAllocationHeader header ;
+
+  // Fill in the Header information
+  header.m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
+
+  strncpy( header.m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+
+  // Copy to device memory
+  Kokkos::Impl::DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace>( RecordBase::m_alloc_ptr , & header , sizeof(SharedAllocationHeader) );
+}
+
+SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
+SharedAllocationRecord( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
+                      , const std::string                 & arg_label
+                      , const size_t                        arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_space( arg_space )
+{
+  #if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
+  }
+  #endif
+  // Fill in the Header information, directly accessible via host pinned memory
+
+  RecordBase::m_alloc_ptr->m_record = this ;
+
+  strncpy( RecordBase::m_alloc_ptr->m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+}
+
+//----------------------------------------------------------------------------
+
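+// Tracked allocations are reference counted through the base SharedAllocationRecord:
+// allocate_tracked increments the new record, deallocate_tracked decrements it, and
+// reallocate_tracked deep-copies the overlapping bytes before adjusting the counts.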
+void * SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
+allocate_tracked( const Kokkos::Experimental::ROCmSpace & arg_space
+                , const std::string & arg_alloc_label
+                , const size_t arg_alloc_size )
+{
+  if ( ! arg_alloc_size ) return (void *) 0 ;
+
+  SharedAllocationRecord * const r =
+    allocate( arg_space , arg_alloc_label , arg_alloc_size );
+
+  RecordBase::increment( r );
+
+  return r->data();
+}
+
+void SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
+deallocate_tracked( void * const arg_alloc_ptr )
+{
+  if ( arg_alloc_ptr != 0 ) {
+    SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
+
+    RecordBase::decrement( r );
+  }
+}
+
+void * SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
+reallocate_tracked( void * const arg_alloc_ptr
+                  , const size_t arg_alloc_size )
+{
+  SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
+  SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
+
+  Kokkos::Impl::DeepCopy<Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCmSpace>( r_new->data() , r_old->data()
+                                             , std::min( r_old->size() , r_new->size() ) );
+
+  RecordBase::increment( r_new );
+  RecordBase::decrement( r_old );
+
+  return r_new->data();
+}
+
+#if 0
+void * SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
+allocate_tracked( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
+                , const std::string & arg_alloc_label
+                , const size_t arg_alloc_size )
+{
+  if ( ! arg_alloc_size ) return (void *) 0 ;
+
+  SharedAllocationRecord * const r =
+    allocate( arg_space , arg_alloc_label , arg_alloc_size );
+
+  RecordBase::increment( r );
+
+  return r->data();
+}
+
+void SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
+deallocate_tracked( void * const arg_alloc_ptr )
+{
+  if ( arg_alloc_ptr != 0 ) {
+    SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
+
+    RecordBase::decrement( r );
+  }
+}
+
+void * SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
+reallocate_tracked( void * const arg_alloc_ptr
+                  , const size_t arg_alloc_size )
+{
+  SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
+  SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
+
+  Kokkos::Impl::DeepCopy<Experimental::ROCmHostPinnedSpace,Experimental::ROCmHostPinnedSpace>( r_new->data() , r_old->data()
+                                             , std::min( r_old->size() , r_new->size() ) );
+
+  RecordBase::increment( r_new );
+  RecordBase::decrement( r_old );
+
+  return r_new->data();
+}
+#endif
+
+//----------------------------------------------------------------------------
+
+SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > *
+SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_record( void * alloc_ptr )
+{
+  using Header     = SharedAllocationHeader ;
+  using RecordBase = SharedAllocationRecord< void , void > ;
+  using RecordROCm = SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > ;
+
+#if 0
+  // Copy the header from the allocation
+  Header head ;
+
+  Header const * const head_rocm = alloc_ptr ? Header::get_header( alloc_ptr ) : (Header*) 0 ;
+
+  if ( alloc_ptr ) {
+    Kokkos::Impl::DeepCopy<HostSpace,Experimental::ROCmSpace>( & head , head_rocm , sizeof(SharedAllocationHeader) );
+  }
+
+  RecordROCm * const record = alloc_ptr ? static_cast< RecordROCm * >( head.m_record ) : (RecordROCm *) 0 ;
+
+  if ( ! alloc_ptr || record->m_alloc_ptr != head_rocm ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_record ERROR" ) );
+  }
+
+#else
+
+  // Iterate the list to search for the record among all allocations.
+  // This requires obtaining the root of the list and then locking the list.
+
+  RecordROCm * const record = static_cast< RecordROCm * >( RecordBase::find( & s_root_record , alloc_ptr ) );
+
+  if ( record == 0 ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_record ERROR" ) );
+  }
+
+#endif
+
+  return record ;
+}
+
+#if  0
+SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void > *
+SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::get_record( void * alloc_ptr )
+{
+  using Header     = SharedAllocationHeader ;
+  using RecordROCm = SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void > ;
+
+  Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ;
+
+  if ( ! alloc_ptr || h->m_record->m_alloc_ptr != h ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::get_record ERROR" ) );
+  }
+
+  return static_cast< RecordROCm * >( h->m_record );
+}
+#endif 
+
+// Iterate records to print orphaned memory ...
+void
+SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
+print_records( std::ostream & s , const Kokkos::Experimental::ROCmSpace & space , bool detail )
+{
+  SharedAllocationRecord< void , void > * r = & s_root_record ;
+
+  char buffer[256] ;
+
+  SharedAllocationHeader head ;
+
+  if ( detail ) {
+    do {
+      if ( r->m_alloc_ptr ) {
+        Kokkos::Impl::DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace>( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
+      }
+      else {
+        head.m_label[0] = 0 ;
+      }
+
+      // Formatting dependent on sizeof(uintptr_t)
+      const char * format_string;
+
+      if (sizeof(uintptr_t) == sizeof(unsigned long)) { 
+        format_string = "ROCm addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
+      }
+      else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { 
+        format_string = "ROCm addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ 0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
+      }
+
+      snprintf( buffer , 256 
+              , format_string
+              , reinterpret_cast<uintptr_t>( r )
+              , reinterpret_cast<uintptr_t>( r->m_prev )
+              , reinterpret_cast<uintptr_t>( r->m_next )
+              , reinterpret_cast<uintptr_t>( r->m_alloc_ptr )
+              , r->m_alloc_size
+              , r->m_count
+              , reinterpret_cast<uintptr_t>( r->m_dealloc )
+              , head.m_label
+              );
+      s << buffer ;
+      r = r->m_next ;
+    } while ( r != & s_root_record );
+  }
+  else {
+    do {
+      if ( r->m_alloc_ptr ) {
+
+        Kokkos::Impl::DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace>( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
+
+        // Formatting dependent on sizeof(uintptr_t)
+        const char * format_string;
+
+        if (sizeof(uintptr_t) == sizeof(unsigned long)) { 
+          format_string = "ROCm [ 0x%.12lx + %ld ] %s\n";
+        }
+        else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { 
+          format_string = "ROCm [ 0x%.12llx + %ld ] %s\n";
+        }
+
+        snprintf( buffer , 256 
+                , format_string
+                , reinterpret_cast< uintptr_t >( r->data() )
+                , r->size()
+                , head.m_label
+                );
+      }
+      else {
+        snprintf( buffer , 256 , "ROCm [ 0 + 0 ]\n" );
+      }
+      s << buffer ;
+      r = r->m_next ;
+    } while ( r != & s_root_record );
+  }
+}
+#if 0
+void
+SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
+print_records( std::ostream & s , const Kokkos::Experimental::ROCmHostPinnedSpace & space , bool detail )
+{
+  SharedAllocationRecord< void , void >::print_host_accessible_records( s , "ROCmHostPinned" , & s_root_record , detail );
+}
+#endif 
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+namespace Kokkos {
+namespace {
+#if 0
+  KOKKOS_INLINE_FUNCTION void init_lock_array_kernel_atomic() {
+    unsigned i = tindex()*team_size() + lindex();
+
+    if(i<ROCM_SPACE_ATOMIC_MASK+1)
+      kokkos_impl_rocm_lock_arrays.atomic[i] = 0;
+  }
+
+  KOKKOS_INLINE_FUNCTION void init_lock_array_kernel_scratch_threadid(int N) {
+    unsigned i = tindex()*team_size() + lindex();
+
+    if(i<N) {
+      kokkos_impl_rocm_lock_arrays.scratch[i] = 0;
+      kokkos_impl_rocm_lock_arrays.threadid[i] = 0;
+    }
+  }
+}
+
+
+namespace Impl {
+int* atomic_lock_array_rocm_space_ptr(bool deallocate) {
+  static int* ptr = NULL;
+  if(deallocate) {
+    rocmFree(ptr);
+    ptr = NULL;
+  }
+
+  if(ptr==NULL && !deallocate)
+    rocmMalloc(&ptr,sizeof(int)*(ROCM_SPACE_ATOMIC_MASK+1));
+  return ptr;
+}
+
+int* scratch_lock_array_rocm_space_ptr(bool deallocate) {
+  static int* ptr = NULL;
+  if(deallocate) {
+    rocmFree(ptr);
+    ptr = NULL;
+  }
+
+  if(ptr==NULL && !deallocate)
+    rocmMalloc(&ptr,sizeof(int)*(ROCm::concurrency()));
+  return ptr;
+}
+
+int* threadid_lock_array_rocm_space_ptr(bool deallocate) {
+  static int* ptr = NULL;
+  if(deallocate) {
+    rocmFree(ptr);
+    ptr = NULL;
+  }
+
+  if(ptr==NULL && !deallocate)
+    rocmMalloc(&ptr,sizeof(int)*(ROCm::concurrency()));
+  return ptr;
+}
+
+void init_lock_arrays_rocm_space() {
+  static int is_initialized = 0;
+  if(! is_initialized) {
+    Kokkos::Impl::ROCmLockArraysStruct locks;
+    locks.atomic = atomic_lock_array_rocm_space_ptr(false);
+    locks.scratch = scratch_lock_array_rocm_space_ptr(false);
+    locks.threadid = threadid_lock_array_rocm_space_ptr(false);
+    am_copyToSymbol( kokkos_impl_rocm_lock_arrays , & locks , sizeof(ROCmLockArraysStruct) );
+    init_lock_array_kernel_atomic<<<(ROCM_SPACE_ATOMIC_MASK+255)/256,256>>>();
+    init_lock_array_kernel_scratch_threadid<<<(Kokkos::Experimental::ROCm::concurrency()+255)/256,256>>>(Kokkos::Experimental::ROCm::concurrency());
+  }
+}
+#endif 
+
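+// Grow-only scratch buffer: allocated on first use, reallocated when a larger
+// request arrives, and shrunk only when force_shrink is set (in which case it
+// is freed and reallocated at the smaller size).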
+void* rocm_resize_scratch_space(size_t bytes, bool force_shrink) {
+  static void* ptr = NULL;
+  static size_t current_size = 0;
+  if(current_size == 0) {
+    current_size = bytes;
+    ptr = Kokkos::kokkos_malloc<Kokkos::Experimental::ROCmSpace>("ROCmSpace::ScratchMemory",current_size);
+  }
+  if(bytes > current_size) {
+    current_size = bytes;
+    ptr = Kokkos::kokkos_realloc<Kokkos::Experimental::ROCmSpace>(ptr,current_size);
+  }
+  if((bytes < current_size) && (force_shrink)) {
+    current_size = bytes;
+    Kokkos::kokkos_free<Kokkos::Experimental::ROCmSpace>(ptr);
+    ptr = Kokkos::kokkos_malloc<Kokkos::Experimental::ROCmSpace>("ROCmSpace::ScratchMemory",current_size);
+  }
+  return ptr;
+}
+
+}
+}
+
+#endif // KOKKOS_ENABLE_ROCM
+
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Task.cpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Task.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..894d3248341aa69e0dcaebab1e7711044fb8e6c7
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Task.cpp
@@ -0,0 +1,174 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_ENABLE_ROCM ) && defined( KOKKOS_ENABLE_TASKDAG )
+
+#include <impl/Kokkos_TaskQueue_impl.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template class TaskQueue< Kokkos::Experimental::ROCm > ;
+
+
+//----------------------------------------------------------------------------
+KOKKOS_INLINE_FUNCTION
+void TaskQueueSpecialization< Kokkos::Experimental::ROCm >::driver
+  ( TaskQueueSpecialization< Kokkos::Experimental::ROCm >::queue_type * const queue,
+    hc::tiled_index<3> threadIdx )
+{
+  using Member = TaskExec< Kokkos::Experimental::ROCm > ;
+  using Queue  = TaskQueue< Kokkos::Experimental::ROCm > ;
+  using task_root_type = TaskBase< void , void , void > ;
+
+  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+  Member single_exec( 1, threadIdx );
+  Member team_exec( threadIdx.tile_dim[0], threadIdx );
+
+  const int wavefront_lane = threadIdx.local[0] + threadIdx.local[1]* threadIdx.tile_dim[0] ;
+
+  union {
+    task_root_type * ptr ;
+    int              raw[2] ;
+  } task ;
+
+  // Loop until all queues are empty and no tasks in flight
+
+  do {
+
+    // Each team lead attempts to acquire either a thread team task
+    // or a collection of single-thread tasks for the team.
+
+    if ( 0 == wavefront_lane ) {
+
+      task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+
+      // Loop by priority and then type
+      for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
+        for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
+          task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
+        }
+      }
+
+#if 0
+printf("TaskQueue<ROCm>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
+      , uintptr_t(task.ptr));
+#endif
+
+    }
+
+    // Shuffle-broadcast the acquired task pointer from lane 0 to every lane
+    // of the wavefront; the 64-bit pointer travels as two 32-bit halves.
+
+    task.raw[0] = hc::__shfl( task.raw[0] , 0 );
+    task.raw[1] = hc::__shfl( task.raw[1] , 0 );
+
+    if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
+
+    if ( end != task.ptr ) {
+      if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
+        // Thread Team Task
+        (*task.ptr->m_apply)( task.ptr , & team_exec );
+      }
+      else if ( 0 == threadIdx.local[1] ) {
+        // Single Thread Task
+        (*task.ptr->m_apply)( task.ptr , & single_exec );
+      }
+
+      if ( 0 == wavefront_lane ) {
+        queue->complete( task.ptr );
+      }
+    }
+  } while(1);
+}
+#if 0
+namespace {
+KOKKOS_INLINE_FUNCTION
+void rocm_task_queue_execute( TaskQueue< Kokkos::Experimental::ROCm > * queue, 
+                              hc::tiled_index<3> threadIdx )
+{ TaskQueueSpecialization< Kokkos::Experimental::ROCm >::driver( queue, threadIdx ); }
+
+}
+#endif
+void TaskQueueSpecialization< Kokkos::Experimental::ROCm >::execute
+  ( TaskQueue< Kokkos::Experimental::ROCm > * const queue )
+{
+  const int workgroups_per_wavefront = 4 ;
+  const int wavefront_size = Kokkos::Impl::ROCmTraits::WavefrontSize ;
+  const int cu_count = Kokkos::Impl::rocm_internal_cu_count();
+//  const dim3 grid( Kokkos::Impl::rocm_internal_cu_count() , 1 , 1 );
+//  const dim3 block( 1 , Kokkos::Impl::ROCmTraits::WorkGroupSize , workgroups_per_wavefront );
+
+
+
+  // Query the stack size, in bytes:
+  // If not large enough then set the stack size, in bytes:
+
+// Adapted from the CUDA code.  TODO: not at all sure that this is the proper
+// way to map the CUDA grid/block/3D tiling onto HCC.
+#if 0
+  hc::extent< 3 > flat_extent(  cu_count,
+                                wavefront_size, workgroups_per_wavefront );
+  hc::tiled_extent< 3 > team_extent = flat_extent.tile(1,
+                                wavefront_size,workgroups_per_wavefront);
+
+  hc::parallel_for_each( team_extent , [&](hc::tiled_index<3> idx) [[hc]]
+  {
+    TaskQueueSpecialization< Kokkos::Experimental::ROCm >::driver( queue,idx ); 
+  }).wait();
+#endif
+}
+
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_ROCM ) && defined( KOKKOS_ENABLE_TASKDAG ) */
+
+
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Task.hpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Task.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..379279441ea08f3dbe110611c07b66f793942d67
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Task.hpp
@@ -0,0 +1,458 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_ROCM_TASK_HPP
+#define KOKKOS_IMPL_ROCM_TASK_HPP
+
+#if defined( KOKKOS_ENABLE_TASKDAG )
+
+#include <ROCm/Kokkos_ROCm_Vectorization.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class > class TaskExec ; 
+
+template<>
+class TaskQueueSpecialization< Kokkos::Experimental::ROCm >
+{
+public:
+
+  using execution_space = Kokkos::Experimental::ROCm ;
+  using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
+  using task_base_type  = Kokkos::Impl::TaskBase< execution_space , void , void > ;
+  using member_type     = TaskExec< execution_space > ;
+
+  // Must specify memory space
+  using memory_space = Kokkos::HostSpace ;
+
+  static
+  void iff_single_thread_recursive_execute( queue_type * const ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  static void driver( queue_type * const, hc::tiled_index<3> );
+
+  // Must provide task queue execution function
+  static void execute( queue_type * const );
+
+  // Must provide mechanism to set function pointer in
+  // execution space from the host process.
+  template< typename FunctorType >
+  static
+  void proc_set_apply( typename TaskBase< Kokkos::Experimental::ROCm
+                               , typename FunctorType::value_type
+                               , FunctorType
+                               >::function_type * ptr )
+    {
+      using TaskType = TaskBase< Kokkos::Experimental::ROCm
+                               , typename FunctorType::value_type
+                               , FunctorType
+                               > ;
+      hc::extent< 1 > flat_extent( 1 );
+      hc::tiled_extent< 1 > team_extent = flat_extent.tile( 1);
+
+      hc::parallel_for_each( team_extent , [&](hc::tiled_index<1> idx) [[hc]]
+      {
+         *ptr = TaskType::apply ;
+      }).wait();
+    }
+};
+
+/*template<>
+KOKKOS_FUNCTION 
+void TaskQueue<Kokkos::Experimental::ROCm>::decrement( typename TaskQueue<Kokkos::Experimental::ROCm>::task_root_type *
+) {}
+*/
+extern template class TaskQueue< Kokkos::Experimental::ROCm > ;
+
+//----------------------------------------------------------------------------
+/**\brief  Impl::TaskExec<ROCm> is the TaskScheduler<ROCm>::member_type
+ *         passed to tasks running in a ROCm space.
+ *
+ *  ROCm thread blocks for tasking are dimensioned:
+ *    idx.tile_dim[0] == vector length
+ *    idx.tile_dim[1] == team size
+ *    idx.tile_dim[2] == number of teams
+ *  where
+ *    idx.tile_dim[0] * idx.tile_dim[1] == WavefrontSize
+ *
+ *  Both single thread and thread team tasks are run by a full ROCm wavefront.
+ *  A single thread task is called by wavefront lane #0 and the remaining
+ *  lanes of the wavefront are idle.
+ */
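+// Example (illustrative only): with WavefrontSize == 64, a vector length of 4
+// gives idx.tile_dim[0] == 4 and a team size of idx.tile_dim[1] == 16, so
+// tile_dim[0] * tile_dim[1] == WavefrontSize as required.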
+template<>
+class TaskExec< Kokkos::Experimental::ROCm >
+{
+private:
+
+  TaskExec( TaskExec && ) = delete ;
+  TaskExec( TaskExec const & ) = delete ;
+  TaskExec & operator = ( TaskExec && ) = delete ;
+  TaskExec & operator = ( TaskExec const & ) = delete ;
+
+
+  friend class Kokkos::Impl::TaskQueue< Kokkos::Experimental::ROCm > ;
+  friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Experimental::ROCm > ;
+
+  int              m_team_size ;
+  hc::tiled_index<3>      m_idx;
+
+//  KOKKOS_INLINE_FUNCTION TaskExec( int arg_team_size )  //TODO: tile_dim[0]
+//    : m_team_size( arg_team_size ) {}
+
+  KOKKOS_INLINE_FUNCTION TaskExec( int arg_team_size,
+                                   hc::tiled_index<3> tidx)  
+    : m_team_size( arg_team_size),
+      m_idx( tidx ) {}
+
+public:
+//      const auto local = t_idx.local[0];
+//      const auto global = t_idx.global[0];
+//     const auto tile = t_idx.tile[0];
+
+  hc::tiled_index<3> idx() const { return m_idx;}
+
+#if defined( __HCC_ACCELERATOR__ )
+  KOKKOS_INLINE_FUNCTION void team_barrier() { /* __threadfence_block(); */ }
+  KOKKOS_INLINE_FUNCTION int  team_rank() const { return m_idx.local[1] ; } // t_idx.tile[0];
+  KOKKOS_INLINE_FUNCTION int  team_size() const { return m_team_size ; }
+#else
+  KOKKOS_INLINE_FUNCTION void team_barrier() {}
+  KOKKOS_INLINE_FUNCTION int  team_rank() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION int  team_size() const { return 0 ; }
+#endif
+};
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+namespace Kokkos {
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >
+TeamThreadRange
+  ( Impl::TaskExec< Kokkos::Experimental::ROCm > & thread, const iType & count )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >(thread,count);
+}
+
+template<typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
+                                       Impl::TaskExec< Kokkos::Experimental::ROCm > >
+TeamThreadRange
+  ( Impl:: TaskExec< Kokkos::Experimental::ROCm > & thread, const iType1 & begin, const iType2 & end )
+{
+  typedef typename std::common_type<iType1, iType2>::type iType;
+  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::TaskExec< Kokkos::Experimental::ROCm > >(thread, begin, end);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >
+ThreadVectorRange
+  ( Impl::TaskExec< Kokkos::Experimental::ROCm > & thread
+  , const iType & count )
+{
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >(thread,count);
+}
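+// Illustrative use inside a task body (names are hypothetical):
+//   parallel_for( TeamThreadRange( member , N ) ,
+//                 [&]( const int i ) { /* per-iteration work */ } );
+// where 'member' is the TaskExec< ROCm > handed to the task.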
+
+/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team.
+ * This functionality requires C++11 support.
+*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries
+  , const Lambda& lambda
+  )
+{
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i);
+  }
+}
+
+// reduce across corresponding lanes between team members within workgroup
+// assume stride*team_size == workgroup_size
+template< typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void strided_shfl_workgroup_reduction
+  (const ValueType& f(),
+   ValueType& val,
+   int team_size,
+   int stride)
+{
+  for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
+    f(val, Kokkos::shfl_down(val, lane_delta, team_size*stride));
+  }
+}
+
+template< typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void strided_shfl_workgroup_reduction
+  (const JoinType& join,
+   ValueType& val,
+   int team_size,
+   int stride)
+{
+  for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
+    join(val, shfl_down(val, lane_delta, team_size*stride));
+  }
+}
+
+// multiple within-workgroup non-strided reductions
+template< typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void multi_shfl_workgroup_reduction
+  (const JoinType& join,
+   ValueType& val,
+   int vec_length)
+{
+  for (int lane_delta=vec_length>>1; lane_delta; lane_delta>>=1) {
+    join(val, shfl_down(val, lane_delta, vec_length));
+  }
+}
+
+// broadcast within workgroup
+template< class ValueType >
+KOKKOS_INLINE_FUNCTION
+ValueType shfl_workgroup_broadcast
+  (ValueType& val,
+   int src_lane,
+   int width)
+{
+  return shfl(val, src_lane, width);
+}
+
+// all-reduce across corresponding vector lanes between team members within workgroup
+// assume vec_length*team_size == workgroup_size
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+
+template<typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries
+  , const Lambda& lambda
+  , ValueType& initialized_result)
+{
+  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
+  ValueType result = initialized_result;
+  hc::tiled_index<3> idx = loop_boundaries.thread.idx();
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i, result);
+  }
+  initialized_result = result;
+
+  strided_shfl_workgroup_reduction(
+                          [&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
+                          initialized_result,
+                          loop_boundaries.thread.team_size(),
+                          idx.tile_dim[0]);
+  initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, idx.local[0], Impl::ROCmTraits::WavefrontSize );
+
+}
+
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& initialized_result)
+{
+   hc::tiled_index<3> idx = loop_boundaries.thread.idx();
+  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
+  ValueType result = initialized_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i, result);
+  }
+
+  strided_shfl_workgroup_reduction<ValueType, JoinType>(
+                          join,
+                          initialized_result,
+                          loop_boundaries.thread.team_size(),
+                          idx.tile_dim[0]);
+  initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, idx.local[0], Impl::ROCmTraits::WavefrontSize );
+}
+
+// placeholder for future function
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& initialized_result)
+{
+  ValueType result = initialized_result;
+  hc::tiled_index<3> idx = loop_boundaries.thread.idx();
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+
+  initialized_result = result;
+
+  //initialized_result = multi_shfl_workgroup_reduction(
+  multi_shfl_workgroup_reduction(
+                          [&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
+                          initialized_result,
+                          idx.tile_dim[0]);
+  initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, 0, idx.tile_dim[0] );
+}
+
+// placeholder for future function
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& initialized_result)
+{
+  hc::tiled_index<3> idx = loop_boundaries.thread.idx();
+  ValueType result = initialized_result;
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+  initialized_result = result;
+
+  multi_shfl_workgroup_reduction<ValueType, JoinType>(join, initialized_result, idx.tile_dim[0]);
+  initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, 0, idx.tile_dim[0] );
+}
+
+template< typename ValueType, typename iType, class Lambda >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
+   const Lambda & lambda)
+{
+  hc::tiled_index<3> idx = loop_boundaries.thread.idx();
+  ValueType accum = 0 ;
+  ValueType val, y, local_total;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    val = 0;
+    lambda(i,val,false);
+
+    // intra-idx.tile_dim[0] exclusive scan on 'val'
+    // accum = value accumulated from previous loop iterations;
+    // local_total = total contributed by this iteration's chunk
+
+    // INCLUSIVE scan
+    for( int offset = idx.tile_dim[0] ; offset < Impl::ROCmTraits::WavefrontSize ; offset <<= 1 ) {
+      y = shfl_up(val, offset, Impl::ROCmTraits::WavefrontSize);
+      if(idx.local[1]*idx.tile_dim[0] >= offset) { val += y; }
+    }
+
+    // pass accum to all threads
+    local_total = shfl_workgroup_broadcast<ValueType>(val,
+                                            idx.local[0]+Impl::ROCmTraits::WavefrontSize-idx.tile_dim[0],
+                                            Impl::ROCmTraits::WavefrontSize);
+
+    // make EXCLUSIVE scan by shifting values over one
+    val = shfl_up(val, idx.tile_dim[0], Impl::ROCmTraits::WavefrontSize);
+    if ( idx.local[1] == 0 ) { val = 0 ; }
+
+    val += accum;
+    lambda(i,val,true);
+    accum += local_total;
+  }
+}
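+// Worked example (illustrative): if the four team members contribute 1, 2, 3
+// and 4 in one loop iteration, the final lambda calls receive the exclusive
+// prefixes 0, 1, 3 and 6, and accum grows by the chunk total of 10.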
+
+// placeholder for future function
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
+   const Lambda & lambda)
+{
+  hc::tiled_index<3> idx = loop_boundaries.thread.idx();
+  ValueType accum = 0 ;
+  ValueType val, y, local_total;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    val = 0;
+    lambda(i,val,false);
+
+    // intra-idx.tile_dim[0] exclusive scan on 'val'
+    // accum = value accumulated from previous loop iterations;
+    // local_total = total contributed by this iteration's chunk
+
+    // INCLUSIVE scan
+    for( int offset = 1 ; offset < idx.tile_dim[0] ; offset <<= 1 ) {
+      y = shfl_up(val, offset, idx.tile_dim[0]);
+      if(idx.local[0] >= offset) { val += y; }
+    }
+
+    // pass accum to all threads
+    local_total = shfl_workgroup_broadcast<ValueType>(val, idx.tile_dim[0]-1, 
+                                                 idx.tile_dim[0]);
+
+    // make EXCLUSIVE scan by shifting values over one
+    val = shfl_up(val, 1, idx.tile_dim[0]);
+    if ( idx.local[0] == 0 ) { val = 0 ; }
+
+    val += accum;
+    lambda(i,val,true);
+    accum += local_total;
+  }
+}
+
+
+} /* namespace Kokkos */
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
+#endif /* #ifndef KOKKOS_IMPL_ROCM_TASK_HPP */
+
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Tile.hpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Tile.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..12fe72116602a2bdb485f9b36fd7b18bda4de03a
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Tile.hpp
@@ -0,0 +1,518 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <hc.hpp>
+#include <type_traits>
+#include <vector>
+#include <memory>
+#include <ROCm/Kokkos_ROCm_Config.hpp>
+
+#if !defined( KOKKOS_ROCM_TILE_H )
+#define KOKKOS_ROCM_TILE_H
+
+// Macro to abstract out the enable_if craziness
+#define KOKKOS_ROCM_REQUIRES(...) \
+    bool KokkosROCmRequiresBool ## __LINE__ = true, typename std::enable_if<KokkosROCmRequiresBool ## __LINE__ && (__VA_ARGS__), int>::type = 0
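+// Illustrative use (hypothetical function, not part of this header):
+//   template< class T , KOKKOS_ROCM_REQUIRES( std::is_integral<T>::value ) >
+//   void only_for_integral_types( T ) ;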
+
+// hcc workweek 17160 corresponds to the 1.5 release build; anything newer
+// enables the ROCM15 code paths.
+#if __hcc_workweek__ > 17160
+#define ROCM15 1
+#endif
+
+namespace Kokkos {
+namespace Impl {
+
+template<class T>
+
+#if defined(ROCM15)
+using lds_t = T;
+#else
+// prior to 1.5, needed to decorate LDS addresses
+using lds_t = __attribute__((address_space(3))) T;
+#endif
+
+#define KOKKOS_ROCM_TILE_RESTRIC_CPU restrict(cpu, amp)
+
+// a set of routines to the replace the std::routines
+// that will operate on address space 3 types
+
+#if defined(ROCM15)
+// 1.5 can't use std::copy et al for LDS access, so we define our own 
+// set of routines
+template<class I, class O>
+void rcopy(I first, I last, O out) [[hc]]
+{
+    while (first != last) *out++ = *first++;
+}
+template<class I,class F>
+void rfor_each(I first, I last, F f) [[hc]]
+{
+  for(;first!=last;++first) f(*first);
+}
+
+template<class I,class O,class F>
+void rtransform(I first, I last, O out, F f) [[hc]]
+{
+  while(first!=last) *out++ = f(*first++);
+}
+#endif
+
+
+inline std::size_t get_max_tile_size() KOKKOS_ROCM_TILE_RESTRIC_CPU
+{
+    return hc::accelerator().get_max_tile_static_size() - 1024;
+}
+
+inline std::size_t get_max_tile_thread() KOKKOS_ROCM_TILE_RESTRIC_CPU
+{
+    return 64;
+}
+
+inline int next_pow_2(int x) restrict(cpu, amp)
+{ 
+    --x;
+    x |= x >> 1;
+    x |= x >> 2;
+    x |= x >> 4;
+    x |= x >> 8;
+    x |= x >> 16;
+    return x+1;
+}
+
+template<class T>
+inline std::size_t get_tile_size(std::size_t n = 1,
+                                 std::size_t team = 64, 
+                                 std::size_t vector = 1) 
+                                 KOKKOS_ROCM_TILE_RESTRIC_CPU
+{
+    const auto size = sizeof(T) * n;
+    const auto group_size = get_max_tile_size();
+    if (size == 0 || size > group_size) return 0;
+    // Assume that thread size is a power of 2
+    auto thread_size = std::min(team*vector,4*get_max_tile_thread());
+    // ensure that we have enough tile static memory to keep
+    // threadsize * size elements for reductions
+    while(size > (group_size / thread_size) && thread_size > 2) {
+        thread_size /= 2;
+    }
+    return thread_size;
+}
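+// Illustrative sizing (assumes get_max_tile_thread() == 64 as defined above):
+// for T = double and n = 1 the per-thread footprint is 8 bytes, so the initial
+// thread_size of min(team*vector, 256) is kept; for a large per-thread array
+// the loop halves thread_size until the reduction scratch fits in LDS.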
+
+template<class T>
+struct array_view
+{
+    T* x;
+    std::size_t n;
+
+    array_view(T* xp, std::size_t np) [[hc]] [[cpu]] 
+    : x(xp), n(np)
+    {}
+
+    array_view(T* xp, T* yp) [[hc]] [[cpu]] 
+    : x(xp), n(yp-xp)
+    {}
+
+    T& operator[](std::size_t i) const [[hc]] [[cpu]]
+    {
+        return x[i];
+    }
+
+    std::size_t size() const [[hc]] [[cpu]]
+    {
+        return this->n;
+    }
+
+    T* data() const [[hc]] [[cpu]]
+    {
+        return x;
+    }
+
+    T* begin() const [[hc]] [[cpu]]
+    {
+        return x;
+    }
+
+    T* end() const [[hc]] [[cpu]]
+    {
+        return x+this->size();
+    }
+};
+
+template<class T>
+struct rocm_char
+{ using type=char; };
+
+template<class T>
+struct rocm_char<const T>
+: std::add_const<typename rocm_char<T>::type>
+{};
+#if !defined(ROCM15)
+// earlier compilers required explicit address space decorations
+template<class T>
+struct rocm_char<__attribute__((address_space(3))) T>
+{ using type = __attribute__((address_space(3))) typename rocm_char<T>::type; };
+
+template<class T>
+struct rocm_char<const __attribute__((address_space(3))) T>
+{ using type = const __attribute__((address_space(3))) typename rocm_char<T>::type; };
+#endif
+
+template<class T, class Char=typename rocm_char<T>::type>
+Char* rocm_byte_cast(T& x) restrict(cpu, amp)
+{
+    return reinterpret_cast<Char*>(&x);
+}
+
+template<class T, class U>
+void rocm_raw_assign(T& x, const U& y) restrict(cpu, amp)
+{
+    auto * src = rocm_byte_cast(y);
+    auto * dest = rocm_byte_cast(x);
+#if defined (ROCM15)
+    rcopy(src, src+sizeof(T), dest);
+#else
+    std::copy(src, src+sizeof(T), dest);
+#endif
+}
+
+template<class T, class U>
+void rocm_assign_impl(T& x, const U& y, std::true_type) restrict(cpu, amp)
+{
+    rocm_raw_assign(x, y);
+}
+
+template<class T, class U>
+void rocm_assign_impl(T& x, const U& y, std::false_type) restrict(cpu, amp)
+{
+    x = y;
+}
+
+// Workaround for assigning in and out of LDS memory
+template<class T, class U>
+void rocm_assign(T& x, const U& y) restrict(cpu, amp)
+{
+    rocm_assign_impl(x, y, std::integral_constant<bool, (
+        sizeof(T) == sizeof(U)
+    )>());
+}
+
+// Compute the address space of tile
+template<class T>
+struct tile_type
+{
+#if defined (ROCM15)
+    typedef T type;
+#else
+    typedef __attribute__((address_space(3))) T type;
+#endif
+};
+
+#if !defined (ROCM15)
+template<class T, class Body>
+void lds_for(__attribute__((address_space(3))) T& value, Body b) [[hc]]
+{
+    T state = value;
+    b(state);
+    value = state;
+}
+#endif
+
+template<class T, class Body>
+void lds_for(T& value, Body b) [[hc]]
+{
+    b(value);
+}
+
+
+constexpr std::size_t get_max_tile_array_size()
+{
+    return 24;
+}
+
+template<class Derived, class T>
+struct single_action
+{
+    template<class Action>
+    void action_at(std::size_t i, Action a) [[hc]]
+    {
+        auto& value = static_cast<Derived&>(*this)[i];
+#if KOKKOS_ROCM_HAS_WORKAROUNDS
+        T state = value;
+        a(state);
+        value = state;
+#else
+        a(value);
+#endif
+    }
+
+    template<class Action>
+    void action_at(std::size_t i, std::size_t j, Action a) [[hc]]
+    {
+        static_cast<Derived&>(*this).action_at(i, [&](T& x)
+        {
+            static_cast<Derived&>(*this).action_at(j, [&](T& y)
+            {
+                a(x, y);
+            });
+        });
+    }
+};
+
+template<class T>
+struct tile_buffer
+: array_view<typename tile_type<T>::type>, single_action<tile_buffer<T>, T>
+{
+    typedef typename tile_type<T>::type element_type;
+    typedef array_view<element_type> base;
+
+    using base::base;
+
+    tile_buffer(element_type* xp, std::size_t np, std::size_t) [[hc]] [[cpu]] 
+    : base(xp, np)
+    {}
+
+    tile_buffer(T* xp, T* yp, std::size_t) [[hc]] [[cpu]] 
+    : base(xp, yp)
+    {}
+};
+
+template<class T>
+struct tile_buffer<T[]>
+{
+    typedef typename tile_type<T>::type element_type;
+    typedef typename tile_type<char>::type tchar_type;
+    element_type* element_data;
+    std::size_t n, m;
+
+    tile_buffer(element_type* xp, std::size_t np, std::size_t mp) [[hc]] [[cpu]] 
+    : element_data(xp), n(np), m(mp)
+    {}
+
+    tile_buffer(element_type* xp, element_type* yp, std::size_t mp) [[hc]] [[cpu]] 
+    : element_data(xp), n(yp-xp), m(mp)
+    {}
+
+    element_type* operator[](std::size_t i) const [[hc]] [[cpu]]
+    {
+        return element_data+i*m;
+    }
+
+    template<class Action, class Q = T>
+    typename Impl::enable_if< (sizeof(Q) <= 8) , void >::type
+    action_at(std::size_t i, Action a) [[hc]]
+    {
+        element_type* value = (*this)[i];
+#if defined (ROCM15)
+        a(value);
+#else
+#if KOKKOS_ROCM_HAS_WORKAROUNDS
+        if (m > get_max_tile_array_size()) return;
+        T state[get_max_tile_array_size()];
+        // std::copy(value, value+m, state);
+        // Workaround for assigning from LDS memory
+        std::transform(value, value+m, state, [](element_type& x)
+        {
+          T result;
+          rocm_assign(result, x);
+          return result;
+        });
+        a(state);
+        std::copy(state, state+m, value);
+#endif
+#endif
+    }
+
+    template<class Action, class Q = T>
+    typename Impl::enable_if< !(sizeof(Q) <= 8) , void >::type
+    action_at(std::size_t i, Action a) [[hc]]
+    {
+        element_type* value = (*this)[i];
+#if defined (ROCM15)
+        a(value);
+#else
+//#if KOKKOS_ROCM_HAS_WORKAROUNDS
+        if (m > get_max_tile_array_size()) return;
+        T state[get_max_tile_array_size()];
+        // std::copy(value, value+m, state);
+        // Workaround for assigning from LDS memory
+        std::transform(value, value+m, state, [](element_type& x)
+        {
+          T result;
+          rocm_assign(result, x);
+          return result;
+        });
+        a(state);
+        // this workaround is required when T is greater than 8 bytes
+        tile_static char tv[64*sizeof(T)];
+        size_t sT = sizeof(T);
+        for (int j = 0; j<sT; j++) tv[i*sT+j] = ((char *)state)[j];
+        for (int j = 0; j<sT; j++) ((tchar_type *)value)[j] = tv[i*sT+j];
+#endif
+    }
+
+    template<class Action>
+    void action_at(std::size_t i, std::size_t j, Action a) [[hc]]
+    {
+        this->action_at(i, [&](T* x)
+        {
+            this->action_at(j, [&](T* y)
+            {
+                a(x, y);
+            });
+        });
+    }
+
+    std::size_t size() const [[hc]] [[cpu]]
+    {
+        return this->n;
+    }
+
+    element_type* data() const [[hc]] [[cpu]]
+    {
+        return element_data;
+    }
+};
+
+// Zero initialize LDS memory
+struct zero_init_f
+{
+    template<class T>
+#if defined (ROCM15)
+    void operator()(T& x, std::size_t=1) const [[hc]]
+    {
+        auto * start = reinterpret_cast<char*>(&x);
+        for(int i=0; i<sizeof(T);i++) start[i] = 0;
+        rocm_raw_assign(x, T());
+    }
+#else
+    void operator()(__attribute__((address_space(3))) T& x, std::size_t=1) const [[hc]]
+    {
+        auto * start = reinterpret_cast<__attribute__((address_space(3))) char*>(&x);
+        std::fill(start, start+sizeof(T), 0);
+        rocm_raw_assign(x, T());
+    }
+#endif
+
+    template<class T>
+#if defined (ROCM15)
+    void operator()(T* x, std::size_t size) const [[hc]]
+    {
+        rfor_each(x, x+size, *this);
+    }
+#else
+    void operator()(__attribute__((address_space(3))) T* x, std::size_t size) const [[hc]]
+    {
+        std::for_each(x, x+size, *this);
+    }
+#endif
+};
+
+static constexpr zero_init_f zero_init = {};
+
+struct tile_desc
+{
+    // Number of work items, or size of extent
+    std::size_t elements;
+    // number of threads in team 
+    std::size_t team_size;
+    // vector length of team
+    std::size_t vector_length;
+    // Size of tile
+    std::size_t tile_size;
+    // Size of array
+    std::size_t array_size;
+    // Number of tiles
+    std::size_t num_tiles;
+    // Per team reserved LDS memory, used for reduction
+    std::size_t reduce_size;
+    // Per team shared memory in LDS, this in addition to reduce shared mem
+    std::size_t shared_size;
+    std::size_t size;
+};
+
+template<class T>
+tile_desc get_tile_desc(std::size_t size, 
+                        std::size_t array_size=1,
+                        std::size_t team_size=64,
+                        std::size_t vector_size=1,
+                        std::size_t shared_size=0)
+{
+    tile_desc result;
+    result.elements = size;
+    result.array_size = array_size;
+    result.vector_length = vector_size;
+    result.team_size = team_size;
+    result.tile_size = get_tile_size<T>(array_size,team_size,vector_size);
+    result.num_tiles = std::ceil(1.0 * size / result.tile_size);
+    result.reduce_size = result.tile_size * sizeof(T) * array_size;
+    result.shared_size = shared_size;
+    result.size = result.tile_size * result.num_tiles;
+
+    return result;
+}
+
+template<class U, class F, class T=typename std::remove_extent<U>::type>
+hc::completion_future tile_for(tile_desc td, F f) 
+{
+    assert(td.array_size <= get_max_tile_array_size() && "Exceeded max array size");
+    assert(((td.size % td.tile_size) == 0) && "Extent must be divisible by tile size");
+    auto grid = hc::extent<1>(td.size).tile_with_dynamic(
+                          td.tile_size, td.reduce_size + td.shared_size);
+    // grid.set_dynamic_group_segment_size(td.reduce_size + td.shared_size);
+    return parallel_for_each(grid, [=](hc::tiled_index<1> t_idx) [[hc]] 
+    {
+#if defined (ROCM15)
+        typedef T group_t;
+#else
+        typedef __attribute__((address_space(3))) T group_t;
+#endif
+        group_t * buffer = (group_t *)hc::get_dynamic_group_segment_base_pointer();
+        tile_buffer<U> tb(buffer, td.tile_size, td.array_size);
+        zero_init(tb[t_idx.local[0]], td.array_size);
+        f(t_idx, tb);
+    });
+}
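+// Minimal usage sketch (illustrative only; the functor must be [[hc]]-callable):
+//   tile_desc td = get_tile_desc<double>( n );
+//   tile_for<double>( td ,
+//     [=]( hc::tiled_index<1> t_idx , tile_buffer<double> & buf ) [[hc]]
+//     {
+//       buf[ t_idx.local[0] ] = 0.0 ;  // per-thread slot in LDS
+//     });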
+
+}}
+
+#endif
diff --git a/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Vectorization.hpp b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Vectorization.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..36f886109c495564ce2e7b57f73df896a2adc598
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/Kokkos_ROCm_Vectorization.hpp
@@ -0,0 +1,346 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_ROCM_VECTORIZATION_HPP
+#define KOKKOS_ROCM_VECTORIZATION_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if ROCM is enabled for Kokkos */
+#ifdef KOKKOS_ENABLE_ROCM
+
+#include <Kokkos_ROCm.hpp>
+
+namespace Kokkos {
+using namespace hc;
+
+// Shuffle only makes sense on >= Fiji GPUs; it doesn't work on CPUs
+// or other GPUs.  We provide a generic definition (which is trivial
+// and doesn't do what it claims to do) because we don't actually use
+// this function unless we are on a suitable GPU, with a suitable
+// Scalar type.  (For example, in the mat-vec, the "ThreadsPerRow"
+// internal parameter depends both on the ExecutionSpace and the Scalar type,
+// and it controls whether shfl_down() gets called.)
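+// Typical (illustrative) use on the accelerator is a wavefront-wide sum:
+//   for ( int d = width/2 ; d > 0 ; d >>= 1 ) val += shfl_down( val , d , width );
+// which leaves the full sum in lane 0.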
+namespace Impl {
+
+  template< typename Scalar >
+  struct shfl_union {
+    enum {n = sizeof(Scalar)/4};
+    float fval[n];
+    KOKKOS_INLINE_FUNCTION
+    Scalar value() {
+      return *(Scalar*) fval;
+    }
+    KOKKOS_INLINE_FUNCTION
+    void operator= (Scalar& value_) {
+      float* const val_ptr = (float*) &value_;
+      for(int i=0; i<n ; i++) {
+        fval[i] = val_ptr[i];
+      }
+    }
+    KOKKOS_INLINE_FUNCTION
+    void operator= (const Scalar& value_) {
+      float* const val_ptr = (float*) &value_;
+      for(int i=0; i<n ; i++) {
+        fval[i] = val_ptr[i];
+      }
+    }
+
+  };
+}
+
+#ifdef __HCC_ACCELERATOR__
+
+    KOKKOS_INLINE_FUNCTION
+    int __long2loint(const long val ) {
+    union {
+      long l;
+      int i[2];
+    } u;
+      u.l = val;
+      return u.i[0];
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int __long2hiint(const long val ) {
+    union {
+      long l;
+      int i[2];
+    } u;
+      u.l = val;
+      return u.i[1];
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int __double2loint(const double val ) {
+    union {
+      double d;
+      int i[2];
+    } u;
+      u.d = val;
+      return u.i[0];
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int __double2hiint(const double val ) {
+    union {
+      double d;
+      int i[2];
+    } u;
+      u.d = val;
+      return u.i[1];
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    long __hiloint2long(const int hi, const int lo ) {
+    union {
+      long l;
+      int i[2];
+    } u;
+      u.i[0] = lo;
+      u.i[1] = hi;
+      return u.l;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double __hiloint2double(const int hi, const int lo ) {
+    union {
+      double d;
+      int i[2];
+    } u;
+      u.i[0] = lo;
+      u.i[1] = hi;
+      return u.d;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int shfl(const int &val, const int& srcLane, const int& width ) {
+      return __shfl(val,srcLane,width);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float shfl(const float &val, const int& srcLane, const int& width ) {
+      return __shfl(val,srcLane,width);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type& width
+        ) {
+      Scalar tmp1 = val;
+      float tmp = *reinterpret_cast<float*>(&tmp1);
+      tmp = __shfl(tmp,srcLane,width);
+      return *reinterpret_cast<Scalar*>(&tmp);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double shfl(const double &val, const int& srcLane, const int& width) {
+      int lo = __double2loint(val);
+      int hi = __double2hiint(val);
+      lo = __shfl(lo,srcLane,width);
+      hi = __shfl(hi,srcLane,width);
+      return __hiloint2double(hi,lo);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 8) ,int>::type& width) {
+      int lo = __double2loint(*reinterpret_cast<const double*>(&val));
+      int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
+      lo = __shfl(lo,srcLane,width);
+      hi = __shfl(hi,srcLane,width);
+      const double tmp = __hiloint2double(hi,lo);
+      return *(reinterpret_cast<const Scalar*>(&tmp));
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) > 8) ,int>::type& width) {
+      Impl::shfl_union<Scalar> s_val;
+      Impl::shfl_union<Scalar> r_val;
+      s_val = val;
+
+      for(int i = 0; i<s_val.n; i++)
+        r_val.fval[i] = __shfl(s_val.fval[i],srcLane,width);
+      return r_val.value();
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int shfl_down(const int &val, const int& delta, const int& width) {
+      return __shfl_down(val,delta,width);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float shfl_down(const float &val, const int& delta, const int& width) {
+      return __shfl_down(val,delta,width);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
+      Scalar tmp1 = val;
+      float tmp = *reinterpret_cast<float*>(&tmp1);
+      tmp = __shfl_down(tmp,delta,width);
+      return *reinterpret_cast<Scalar*>(&tmp);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    long shfl_down(const long &val, const int& delta, const int& width) {
+      int lo = __long2loint(val);
+      int hi = __long2hiint(val);
+      lo = __shfl_down(lo,delta,width);
+      hi = __shfl_down(hi,delta,width);
+      return __hiloint2long(hi,lo);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double shfl_down(const double &val, const int& delta, const int& width) {
+      int lo = __double2loint(val);
+      int hi = __double2hiint(val);
+      lo = __shfl_down(lo,delta,width);
+      hi = __shfl_down(hi,delta,width);
+      return __hiloint2double(hi,lo);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
+      int lo = __double2loint(*reinterpret_cast<const double*>(&val));
+      int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
+      lo = __shfl_down(lo,delta,width);
+      hi = __shfl_down(hi,delta,width);
+      const double tmp = __hiloint2double(hi,lo);
+      return *(reinterpret_cast<const Scalar*>(&tmp));
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
+      Impl::shfl_union<Scalar> s_val;
+      Impl::shfl_union<Scalar> r_val;
+      s_val = val;
+
+      for(int i = 0; i<s_val.n; i++)
+        r_val.fval[i] = __shfl_down(s_val.fval[i],delta,width);
+      return r_val.value();
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int shfl_up(const int &val, const int& delta, const int& width ) {
+      return __shfl_up(val,delta,width);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float shfl_up(const float &val, const int& delta, const int& width ) {
+      return __shfl_up(val,delta,width);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
+      Scalar tmp1 = val;
+      float tmp = *reinterpret_cast<float*>(&tmp1);
+      tmp = __shfl_up(tmp,delta,width);
+      return *reinterpret_cast<Scalar*>(&tmp);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double shfl_up(const double &val, const int& delta, const int& width ) {
+      int lo = __double2loint(val);
+      int hi = __double2hiint(val);
+      lo = __shfl_up(lo,delta,width);
+      hi = __shfl_up(hi,delta,width);
+      return __hiloint2double(hi,lo);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
+      int lo = __double2loint(*reinterpret_cast<const double*>(&val));
+      int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
+      lo = __shfl_up(lo,delta,width);
+      hi = __shfl_up(hi,delta,width);
+      const double tmp = __hiloint2double(hi,lo);
+      return *(reinterpret_cast<const Scalar*>(&tmp));
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
+      Impl::shfl_union<Scalar> s_val;
+      Impl::shfl_union<Scalar> r_val;
+      s_val = val;
+
+      for(int i = 0; i<s_val.n; i++)
+        r_val.fval[i] = __shfl_up(s_val.fval[i],delta,width);
+      return r_val.value();
+    }
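+
+    // Usage sketch (illustrative, not compiled): shfl_up is the building block
+    // for an inclusive scan across a wavefront.  The width of 64 and the 'lane'
+    // parameter are assumptions for illustration only.
+#if 0
+    KOKKOS_INLINE_FUNCTION
+    double wavefront_inclusive_scan(double my_val, const int lane) {
+      for (int delta = 1; delta < 64; delta <<= 1) {
+        const double lower = shfl_up(my_val, delta, 64);
+        if (lane >= delta) my_val += lower;  // only add if a source lane exists
+      }
+      return my_val;  // lane i holds the sum of lanes 0..i
+    }
+#endif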
+
+#else
+    template<typename Scalar>
+    inline
+    Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl from a device with CC<8.0.");
+      return val;
+    }
+
+    template<typename Scalar>
+    inline
+    Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<8.0.");
+      return val;
+    }
+
+    template<typename Scalar>
+    inline
+    Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<8.0.");
+      return val;
+    }
+#endif
+
+
+
+}
+
+#endif // KOKKOS_ENABLE_ROCM
+#endif
diff --git a/packages/kokkos/core/src/ROCm/hc_math_std.hpp b/packages/kokkos/core/src/ROCm/hc_math_std.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..56c2e634e45b7e7e6ae9af8e244a017217e21850
--- /dev/null
+++ b/packages/kokkos/core/src/ROCm/hc_math_std.hpp
@@ -0,0 +1,367 @@
+#pragma once
+
+#include "hc.hpp"
+#include <cmath>
+
+// Integer arguments to math functions that have floating-point overloads
+// are promoted to this floating-point type.
+#define HC_IMPLICIT_FLOAT_CONV double
+
+#ifdef __KALMAR_ACCELERATOR__
+
+#define HC_MATH_WRAPPER_1(function, arg1) \
+template<typename T> \
+inline T function(T arg1) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(arg1); \
+}
+
+#define KALMAR_MATH_WRAPPER_1(function, arg1) HC_MATH_WRAPPER_1(function, arg1)
+
+#define HC_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) \
+template<typename T> \
+inline \
+typename std::enable_if<std::is_integral<T>::value,HC_IMPLICIT_FLOAT_CONV>::type \
+ function(T arg1) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
+} \
+template<typename T> \
+inline \
+typename std::enable_if<std::is_floating_point <T>::value,T>::type \
+ function(T arg1) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(arg1); \
+}
+
+#define KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) HC_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) 
+
+#define HC_MATH_WRAPPER_2(function, arg1, arg2) \
+template<typename T> \
+inline T function(T arg1, T arg2) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(arg1, arg2); \
+}
+
+#define HC_MATH_ALIAS_2(alias, function, arg1, arg2) \
+template<typename T> \
+inline T alias(T arg1, T arg2) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(arg1, arg2); \
+}
+
+#define HC_MATH_WRAPPER_3(function, arg1, arg2, arg3) \
+template<typename T> \
+inline T function(T arg1, T arg2, T arg3) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(arg1, arg2, arg3); \
+}
+
+#define HC_MATH_WRAPPER_TQ(function, arg1) \
+template<typename T, typename Q> \
+inline T function(Q arg1) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(arg1); \
+}
+
+#define HC_MATH_WRAPPER_FP_OVERLOAD_TQ(function, T, arg1) \
+template<typename Q> \
+inline \
+typename std::enable_if<std::is_integral<Q>::value,T>::type \
+function(Q arg1) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
+}\
+template<typename Q> \
+inline \
+typename std::enable_if<std::is_floating_point<Q>::value,T>::type \
+function(Q arg1) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(arg1); \
+}
+
+#define HC_MATH_WRAPPER_TTQ(function, arg1, arg2) \
+template<typename T, typename Q> \
+inline T function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(arg1, arg2); \
+}
+
+#define HC_MATH_WRAPPER_FP_OVERLOAD_TTQ(function, arg1, arg2) \
+template<typename T, typename Q> \
+inline \
+typename std::enable_if<std::is_integral<T>::value||std::is_integral<Q>::value,HC_IMPLICIT_FLOAT_CONV>::type \
+function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1),static_cast<HC_IMPLICIT_FLOAT_CONV>(arg2)); \
+}\
+template<typename T, typename Q> \
+inline \
+typename std::enable_if<std::is_floating_point<T>::value&&std::is_floating_point<Q>::value,T>::type \
+function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(arg1,arg2); \
+}
+
+#define HC_MATH_WRAPPER_TTTQ(function, arg1, arg2, arg3) \
+template<typename T, typename Q> \
+inline T function(T arg1, T arg2, Q arg3) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(arg1, arg2, arg3); \
+}
+
+#define HC_MATH_WRAPPER_VTQQ(function, arg1, arg2, arg3) \
+template<typename T, typename Q> \
+inline void function(T arg1, Q arg2, Q arg3) __attribute__((hc,cpu)) { \
+  hc::precise_math::function(arg1, arg2, arg3); \
+}
+
+#else
+
+#define HC_MATH_WRAPPER_1(function, arg1) \
+template<typename T> \
+inline T function(T arg1) __attribute__((hc,cpu)) { \
+  return std::function(arg1); \
+}
+
+#define KALMAR_MATH_WRAPPER_1(function, arg1) \
+template<typename T> \
+inline T function(T arg1) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(arg1); \
+}
+
+#define HC_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) \
+template<typename T> \
+inline \
+typename std::enable_if<std::is_integral<T>::value,HC_IMPLICIT_FLOAT_CONV>::type \
+ function(T arg1) __attribute__((hc,cpu)) { \
+  return ::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
+} \
+template<typename T> \
+inline \
+typename std::enable_if<std::is_floating_point <T>::value,T>::type \
+ function(T arg1) __attribute__((hc,cpu)) { \
+  return std::function(arg1); \
+} 
+
+#define KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) \
+template<typename T> \
+inline \
+typename std::enable_if<std::is_integral<T>::value,HC_IMPLICIT_FLOAT_CONV>::type \
+ function(T arg1) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
+} \
+template<typename T> \
+inline \
+typename std::enable_if<std::is_floating_point <T>::value,T>::type \
+ function(T arg1) __attribute__((hc,cpu)) { \
+  return hc::precise_math::function(arg1); \
+}
+
+#define HC_MATH_WRAPPER_2(function, arg1, arg2) \
+template<typename T> \
+inline T function(T arg1, T arg2) __attribute__((hc,cpu)) { \
+  return std::function(arg1, arg2); \
+}
+
+#define HC_MATH_ALIAS_2(alias, function, arg1, arg2) \
+template<typename T> \
+inline T alias(T arg1, T arg2) __attribute__((hc,cpu)) { \
+  return std::function(arg1, arg2); \
+}
+
+#define HC_MATH_WRAPPER_3(function, arg1, arg2, arg3) \
+template<typename T> \
+inline T function(T arg1, T arg2, T arg3) __attribute__((hc,cpu)) { \
+  return std::function(arg1, arg2, arg3); \
+}
+
+#define HC_MATH_WRAPPER_TQ(function, arg1) \
+template<typename T, typename Q> \
+inline T function(Q arg1) __attribute__((hc,cpu)) { \
+  return std::function(arg1); \
+}
+
+#define HC_MATH_WRAPPER_FP_OVERLOAD_TQ(function, T, arg1) \
+template<typename Q> \
+inline \
+typename std::enable_if<std::is_integral<Q>::value,T>::type \
+function(Q arg1) __attribute__((hc)) { \
+  return std::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
+}\
+template<typename Q> \
+inline \
+typename std::enable_if<std::is_floating_point<Q>::value,T>::type \
+function(Q arg1) __attribute__((hc)) { \
+  return std::function(arg1); \
+}
+
+#define HC_MATH_WRAPPER_TTQ(function, arg1, arg2) \
+template<typename T, typename Q> \
+inline T function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
+  return std::function(arg1, arg2); \
+}
+
+#define HC_MATH_WRAPPER_FP_OVERLOAD_TTQ(function, arg1, arg2) \
+template<typename T, typename Q> \
+inline \
+typename std::enable_if<std::is_integral<T>::value||std::is_integral<Q>::value,HC_IMPLICIT_FLOAT_CONV>::type \
+function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
+  return std::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1),static_cast<HC_IMPLICIT_FLOAT_CONV>(arg2)); \
+}\
+template<typename T, typename Q> \
+inline \
+typename std::enable_if<std::is_floating_point<T>::value&&std::is_floating_point<Q>::value,T>::type \
+function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
+  return std::function(arg1,arg2); \
+}
+
+#define HC_MATH_WRAPPER_TTTQ(function, arg1, arg2, arg3) \
+template<typename T, typename Q> \
+inline T function(T arg1, T arg2, Q arg3) __attribute__((hc,cpu)) { \
+  return std::function(arg1, arg2, arg3); \
+}
+
+#define HC_MATH_WRAPPER_VTQQ(function, arg1, arg2, arg3) \
+template<typename T, typename Q> \
+inline void function(T arg1, Q arg2, Q arg3) __attribute__((hc,cpu)) { \
+  std::function(arg1, arg2, arg3); \
+}
+
+#endif
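+
+// Illustrative expansion (not compiled): HC_MATH_WRAPPER_FP_OVERLOAD_1(sqrt, x)
+// in the accelerator branch generates roughly the two overloads below: an
+// integral overload that promotes its argument to HC_IMPLICIT_FLOAT_CONV
+// (double), and a floating-point overload that forwards the argument unchanged.
+#if 0
+template<typename T>
+inline typename std::enable_if<std::is_integral<T>::value, HC_IMPLICIT_FLOAT_CONV>::type
+sqrt(T x) __attribute__((hc,cpu)) {
+  return hc::precise_math::sqrt(static_cast<HC_IMPLICIT_FLOAT_CONV>(x));
+}
+template<typename T>
+inline typename std::enable_if<std::is_floating_point<T>::value, T>::type
+sqrt(T x) __attribute__((hc,cpu)) {
+  return hc::precise_math::sqrt(x);
+}
+#endif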
+
+
+// override global math functions
+namespace std {
+
+// The following math functions are NOT available because they don't have a GPU implementation:
+//
+// erfinv
+// erfcinv
+// fpclassify
+// 
+// The following math functions are NOT available because they don't have a CPU implementation:
+//
+// cospif
+// cospi
+// rsqrtf
+// rsqrt
+// sinpif
+// sinpi
+// tanpi
+//
+
+HC_MATH_WRAPPER_TQ(ilogbf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_TQ(ilogb, int, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isfinite, bool, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isinf, bool, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isnan, bool, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isnormal, bool, x)
+HC_MATH_WRAPPER_TQ(nanf, tagp)
+HC_MATH_WRAPPER_TQ(nan, tagp)
+//HC_MATH_WRAPPER_TQ(signbitf, x)
+HC_MATH_WRAPPER_TQ(signbit, x)
+HC_MATH_WRAPPER_TTQ(frexpf, x, exp)
+HC_MATH_WRAPPER_TTQ(frexp, x, exp)
+HC_MATH_WRAPPER_TTQ(ldexpf, x, exp)
+HC_MATH_WRAPPER_TTQ(ldexp, x, exp)
+HC_MATH_WRAPPER_TTQ(lgammaf, x, exp)
+HC_MATH_WRAPPER_TTQ(lgamma, x, exp)
+HC_MATH_WRAPPER_TTQ(modff, x, exp)
+HC_MATH_WRAPPER_TTQ(modf, x, exp)
+HC_MATH_WRAPPER_TTQ(scalbnf, x, exp)
+HC_MATH_WRAPPER_TTQ(scalbn, x, exp)
+HC_MATH_WRAPPER_TTTQ(remquof, x, y, quo)
+HC_MATH_WRAPPER_TTTQ(remquo, x, y, quo)
+HC_MATH_WRAPPER_VTQQ(sincosf, x, s, c)
+HC_MATH_WRAPPER_VTQQ(sincos, x, s, c)
+
+HC_MATH_WRAPPER_1(acosf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(acos, x)
+HC_MATH_WRAPPER_1(acoshf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(acosh, x)
+HC_MATH_WRAPPER_1(asinf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(asin, x)
+HC_MATH_WRAPPER_1(asinhf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(asinh, x)
+HC_MATH_WRAPPER_1(atanf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(atan, x)
+HC_MATH_WRAPPER_1(atanhf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(atanh, x)
+HC_MATH_WRAPPER_2(atan2f, x, y)
+HC_MATH_WRAPPER_2(atan2, x, y)
+HC_MATH_WRAPPER_1(cbrtf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(cbrt, x)
+HC_MATH_WRAPPER_1(ceilf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(ceil, x)
+HC_MATH_WRAPPER_2(copysignf, x, y)
+HC_MATH_WRAPPER_2(copysign, x, y)
+HC_MATH_WRAPPER_1(cosf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(cos, x)
+HC_MATH_WRAPPER_1(coshf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(cosh, x)
+KALMAR_MATH_WRAPPER_1(cospif, x)
+KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(cospi, x)
+HC_MATH_WRAPPER_1(erff, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(erf, x)
+HC_MATH_WRAPPER_1(erfcf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(erfc, x)
+HC_MATH_WRAPPER_1(expf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(exp, x)
+HC_MATH_WRAPPER_1(exp2f, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(exp2, x)
+HC_MATH_WRAPPER_1(exp10f, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(exp10, x)
+HC_MATH_WRAPPER_1(expm1f, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(expm1, x)
+HC_MATH_WRAPPER_1(fabsf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(fabs, x)
+HC_MATH_WRAPPER_2(fdimf, x, y)
+HC_MATH_WRAPPER_2(fdim, x, y)
+HC_MATH_WRAPPER_1(floorf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(floor, x)
+HC_MATH_WRAPPER_3(fmaf, x, y, z)
+HC_MATH_WRAPPER_3(fma, x, y, z)
+HC_MATH_WRAPPER_2(fmaxf, x, y)
+HC_MATH_WRAPPER_2(fmax, x, y)
+HC_MATH_WRAPPER_2(fminf, x, y)
+HC_MATH_WRAPPER_2(fmin, x, y)
+HC_MATH_WRAPPER_2(fmodf, x, y)
+HC_MATH_WRAPPER_2(fmod, x, y)
+HC_MATH_WRAPPER_2(hypotf, x, y)
+HC_MATH_WRAPPER_2(hypot, x, y)
+HC_MATH_WRAPPER_1(logf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(log, x)
+HC_MATH_WRAPPER_1(log10f, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(log10, x)
+HC_MATH_WRAPPER_1(log2f, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(log2, x)
+HC_MATH_WRAPPER_1(log1pf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(log1p, x)
+HC_MATH_WRAPPER_1(logbf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(logb, x)
+HC_MATH_WRAPPER_1(nearbyintf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(nearbyint, x)
+HC_MATH_WRAPPER_2(nextafterf, x, y)
+HC_MATH_WRAPPER_2(nextafter, x, y)
+HC_MATH_WRAPPER_2(powf, x, y)
+HC_MATH_WRAPPER_FP_OVERLOAD_TTQ(pow,x,y)
+//HC_MATH_WRAPPER_1(rcbrtf, x)
+//HC_MATH_WRAPPER_1(rcbrt, x)
+HC_MATH_WRAPPER_2(remainderf, x, y)
+HC_MATH_WRAPPER_2(remainder, x, y)
+HC_MATH_WRAPPER_1(roundf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(round, x)
+KALMAR_MATH_WRAPPER_1(rsqrtf, x)
+KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(rsqrt, x)
+HC_MATH_WRAPPER_2(scalbf, x, exp)
+HC_MATH_WRAPPER_2(scalb, x, exp)
+HC_MATH_WRAPPER_1(sinf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(sin, x)
+HC_MATH_WRAPPER_1(sinhf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(sinh, x)
+KALMAR_MATH_WRAPPER_1(sinpif, x)
+KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(sinpi, x)
+HC_MATH_WRAPPER_1(sqrtf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(sqrt, x)
+HC_MATH_WRAPPER_1(tgammaf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(tgamma, x)
+HC_MATH_WRAPPER_1(tanf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(tan, x)
+HC_MATH_WRAPPER_1(tanhf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(tanh, x)
+HC_MATH_WRAPPER_1(truncf, x)
+HC_MATH_WRAPPER_FP_OVERLOAD_1(trunc, x)
+
+//HC_MATH_ALIAS_2(min, fmin, x, y)
+//HC_MATH_ALIAS_2(max, fmax, x, y)
+
+} // namespace std
+
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4d0514f471b1524c55b2cc8d49f3674410333b94
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@@ -0,0 +1,832 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_THREADS )
+
+#include <cstdint>
+#include <limits>
+#include <utility>
+#include <iostream>
+#include <sstream>
+
+#include <Kokkos_Core.hpp>
+
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_CPUDiscovery.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+ThreadsExec                  s_threads_process ;
+ThreadsExec                * s_threads_exec[  ThreadsExec::MAX_THREAD_COUNT ] = { 0 };
+pthread_t                    s_threads_pid[   ThreadsExec::MAX_THREAD_COUNT ] = { 0 };
+std::pair<unsigned,unsigned> s_threads_coord[ ThreadsExec::MAX_THREAD_COUNT ];
+
+int s_thread_pool_size[3] = { 0 , 0 , 0 };
+
+unsigned s_current_reduce_size = 0 ;
+unsigned s_current_shared_size = 0 ;
+
+void (* volatile s_current_function)( ThreadsExec & , const void * );
+const void * volatile s_current_function_arg = 0 ;
+
+struct Sentinel {
+  Sentinel()
+  {}
+
+  ~Sentinel()
+  {
+    if ( s_thread_pool_size[0] ||
+         s_thread_pool_size[1] ||
+         s_thread_pool_size[2] ||
+         s_current_reduce_size ||
+         s_current_shared_size ||
+         s_current_function ||
+         s_current_function_arg ||
+         s_threads_exec[0] ) {
+      std::cerr << "ERROR : Process exiting without calling Kokkos::Threads::terminate()" << std::endl ;
+    }
+  }
+};
+
+inline
+unsigned fan_size( const unsigned rank , const unsigned size )
+{
+  const unsigned rank_rev = size - ( rank + 1 );
+  unsigned count = 0 ;
+  for ( unsigned n = 1 ; ( rank_rev + n < size ) && ! ( rank_rev & n ) ; n <<= 1 ) { ++count ; }
+  return count ;
+}
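+
+// Worked example (illustration): for a pool of size 8, thread 7 has reversed
+// rank 0, so the loop admits n = 1, 2, 4 and fan_size(7,8) returns 3; thread 6
+// has reversed rank 1, which fails the (rank_rev & 1) test immediately, so
+// fan_size(6,8) returns 0.  The highest-ranking thread therefore owns the
+// widest fan and serves as the root of the fan-in tree.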
+
+} // namespace
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void execute_function_noop( ThreadsExec & , const void * ) {}
+
+void ThreadsExec::driver(void)
+{
+  SharedAllocationRecord< void, void >::tracking_enable();
+
+  ThreadsExec this_thread ;
+
+  while ( ThreadsExec::Active == this_thread.m_pool_state ) {
+
+    (*s_current_function)( this_thread , s_current_function_arg );
+
+    // Deactivate thread and wait for reactivation
+    this_thread.m_pool_state = ThreadsExec::Inactive ;
+
+    wait_yield( this_thread.m_pool_state , ThreadsExec::Inactive );
+  }
+}
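+
+// Lifecycle sketch (illustration): a spawned worker constructs a ThreadsExec
+// in the Active state and then loops inside driver(): run s_current_function,
+// flip its own state to Inactive, and wait_yield() until the master thread
+// re-activates it for the next launch or marks it Terminating during
+// finalize(), at which point the loop exits and the thread returns.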
+
+ThreadsExec::ThreadsExec()
+  : m_pool_base(0)
+  , m_scratch(0)
+  , m_scratch_reduce_end(0)
+  , m_scratch_thread_end(0)
+  , m_numa_rank(0)
+  , m_numa_core_rank(0)
+  , m_pool_rank(0)
+  , m_pool_size(0)
+  , m_pool_fan_size(0)
+  , m_pool_state( ThreadsExec::Terminating )
+{
+  if ( & s_threads_process != this ) {
+
+    // A spawned thread
+
+    ThreadsExec * const nil = 0 ;
+
+    // Which entry in 's_threads_exec', possibly determined from hwloc binding
+    const int entry = ((size_t)s_current_function_arg) < size_t(s_thread_pool_size[0])
+                    ? ((size_t)s_current_function_arg)
+                    : size_t(Kokkos::hwloc::bind_this_thread( s_thread_pool_size[0] , s_threads_coord ));
+
+    // Given a good entry set this thread in the 's_threads_exec' array
+    if ( entry < s_thread_pool_size[0] &&
+         nil == atomic_compare_exchange( s_threads_exec + entry , nil , this ) ) {
+
+      const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();
+
+      m_numa_rank       = coord.first ;
+      m_numa_core_rank  = coord.second ;
+      m_pool_base       = s_threads_exec ;
+      m_pool_rank       = s_thread_pool_size[0] - ( entry + 1 );
+      m_pool_rank_rev   = s_thread_pool_size[0] - ( pool_rank() + 1 );
+      m_pool_size       = s_thread_pool_size[0] ;
+      m_pool_fan_size   = fan_size( m_pool_rank , m_pool_size );
+      m_pool_state      = ThreadsExec::Active ;
+
+      s_threads_pid[ m_pool_rank ] = pthread_self();
+
+      // Inform spawning process that the threads_exec entry has been set.
+      s_threads_process.m_pool_state = ThreadsExec::Active ;
+    }
+    else {
+      // Inform spawning process that the threads_exec entry could not be set.
+      s_threads_process.m_pool_state = ThreadsExec::Terminating ;
+    }
+  }
+  else {
+    // Enables 'parallel_for' to execute on an uninitialized Threads device
+    m_pool_rank  = 0 ;
+    m_pool_size  = 1 ;
+    m_pool_state = ThreadsExec::Inactive ;
+
+    s_threads_pid[ m_pool_rank ] = pthread_self();
+  }
+}
+
+ThreadsExec::~ThreadsExec()
+{
+  const unsigned entry = m_pool_size - ( m_pool_rank + 1 );
+
+  typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
+
+  if ( m_scratch ) {
+    Record * const r = Record::get_record( m_scratch );
+
+    m_scratch = 0 ;
+
+    Record::decrement( r );
+  }
+
+  m_pool_base   = 0 ;
+  m_scratch_reduce_end = 0 ;
+  m_scratch_thread_end = 0 ;
+  m_numa_rank      = 0 ;
+  m_numa_core_rank = 0 ;
+  m_pool_rank      = 0 ;
+  m_pool_size      = 0 ;
+  m_pool_fan_size  = 0 ;
+
+  m_pool_state  = ThreadsExec::Terminating ;
+
+  if ( & s_threads_process != this && entry < MAX_THREAD_COUNT ) {
+    ThreadsExec * const nil = 0 ;
+
+    atomic_compare_exchange( s_threads_exec + entry , this , nil );
+
+    s_threads_process.m_pool_state = ThreadsExec::Terminating ;
+  }
+}
+
+
+int ThreadsExec::get_thread_count()
+{
+  return s_thread_pool_size[0] ;
+}
+
+ThreadsExec * ThreadsExec::get_thread( const int init_thread_rank )
+{
+  ThreadsExec * const th =
+    init_thread_rank < s_thread_pool_size[0]
+    ? s_threads_exec[ s_thread_pool_size[0] - ( init_thread_rank + 1 ) ] : 0 ;
+
+  if ( 0 == th || th->m_pool_rank != init_thread_rank ) {
+    std::ostringstream msg ;
+    msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : "
+        << "thread " << init_thread_rank << " of " << s_thread_pool_size[0] ;
+    if ( 0 == th ) {
+      msg << " does not exist" ;
+    }
+    else {
+      msg << " has wrong thread_rank " << th->m_pool_rank ;
+    }
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  return th ;
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
+{
+  ThreadsExec::global_lock();
+  ThreadsExec::global_unlock();
+
+  const int n = exec.m_pool_fan_size ;
+  const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
+
+  for ( int i = 0 ; i < n ; ++i ) {
+    Impl::spinwait_while_equal<int>( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+  }
+
+  exec.m_pool_state = ThreadsExec::Inactive ;
+}
+
+}
+}
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void ThreadsExec::verify_is_process( const std::string & name , const bool initialized )
+{
+  if ( ! is_process() ) {
+    std::string msg( name );
+    msg.append( " FAILED : Called by a worker thread, can only be called by the master process." );
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+
+  if ( initialized && 0 == s_thread_pool_size[0] ) {
+    std::string msg( name );
+    msg.append( " FAILED : Threads not initialized." );
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+}
+
+int ThreadsExec::in_parallel()
+{
+  // In parallel when: a thread function is executing, its argument is not the
+  // special threads-process sentinel, and either the master process
+  // participates in the pool or the caller is a worker thread.
+  return s_current_function &&
+         ( & s_threads_process != s_current_function_arg ) &&
+         ( s_threads_process.m_pool_base || ! is_process() );
+}
+
+// Wait for root thread to become inactive
+void ThreadsExec::fence()
+{
+  if ( s_thread_pool_size[0] ) {
+    // Wait for the root thread to complete:
+    Impl::spinwait_while_equal<int>( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
+  }
+
+  s_current_function     = 0 ;
+  s_current_function_arg = 0 ;
+
+  // Make sure function and arguments are cleared before
+  // potentially re-activating threads with a subsequent launch.
+  memory_fence();
+}
+
+/** \brief  Begin execution of the asynchronous functor */
+void ThreadsExec::start( void (*func)( ThreadsExec & , const void * ) , const void * arg )
+{
+  verify_is_process("ThreadsExec::start" , true );
+
+  if ( s_current_function || s_current_function_arg ) {
+    Kokkos::Impl::throw_runtime_exception( std::string( "ThreadsExec::start() FAILED : already executing" ) );
+  }
+
+  s_current_function     = func ;
+  s_current_function_arg = arg ;
+
+  // Make sure function and arguments are written before activating threads.
+  memory_fence();
+
+  // Activate threads:
+  for ( int i = s_thread_pool_size[0] ; 0 < i-- ; ) {
+    s_threads_exec[i]->m_pool_state = ThreadsExec::Active ;
+  }
+
+  if ( s_threads_process.m_pool_size ) {
+    // Master process is the root thread, run it:
+    (*func)( s_threads_process , arg );
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+  }
+}
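+
+// Usage sketch (illustrative, not compiled): the typical launch pattern pairs
+// start() with fence().  The functor below and its use of fan_in() are
+// assumptions for illustration, not an exact caller from this file.
+#if 0
+static void example_driver( ThreadsExec & exec , const void * )
+{
+  // ... per-thread work partitioned by exec.pool_rank() / exec.pool_size() ...
+  exec.fan_in();  // synchronize before this thread deactivates itself
+}
+
+static void launch_and_wait()
+{
+  ThreadsExec::start( & example_driver , 0 );  // activate the pool
+  ThreadsExec::fence();                        // wait for the root to finish
+}
+#endif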
+
+//----------------------------------------------------------------------------
+
+bool ThreadsExec::sleep()
+{
+  verify_is_process("ThreadsExec::sleep", true );
+
+  if ( & execute_sleep == s_current_function ) return false ;
+
+  fence();
+
+  ThreadsExec::global_lock();
+
+  s_current_function = & execute_sleep ;
+
+  // Activate threads:
+  for ( unsigned i = s_thread_pool_size[0] ; 0 < i ; ) {
+    s_threads_exec[--i]->m_pool_state = ThreadsExec::Active ;
+  }
+
+  return true ;
+}
+
+bool ThreadsExec::wake()
+{
+  verify_is_process("ThreadsExec::wake", true );
+
+  if ( & execute_sleep != s_current_function ) return false ;
+
+  ThreadsExec::global_unlock();
+
+  if ( s_threads_process.m_pool_base ) {
+    execute_sleep( s_threads_process , 0 );
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+  }
+
+  fence();
+
+  return true ;
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
+{
+  s_current_function = func ;
+  s_current_function_arg = & s_threads_process ;
+
+  // Make sure function and arguments are written before activating threads.
+  memory_fence();
+
+  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
+
+  for ( unsigned i = s_thread_pool_size[0] ; begin < i ; ) {
+    ThreadsExec & th = * s_threads_exec[ --i ];
+
+    th.m_pool_state = ThreadsExec::Active ;
+
+    wait_yield( th.m_pool_state , ThreadsExec::Active );
+  }
+
+  if ( s_threads_process.m_pool_base ) {
+    s_threads_process.m_pool_state = ThreadsExec::Active ;
+    (*func)( s_threads_process , 0 );
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+  }
+
+  s_current_function_arg = 0 ;
+  s_current_function = 0 ;
+
+  // Make sure function and arguments are cleared before proceeding.
+  memory_fence();
+}
+
+//----------------------------------------------------------------------------
+
+void * ThreadsExec::root_reduce_scratch()
+{
+  return s_threads_process.reduce_memory();
+}
+
+void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
+{
+  typedef Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
+
+  if ( exec.m_scratch ) {
+    Record * const r = Record::get_record( exec.m_scratch );
+
+    exec.m_scratch = 0 ;
+
+    Record::decrement( r );
+  }
+
+  exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ;
+  exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ;
+
+  if ( s_threads_process.m_scratch_thread_end ) {
+
+    // Allocate tracked memory:
+    {
+      Record * const r = Record::allocate( Kokkos::HostSpace() , "thread_scratch" , s_threads_process.m_scratch_thread_end );
+
+      Record::increment( r );
+
+      exec.m_scratch = r->data();
+    }
+
+    unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch );
+
+    unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned);
+
+    // First-touch the allocation on this thread
+    while ( ptr < end ) *ptr++ = 0 ;
+  }
+}
+
+void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size )
+{
+  enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
+
+  fence();
+
+  const size_t old_reduce_size = s_threads_process.m_scratch_reduce_end ;
+  const size_t old_thread_size = s_threads_process.m_scratch_thread_end - s_threads_process.m_scratch_reduce_end ;
+
+  reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+  thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+
+  // Increase size or deallocate completely.
+
+  if ( ( old_reduce_size < reduce_size ) ||
+       ( old_thread_size < thread_size ) ||
+       ( ( reduce_size == 0 && thread_size == 0 ) &&
+         ( old_reduce_size != 0 || old_thread_size != 0 ) ) ) {
+
+    verify_is_process( "ThreadsExec::resize_scratch" , true );
+
+    s_threads_process.m_scratch_reduce_end = reduce_size ;
+    s_threads_process.m_scratch_thread_end = reduce_size + thread_size ;
+
+    execute_serial( & execute_resize_scratch );
+
+    s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ;
+  }
+
+  return s_threads_process.m_scratch ;
+}
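+
+// Worked example (assuming MEMORY_ALIGNMENT == 64 for illustration): ALIGN_MASK
+// is then 63, so a request of reduce_size = 100 is rounded up via
+// (100 + 63) & ~63 == 128; both sizes are padded to the next multiple of the
+// alignment before the [0, reduce_end) | [reduce_end, thread_end) scratch
+// layout is recorded and propagated to every thread by execute_serial().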
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::print_configuration( std::ostream & s , const bool detail )
+{
+  verify_is_process("ThreadsExec::print_configuration",false);
+
+  fence();
+
+  const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+  const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+  const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+  // Forestall compiler warnings for unused variables.
+  (void) numa_count;
+  (void) cores_per_numa;
+  (void) threads_per_core;
+
+  s << "Kokkos::Threads" ;
+
+#if defined( KOKKOS_ENABLE_THREADS )
+  s << " KOKKOS_ENABLE_THREADS" ;
+#endif
+#if defined( KOKKOS_ENABLE_HWLOC )
+  s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ;
+#endif
+
+  if ( s_thread_pool_size[0] ) {
+    s << " threads[" << s_thread_pool_size[0] << "]"
+      << " threads_per_numa[" << s_thread_pool_size[1] << "]"
+      << " threads_per_core[" << s_thread_pool_size[2] << "]"
+      ;
+    if ( 0 == s_threads_process.m_pool_base ) { s << " Asynchronous" ; }
+    s << " ReduceScratch[" << s_current_reduce_size << "]"
+      << " SharedScratch[" << s_current_shared_size << "]" ;
+    s << std::endl ;
+
+    if ( detail ) {
+
+      for ( int i = 0 ; i < s_thread_pool_size[0] ; ++i ) {
+
+        ThreadsExec * const th = s_threads_exec[i] ;
+
+        if ( th ) {
+
+          const int rank_rev = th->m_pool_size - ( th->m_pool_rank + 1 );
+
+          s << " Thread[ " << th->m_pool_rank << " : "
+            << th->m_numa_rank << "." << th->m_numa_core_rank << " ]" ;
+
+          s << " Fan{" ;
+          for ( int j = 0 ; j < th->m_pool_fan_size ; ++j ) {
+            ThreadsExec * const thfan = th->m_pool_base[rank_rev+(1<<j)] ;
+            s << " [ " << thfan->m_pool_rank << " : "
+              << thfan->m_numa_rank << "." << thfan->m_numa_core_rank << " ]" ;
+          }
+          s << " }" ;
+
+          if ( th == & s_threads_process ) {
+            s << " is_process" ;
+          }
+        }
+        s << std::endl ;
+      }
+    }
+  }
+  else {
+    s << " not initialized" << std::endl ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+int ThreadsExec::is_initialized()
+{ return 0 != s_threads_exec[0] ; }
+
+void ThreadsExec::initialize( unsigned thread_count ,
+                              unsigned use_numa_count ,
+                              unsigned use_cores_per_numa ,
+                              bool allow_asynchronous_threadpool )
+{
+  static const Sentinel sentinel ;
+
+  const bool is_initialized = 0 != s_thread_pool_size[0] ;
+
+  unsigned thread_spawn_failed = 0 ;
+
+  for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++)
+    s_threads_exec[i] = NULL;
+
+  if ( ! is_initialized ) {
+
+    // If thread_count, use_numa_count, or use_cores_per_numa is zero
+    // then it is given a default value based upon hwloc detection
+    // and whether an asynchronous thread pool is allowed.
+
+    const bool hwloc_avail = Kokkos::hwloc::available();
+    const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads();
+
+    if ( thread_count == 0 ) {
+      thread_count = hwloc_avail
+      ? Kokkos::hwloc::get_available_numa_count() *
+        Kokkos::hwloc::get_available_cores_per_numa() *
+        Kokkos::hwloc::get_available_threads_per_core()
+      : 1 ;
+    }
+
+    const unsigned thread_spawn_begin =
+      hwloc::thread_mapping( "Kokkos::Threads::initialize" ,
+                             allow_asynchronous_threadpool ,
+                             thread_count ,
+                             use_numa_count ,
+                             use_cores_per_numa ,
+                             s_threads_coord );
+
+    const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ;
+
+    if ( thread_spawn_begin ) {
+      // Synchronous with s_threads_coord[0] as the process core
+      // Claim entry #0 for binding the process core.
+      s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u);
+    }
+
+    s_thread_pool_size[0] = thread_count ;
+    s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ;
+    s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ;
+    s_current_function = & execute_function_noop ; // Initialization work function
+
+    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
+
+      s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+
+      // If hwloc can bind threads then the spawned thread will
+      // choose its own entry in 's_threads_coord' ;
+      // otherwise specify the entry here.
+      s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_can_bind ? ~0u : ith );
+
+      // Make sure all outstanding memory writes are complete
+      // before spawning the new thread.
+      memory_fence();
+
+      // Spawn thread executing the 'driver()' function.
+      // Wait until spawned thread has attempted to initialize.
+      // If spawning and initialization are successful then
+      // an entry in 's_threads_exec' will be assigned.
+      if ( ThreadsExec::spawn() ) {
+        wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
+      }
+      if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ;
+    }
+
+    // Wait for all spawned threads to deactivate before zeroing the function.
+
+    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
+      // Try to protect against cache coherency failure by casting to volatile.
+      ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ;
+      if ( th ) {
+        wait_yield( th->m_pool_state , ThreadsExec::Active );
+      }
+      else {
+        ++thread_spawn_failed ;
+      }
+    }
+
+    s_current_function     = 0 ;
+    s_current_function_arg = 0 ;
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+
+    memory_fence();
+
+    if ( ! thread_spawn_failed ) {
+      // Bind process to the core on which it was located before spawning occurred
+      if (hwloc_can_bind) {
+        Kokkos::hwloc::bind_this_thread( proc_coord );
+      }
+
+      if ( thread_spawn_begin ) { // Include process in pool.
+        const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();
+
+        s_threads_exec[0]                   = & s_threads_process ;
+        s_threads_process.m_numa_rank       = coord.first ;
+        s_threads_process.m_numa_core_rank  = coord.second ;
+        s_threads_process.m_pool_base       = s_threads_exec ;
+        s_threads_process.m_pool_rank       = thread_count - 1 ; // Reversed for scan-compatible reductions
+        s_threads_process.m_pool_size       = thread_count ;
+        s_threads_process.m_pool_fan_size   = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size );
+        s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self();
+      }
+      else {
+        s_threads_process.m_pool_base = 0 ;
+        s_threads_process.m_pool_rank = 0 ;
+        s_threads_process.m_pool_size = 0 ;
+        s_threads_process.m_pool_fan_size = 0 ;
+      }
+
+      // Initial allocations:
+      ThreadsExec::resize_scratch( 1024 , 1024 );
+    }
+    else {
+      s_thread_pool_size[0] = 0 ;
+      s_thread_pool_size[1] = 0 ;
+      s_thread_pool_size[2] = 0 ;
+    }
+  }
+
+  if ( is_initialized || thread_spawn_failed ) {
+
+    std::ostringstream msg ;
+
+    msg << "Kokkos::Threads::initialize ERROR" ;
+
+    if ( is_initialized ) {
+      msg << " : already initialized" ;
+    }
+    if ( thread_spawn_failed ) {
+      msg << " : failed to spawn " << thread_spawn_failed << " threads" ;
+    }
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  // Check for over-subscription
+  if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) {
+    std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
+    std::cout << "                                    Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
+    std::cout << "                                    Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
+    std::cout << "                                    Requested: " << thread_count << " threads per process." << std::endl;
+  }
+
+  // Initialize the lock array used for arbitrarily sized atomics
+  Impl::init_lock_array_host_space();
+
+  Impl::SharedAllocationRecord< void, void >::tracking_enable();
+
+  #if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+  #endif
+}
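+
+// Usage sketch (illustrative, not compiled): a host program normally drives
+// this backend through Kokkos::initialize()/finalize(); calling the backend
+// directly would look roughly like the following.  The thread count of 4 is
+// an arbitrary assumption.
+#if 0
+void example()
+{
+  ThreadsExec::initialize( 4 /*threads*/ , 0 /*numa*/ , 0 /*cores per numa*/ , false );
+  ThreadsExec::print_configuration( std::cout , true );
+  ThreadsExec::finalize();
+}
+#endif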
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::finalize()
+{
+  verify_is_process("ThreadsExec::finalize",false);
+
+  fence();
+
+  resize_scratch(0,0);
+
+  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
+
+  for ( unsigned i = s_thread_pool_size[0] ; begin < i-- ; ) {
+
+    if ( s_threads_exec[i] ) {
+
+      s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating ;
+
+      wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
+
+      s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+    }
+
+    s_threads_pid[i] = 0 ;
+  }
+
+  if ( s_threads_process.m_pool_base ) {
+    ( & s_threads_process )->~ThreadsExec();
+    s_threads_exec[0] = 0 ;
+  }
+
+  if (Kokkos::hwloc::can_bind_threads() ) {
+    Kokkos::hwloc::unbind_this_thread();
+  }
+
+  s_thread_pool_size[0] = 0 ;
+  s_thread_pool_size[1] = 0 ;
+  s_thread_pool_size[2] = 0 ;
+
+  // Reset master thread to run solo.
+  s_threads_process.m_numa_rank       = 0 ;
+  s_threads_process.m_numa_core_rank  = 0 ;
+  s_threads_process.m_pool_base       = 0 ;
+  s_threads_process.m_pool_rank       = 0 ;
+  s_threads_process.m_pool_size       = 1 ;
+  s_threads_process.m_pool_fan_size   = 0 ;
+  s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+
+  #if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+  #endif
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+int Threads::concurrency() {
+  return thread_pool_size(0);
+}
+
+Threads & Threads::instance(int)
+{
+  static Threads t ;
+  return t ;
+}
+
+int Threads::thread_pool_size( int depth )
+{
+  return Impl::s_thread_pool_size[depth];
+}
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+int Threads::thread_pool_rank()
+{
+  const pthread_t pid = pthread_self();
+  int i = 0;
+  while ( ( i < Impl::s_thread_pool_size[0] ) && ( pid != Impl::s_threads_pid[i] ) ) { ++i ; }
+  return i ;
+}
+#endif
+
+const char* Threads::name() { return "Threads"; }
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+#else
+void KOKKOS_CORE_SRC_THREADS_EXEC_PREVENT_LINK_ERROR() {}
+#endif /* #if defined( KOKKOS_ENABLE_THREADS ) */
+
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6f5c461f3e6b3ac26a813786bf834104bacdde8d
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
@@ -0,0 +1,702 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADSEXEC_HPP
+#define KOKKOS_THREADSEXEC_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_THREADS )
+
+#include <cstdio>
+
+#include <utility>
+#include <impl/Kokkos_Spinwait.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+#include <Kokkos_Atomic.hpp>
+
+#include <Kokkos_UniqueToken.hpp>
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+class ThreadsExec {
+public:
+
+  // Fan array has log_2(NT) reduction threads plus 2 scan threads
+  // Currently limited to 16k threads.
+  enum { MAX_FAN_COUNT    = 16 };
+  enum { MAX_THREAD_COUNT = 1 << ( MAX_FAN_COUNT - 2 ) };
+  enum { VECTOR_LENGTH    = 8 };
+
+  /** \brief States of a worker thread */
+  enum { Terminating ///<  Termination in progress
+       , Inactive    ///<  Exists, waiting for work
+       , Active      ///<  Exists, performing work
+       , Rendezvous  ///<  Exists, waiting in a barrier or reduce
+
+       , ScanCompleted
+       , ScanAvailable
+       , ReductionAvailable
+       };
+
+private:
+
+  friend class Kokkos::Threads ;
+
+  // The root of fan-in operations is the highest-ranking thread, so that
+  // the 'scan' reduction's intermediate values end up on the threads
+  // that need them.
+  // For a simple reduction the root's location is arbitrary.
+
+  ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in
+
+  void *        m_scratch ;
+  int           m_scratch_reduce_end ;
+  int           m_scratch_thread_end ;
+  int           m_numa_rank ;
+  int           m_numa_core_rank ;
+  int           m_pool_rank ;
+  int           m_pool_rank_rev ;
+  int           m_pool_size ;
+  int           m_pool_fan_size ;
+  int volatile  m_pool_state ;  ///< State for global synchronizations
+
+  // Members for dynamic scheduling
+  // Which thread am I stealing from currently
+  int m_current_steal_target;
+  // This thread's owned work_range
+  Kokkos::pair<long,long> m_work_range __attribute__((aligned(16))) ;
+  // Team Offset if one thread determines work_range for others
+  long m_team_work_index;
+
+  // Is this thread stealing (i.e. its owned work_range is exhausted)?
+  bool m_stealing;
+
+  static void global_lock();
+  static void global_unlock();
+  static bool spawn();
+
+  static void execute_resize_scratch( ThreadsExec & , const void * );
+  static void execute_sleep(          ThreadsExec & , const void * );
+
+  ThreadsExec( const ThreadsExec & );
+  ThreadsExec & operator = ( const ThreadsExec & );
+
+  static void execute_serial( void (*)( ThreadsExec & , const void * ) );
+
+public:
+
+  KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size ; }
+  KOKKOS_INLINE_FUNCTION int pool_rank() const { return m_pool_rank ; }
+  KOKKOS_INLINE_FUNCTION int numa_rank() const { return m_numa_rank ; }
+  KOKKOS_INLINE_FUNCTION int numa_core_rank() const { return m_numa_core_rank ; }
+  inline long team_work_index() const { return m_team_work_index ; }
+
+  static int get_thread_count();
+  static ThreadsExec * get_thread( const int init_thread_rank );
+
+  inline void * reduce_memory() const { return m_scratch ; }
+  KOKKOS_INLINE_FUNCTION  void * scratch_memory() const
+    { return reinterpret_cast<unsigned char *>(m_scratch) + m_scratch_reduce_end ; }
+
+  KOKKOS_INLINE_FUNCTION  int volatile & state() { return m_pool_state ; }
+  KOKKOS_INLINE_FUNCTION  ThreadsExec * const * pool_base() const { return m_pool_base ; }
+
+  static void driver(void);
+
+  ~ThreadsExec();
+  ThreadsExec();
+
+  static void * resize_scratch( size_t reduce_size , size_t thread_size );
+
+  static void * root_reduce_scratch();
+
+  static bool is_process();
+
+  static void verify_is_process( const std::string & , const bool initialized );
+
+  static int is_initialized();
+
+  static void initialize( unsigned thread_count ,
+                          unsigned use_numa_count ,
+                          unsigned use_cores_per_numa ,
+                          bool allow_asynchronous_threadpool );
+
+  static void finalize();
+
+  /* Given a requested team size, return a valid team size */
+  static unsigned team_size_valid( unsigned );
+
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  //------------------------------------
+
+  static void wait_yield( volatile int & , const int );
+
+  //------------------------------------
+  // All-thread functions:
+
+  inline
+  int all_reduce( const int value )
+    {
+      // Make sure there is enough scratch space:
+      const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
+
+      *((volatile int*) reduce_memory()) = value ;
+
+      memory_fence();
+
+      // Fan-in reduction with highest ranking thread as the root
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        // Wait: Active -> Rendezvous
+        Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+      }
+
+      if ( rev_rank ) {
+        m_pool_state = ThreadsExec::Rendezvous ;
+        // Wait: Rendezvous -> Active
+        Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
+      }
+      else {
+        // Root thread does the reduction and broadcast
+
+        int accum = 0 ;
+
+        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
+          accum += *((volatile int *) get_thread( rank )->reduce_memory());
+        }
+
+        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
+          *((volatile int *) get_thread( rank )->reduce_memory()) = accum ;
+        }
+
+        memory_fence();
+
+        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
+          get_thread( rank )->m_pool_state = ThreadsExec::Active ;
+        }
+      }
+
+      return *((volatile int*) reduce_memory());
+    }
+
+  inline
+  void barrier( )
+    {
+      // Make sure there is enough scratch space:
+      const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
+
+      memory_fence();
+
+      // Fan-in reduction with highest ranking thread as the root
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        // Wait: Active -> Rendezvous
+        Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+      }
+
+      if ( rev_rank ) {
+        m_pool_state = ThreadsExec::Rendezvous ;
+        // Wait: Rendezvous -> Active
+        Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
+      }
+      else {
+        // Root thread does the reduction and broadcast
+
+        memory_fence();
+
+        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
+          get_thread( rank )->m_pool_state = ThreadsExec::Active ;
+        }
+      }
+    }
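+
+  // Handshake sketch (illustration): in barrier(), every non-root thread first
+  // waits for its own fan-in partners to leave the Active state, then parks
+  // itself in Rendezvous; the root (reversed rank 0) waits for its partners,
+  // then flips every thread back to Active, releasing the whole pool at once.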
+
+  //------------------------------------
+  // All-thread functions:
+
+  template< class FunctorType , class ArgTag >
+  inline
+  void fan_in_reduce( const FunctorType & f ) const
+    {
+      typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > Join ;
+      typedef Kokkos::Impl::FunctorFinal<     FunctorType , ArgTag > Final ;
+
+      const int rev_rank  = m_pool_size - ( m_pool_rank + 1 );
+
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+
+        ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
+
+        Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::Active );
+
+        Join::join( f , reduce_memory() , fan.reduce_memory() );
+      }
+
+      if ( ! rev_rank ) {
+        Final::final( f , reduce_memory() );
+      }
+
+      //  This thread has updated 'reduce_memory()' and upon returning
+      //  from this function will set 'm_pool_state' to inactive.
+      //  If this is a non-root thread then setting 'm_pool_state'
+      //  to inactive triggers another thread to exit a spinwait
+      //  and read the 'reduce_memory'.
+      //  Must 'memory_fence()' to guarantee that storing the update to
+      //  'reduce_memory()' will complete before storing the update to
+      //  'm_pool_state'.
+
+      memory_fence();
+    }
+
+  inline
+  void fan_in() const
+    {
+      const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
+
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        Impl::spinwait_while_equal<int>( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
+      }
+    }
+
+  template< class FunctorType , class ArgTag >
+  inline
+  void scan_large( const FunctorType & f )
+    {
+      // Sequence of states:
+      //  0) Active             : entry and exit state
+      //  1) ReductionAvailable : reduction value available
+      //  2) ScanAvailable      : inclusive scan value available
+      //  3) Rendezvous         : All threads inclusive scan value are available
+      //  4) ScanCompleted      : exclusive scan value copied
+
+      typedef Kokkos::Impl::FunctorValueTraits< FunctorType , ArgTag > Traits ;
+      typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , ArgTag > Join ;
+      typedef Kokkos::Impl::FunctorValueInit<   FunctorType , ArgTag > Init ;
+
+      typedef typename Traits::value_type scalar_type ;
+
+      const int      rev_rank = m_pool_size - ( m_pool_rank + 1 );
+      const unsigned count    = Traits::value_count( f );
+
+      scalar_type * const work_value = (scalar_type *) reduce_memory();
+
+      //--------------------------------
+      // Fan-in reduction with highest ranking thread as the root
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
+
+        // Wait: Active -> ReductionAvailable (or ScanAvailable)
+        Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::Active );
+        Join::join( f , work_value , fan.reduce_memory() );
+      }
+
+      // Copy reduction value to scan value before releasing from this phase.
+      for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i] ; }
+
+      if ( rev_rank ) {
+
+        // Set: Active -> ReductionAvailable
+        m_pool_state = ThreadsExec::ReductionAvailable ;
+
+        // Wait for contributing threads' scan value to be available.
+        if ( ( 1 << m_pool_fan_size ) < ( m_pool_rank + 1 ) ) {
+          ThreadsExec & th = *m_pool_base[ rev_rank + ( 1 << m_pool_fan_size ) ] ;
+
+          // Wait: Active             -> ReductionAvailable
+          // Wait: ReductionAvailable -> ScanAvailable
+          Impl::spinwait_while_equal<int>( th.m_pool_state , ThreadsExec::Active );
+          Impl::spinwait_while_equal<int>( th.m_pool_state , ThreadsExec::ReductionAvailable );
+
+          Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
+        }
+
+        // This thread has completed inclusive scan
+        // Set: ReductionAvailable -> ScanAvailable
+        m_pool_state = ThreadsExec::ScanAvailable ;
+
+        // Wait for all threads to complete inclusive scan
+        // Wait: ScanAvailable -> Rendezvous
+        Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::ScanAvailable );
+      }
+
+      //--------------------------------
+
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
+        // Wait: ReductionAvailable -> ScanAvailable
+        Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::ReductionAvailable );
+        // Set: ScanAvailable -> Rendezvous
+        fan.m_pool_state = ThreadsExec::Rendezvous ;
+      }
+
+      // All threads have completed the inclusive scan.
+      // All non-root threads are in the Rendezvous state.
+      // Threads are free to overwrite their reduction value.
+      //--------------------------------
+
+      if ( ( rev_rank + 1 ) < m_pool_size ) {
+        // Exclusive scan: copy the previous thread's inclusive scan value
+
+        ThreadsExec & th = *m_pool_base[ rev_rank + 1 ] ; // Not the root thread
+
+        const scalar_type * const src_value = ((scalar_type *)th.reduce_memory()) + count ;
+
+        for ( unsigned j = 0 ; j < count ; ++j ) { work_value[j] = src_value[j]; }
+      }
+      else {
+        (void) Init::init( f , work_value );
+      }
+
+      //--------------------------------
+      // Wait for all threads to copy previous thread's inclusive scan value
+      // Wait for all threads: Rendezvous -> ScanCompleted
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
+      }
+      if ( rev_rank ) {
+        // Set: ScanAvailable -> ScanCompleted
+        m_pool_state = ThreadsExec::ScanCompleted ;
+        // Wait: ScanCompleted -> Active
+        Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::ScanCompleted );
+      }
+      // Set: ScanCompleted -> Active
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ;
+      }
+    }
+
+  template< class FunctorType , class ArgTag >
+  inline
+  void scan_small( const FunctorType & f )
+    {
+      typedef Kokkos::Impl::FunctorValueTraits< FunctorType , ArgTag > Traits ;
+      typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , ArgTag > Join ;
+      typedef Kokkos::Impl::FunctorValueInit<   FunctorType , ArgTag > Init ;
+
+      typedef typename Traits::value_type scalar_type ;
+
+      const int      rev_rank = m_pool_size - ( m_pool_rank + 1 );
+      const unsigned count    = Traits::value_count( f );
+
+      scalar_type * const work_value = (scalar_type *) reduce_memory();
+
+      //--------------------------------
+      // Fan-in reduction with highest ranking thread as the root
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        // Wait: Active -> Rendezvous
+        Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+      }
+
+      for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
+
+      if ( rev_rank ) {
+        m_pool_state = ThreadsExec::Rendezvous ;
+        // Wait: Rendezvous -> Active
+        Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
+      }
+      else {
+        // Root thread does the thread-scan before releasing threads
+
+        scalar_type * ptr_prev = 0 ;
+
+        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
+          scalar_type * const ptr = (scalar_type *) get_thread( rank )->reduce_memory();
+          if ( rank ) {
+            for ( unsigned i = 0 ; i < count ; ++i ) { ptr[i] = ptr_prev[ i + count ]; }
+            Join::join( f , ptr + count , ptr );
+          }
+          else {
+            (void) Init::init( f , ptr );
+          }
+          ptr_prev = ptr ;
+        }
+      }
+
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ;
+      }
+    }
+
+  //------------------------------------
+  /** \brief  Wait for previous asynchronous functor to
+   *          complete and release the Threads device.
+   *          Acquire the Threads device and start this functor.
+   */
+  static void start( void (*)( ThreadsExec & , const void * ) , const void * );
+
+  static int  in_parallel();
+  static void fence();
+  static bool sleep();
+  static bool wake();
+
+  /* Dynamic Scheduling related functionality */
+  // Initialize the work range for this thread
+  inline void set_work_range(const long& begin, const long& end, const long& chunk_size) {
+    m_work_range.first = (begin+chunk_size-1)/chunk_size;
+    m_work_range.second = end>0?(end+chunk_size-1)/chunk_size:m_work_range.first;
+  }
+
+  // Claim an index from this thread's range, starting from the beginning
+  inline long get_work_index_begin () {
+    Kokkos::pair<long,long> work_range_new = m_work_range;
+    Kokkos::pair<long,long> work_range_old = work_range_new;
+    if(work_range_old.first>=work_range_old.second)
+      return -1;
+
+    work_range_new.first+=1;
+
+    bool success = false;
+    while(!success) {
+      work_range_new = Kokkos::atomic_compare_exchange(&m_work_range,work_range_old,work_range_new);
+      success = ( (work_range_new == work_range_old) ||
+                  (work_range_new.first>=work_range_new.second));
+      work_range_old = work_range_new;
+      work_range_new.first+=1;
+    }
+    if(work_range_old.first<work_range_old.second)
+      return work_range_old.first;
+    else
+      return -1;
+  }
+
+  // Claim an index from this thread's range, starting from the end
+  inline long get_work_index_end () {
+    Kokkos::pair<long,long> work_range_new = m_work_range;
+    Kokkos::pair<long,long> work_range_old = work_range_new;
+    if(work_range_old.first>=work_range_old.second)
+      return -1;
+    work_range_new.second-=1;
+    bool success = false;
+    while(!success) {
+      work_range_new = Kokkos::atomic_compare_exchange(&m_work_range,work_range_old,work_range_new);
+      success = ( (work_range_new == work_range_old) ||
+                  (work_range_new.first>=work_range_new.second) );
+      work_range_old = work_range_new;
+      work_range_new.second-=1;
+    }
+    if(work_range_old.first<work_range_old.second)
+      return work_range_old.second-1;
+    else
+      return -1;
+  }
+
+  // Reset the steal target
+  inline void reset_steal_target() {
+    m_current_steal_target = (m_pool_rank+1)%pool_size();
+    m_stealing = false;
+  }
+
+  // Reset the steal target
+  inline void reset_steal_target(int team_size) {
+    m_current_steal_target = (m_pool_rank_rev+team_size);
+    if(m_current_steal_target>=pool_size())
+      m_current_steal_target = 0;//pool_size()-1;
+    m_stealing = false;
+  }
+
+  // Get a steal target: start with my rank + 1 and go round robin until arriving back at this thread's rank
+  // Returns -1 if no active steal target is available
+  inline int get_steal_target() {
+    while(( m_pool_base[m_current_steal_target]->m_work_range.second <=
+            m_pool_base[m_current_steal_target]->m_work_range.first  ) &&
+          (m_current_steal_target!=m_pool_rank) ) {
+      m_current_steal_target = (m_current_steal_target+1)%pool_size();
+    }
+    if(m_current_steal_target == m_pool_rank)
+      return -1;
+    else
+      return m_current_steal_target;
+  }
+
+  inline int get_steal_target(int team_size) {
+
+    while(( m_pool_base[m_current_steal_target]->m_work_range.second <=
+            m_pool_base[m_current_steal_target]->m_work_range.first  ) &&
+          (m_current_steal_target!=m_pool_rank_rev) ) {
+      if(m_current_steal_target + team_size < pool_size())
+        m_current_steal_target = (m_current_steal_target+team_size);
+      else
+        m_current_steal_target = 0;
+    }
+
+    if(m_current_steal_target == m_pool_rank_rev)
+      return -1;
+    else
+      return m_current_steal_target;
+  }
+
+  inline long steal_work_index (int team_size = 0) {
+    long index = -1;
+    int steal_target = team_size>0?get_steal_target(team_size):get_steal_target();
+    while ( (steal_target != -1) && (index == -1)) {
+      index = m_pool_base[steal_target]->get_work_index_end();
+      if(index == -1)
+        steal_target = team_size>0?get_steal_target(team_size):get_steal_target();
+    }
+    return index;
+  }
+
+  // Get a work index. Claim from the owned range until it is exhausted, then steal from another thread
+  inline long get_work_index (int team_size = 0) {
+    long work_index = -1;
+    if(!m_stealing) work_index = get_work_index_begin();
+
+    if( work_index == -1) {
+      memory_fence();
+      m_stealing = true;
+      work_index = steal_work_index(team_size);
+    }
+
+    m_team_work_index = work_index;
+    memory_fence();
+    return work_index;
+  }
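+
+  // Usage sketch (illustrative): a dynamically scheduled driver typically
+  // drains this thread's own chunk range and only then starts stealing.
+  // 'process_chunk' is a placeholder for the per-chunk user work.
+  //
+  //   long chunk ;
+  //   while ( -1 != ( chunk = exec.get_work_index() ) ) {
+  //     process_chunk( chunk );
+  //   }
+  //
+  // get_work_index() claims chunk indices via get_work_index_begin() until
+  // the owned range is empty, then switches to steal_work_index().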
+
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+inline int Threads::in_parallel()
+{ return Impl::ThreadsExec::in_parallel(); }
+
+inline int Threads::is_initialized()
+{ return Impl::ThreadsExec::is_initialized(); }
+
+inline void Threads::initialize(
+  unsigned threads_count ,
+  unsigned use_numa_count ,
+  unsigned use_cores_per_numa ,
+  bool allow_asynchronous_threadpool )
+{
+  Impl::ThreadsExec::initialize( threads_count , use_numa_count , use_cores_per_numa , allow_asynchronous_threadpool );
+}
+
+inline void Threads::finalize()
+{
+  Impl::ThreadsExec::finalize();
+}
+
+inline void Threads::print_configuration( std::ostream & s , const bool detail )
+{
+  Impl::ThreadsExec::print_configuration( s , detail );
+}
+
+inline bool Threads::sleep()
+{ return Impl::ThreadsExec::sleep() ; }
+
+inline bool Threads::wake()
+{ return Impl::ThreadsExec::wake() ; }
+
+inline void Threads::fence()
+{ Impl::ThreadsExec::fence() ; }
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos { namespace Experimental {
+
+template<>
+class UniqueToken< Threads, UniqueTokenScope::Instance>
+{
+public:
+  using execution_space = Threads;
+  using size_type       = int;
+
+  /// \brief Create an object sized for the concurrency of the given instance
+  ///
+  /// This object should not be shared between instances
+  UniqueToken( execution_space const& = execution_space() ) noexcept {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  inline
+  int size() const noexcept { return Threads::thread_pool_size(); }
+
+  /// \brief acquire value such that 0 <= value < size()
+  inline
+  int acquire() const  noexcept { return Threads::thread_pool_rank(); }
+
+  /// \brief Release a value acquired by acquire()
+  inline
+  void release( int ) const noexcept {}
+};
+
+template<>
+class UniqueToken< Threads, UniqueTokenScope::Global>
+{
+public:
+  using execution_space = Threads;
+  using size_type       = int;
+
+  /// \brief Create an object sized for the concurrency of the given instance
+  ///
+  /// This object should not be shared between instances
+  UniqueToken( execution_space const& = execution_space() ) noexcept {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  inline
+  int size() const noexcept { return Threads::thread_pool_size(); }
+
+  /// \brief acquire value such that 0 <= value < size()
+  inline
+  int acquire() const  noexcept { return Threads::thread_pool_rank(); }
+
+  /// \brief Release a value acquired by acquire()
+  inline
+  void release( int ) const noexcept {}
+};
+
+}} // namespace Kokkos::Experimental
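+
+// Usage sketch (illustrative): UniqueToken hands out an id in [0,size())
+// that is unique among concurrently executing threads, e.g. to index a
+// per-thread scratch pool.  'n' and the 'scratch' view are assumptions.
+//
+//   Kokkos::Experimental::UniqueToken< Kokkos::Threads > token ;
+//   Kokkos::View< int * , Kokkos::HostSpace > scratch( "scratch" , token.size() );
+//   Kokkos::parallel_for( n , KOKKOS_LAMBDA( const int i ) {
+//     const int id = token.acquire();   // 0 <= id < token.size()
+//     scratch( id ) += 1 ;              // slot 'id' is not used concurrently
+//     token.release( id );              // by any other executing thread
+//   });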
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+#endif
+#endif /* #define KOKKOS_THREADSEXEC_HPP */
+
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bac6022cd487eaa02134732a24d21268b1c31205
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
@@ -0,0 +1,253 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_PTHREAD )
+
+#include <Kokkos_Core_fwd.hpp>
+/* Standard 'C' Linux libraries */
+
+#include <pthread.h>
+#include <sched.h>
+#include <errno.h>
+
+/* Standard C++ libraries */
+
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <stdexcept>
+
+#include <Kokkos_Threads.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+pthread_mutex_t host_internal_pthread_mutex = PTHREAD_MUTEX_INITIALIZER ;
+
+// Pthreads compatible driver.
+// Recovery from an exception would require constant intra-thread health
+// verification, which would negatively impact runtime.  As such, simply
+// abort the process.
+
+void * internal_pthread_driver( void * )
+{
+  try {
+    ThreadsExec::driver();
+  }
+  catch( const std::exception & x ) {
+    std::cerr << "Exception thrown from worker thread: " << x.what() << std::endl ;
+    std::cerr.flush();
+    std::abort();
+  }
+  catch( ... ) {
+    std::cerr << "Exception thrown from worker thread" << std::endl ;
+    std::cerr.flush();
+    std::abort();
+  }
+  return NULL ;
+}
+
+} // namespace
+
+//----------------------------------------------------------------------------
+// Spawn a thread
+
+bool ThreadsExec::spawn()
+{
+  bool result = false ;
+
+  pthread_attr_t attr ;
+
+  if ( 0 == pthread_attr_init( & attr ) ||
+       0 == pthread_attr_setscope(       & attr, PTHREAD_SCOPE_SYSTEM ) ||
+       0 == pthread_attr_setdetachstate( & attr, PTHREAD_CREATE_DETACHED ) ) {
+
+    pthread_t pt ;
+
+    result = 0 == pthread_create( & pt, & attr, internal_pthread_driver, 0 );
+  }
+
+  pthread_attr_destroy( & attr );
+
+  return result ;
+}
+
+//----------------------------------------------------------------------------
+
+bool ThreadsExec::is_process()
+{
+  static const pthread_t master_pid = pthread_self();
+
+  return pthread_equal( master_pid , pthread_self() );
+}
+
+void ThreadsExec::global_lock()
+{
+  pthread_mutex_lock( & host_internal_pthread_mutex );
+}
+
+void ThreadsExec::global_unlock()
+{
+  pthread_mutex_unlock( & host_internal_pthread_mutex );
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::wait_yield( volatile int & flag , const int value )
+{
+  while ( value == flag ) { sched_yield(); }
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/* end #if defined( KOKKOS_ENABLE_PTHREAD ) */
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ENABLE_WINTHREAD )
+
+#include <Kokkos_Core_fwd.hpp>
+
+/* Windows libraries */
+#include <winsock2.h>
+#include <windows.h>
+#include <process.h>
+
+/* Standard C++ libraries */
+
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <stdexcept>
+
+#include <Kokkos_Threads.hpp>
+
+//----------------------------------------------------------------------------
+// Driver for each created Windows thread
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+unsigned WINAPI internal_winthread_driver( void * arg )
+{
+  ThreadsExec::driver();
+
+  return 0 ;
+}
+
+class ThreadLockWindows {
+private:
+  CRITICAL_SECTION  m_handle ;
+
+  ~ThreadLockWindows()
+  { DeleteCriticalSection( & m_handle ); }
+
+  ThreadLockWindows()
+  { InitializeCriticalSection( & m_handle ); }
+
+  ThreadLockWindows( const ThreadLockWindows & );
+  ThreadLockWindows & operator = ( const ThreadLockWindows & );
+
+public:
+
+  static ThreadLockWindows & singleton();
+
+  void lock()
+  { EnterCriticalSection( & m_handle ); }
+
+  void unlock()
+  { LeaveCriticalSection( & m_handle ); }
+};
+
+ThreadLockWindows & ThreadLockWindows::singleton()
+{ static ThreadLockWindows self ; return self ; }
+
+} // namespace <>
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Spawn this thread
+
+bool ThreadsExec::spawn()
+{
+  unsigned Win32ThreadID = 0 ;
+
+  HANDLE handle =
+    _beginthreadex(0,0,internal_winthread_driver,0,0, & Win32ThreadID );
+
+  return 0 != handle ; // true if the thread was successfully created
+}
+
+bool ThreadsExec::is_process() { return true ; }
+
+void ThreadsExec::global_lock()
+{ ThreadLockWindows::singleton().lock(); }
+
+void ThreadsExec::global_unlock()
+{ ThreadLockWindows::singleton().unlock(); }
+
+void ThreadsExec::wait_yield( volatile int & flag , const int value )
+{
+  while ( value == flag ) { Sleep(0); }
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+#else
+void KOKKOS_CORE_SRC_THREADS_EXEC_BASE_PREVENT_LINK_ERROR() {}
+#endif /* end #elif defined( KOKKOS_ENABLE_WINTHREAD ) */
+
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a496d911d21668ace8555e8abab079168658bd48
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
@@ -0,0 +1,1029 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADSTEAM_HPP
+#define KOKKOS_THREADSTEAM_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_THREADS )
+
+#include <cstdio>
+
+#include <utility>
+#include <impl/Kokkos_Spinwait.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+#include <impl/Kokkos_HostThreadTeam.hpp>
+
+#include <Kokkos_Atomic.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< class > struct ThreadsExecAdapter ;
+
+//----------------------------------------------------------------------------
+
+class ThreadsExecTeamMember {
+private:
+
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  typedef Kokkos::Threads execution_space ;
+  typedef execution_space::scratch_memory_space space ;
+
+  ThreadsExec * const   m_exec ;
+  ThreadsExec * const * m_team_base ; ///< Base for team fan-in
+  space                 m_team_shared ;
+  int                   m_team_shared_size ;
+  int                   m_team_size ;
+  int                   m_team_rank ;
+  int                   m_team_rank_rev ;
+  int                   m_league_size ;
+  int                   m_league_end ;
+  int                   m_league_rank ;
+
+  int                   m_chunk_size;
+  int                   m_league_chunk_end;
+
+  int                   m_invalid_thread;
+  int                   m_team_alloc;
+
+  inline
+  void set_team_shared()
+    { new( & m_team_shared ) space( ((char *) (*m_team_base)->scratch_memory()) + TEAM_REDUCE_SIZE , m_team_shared_size ); }
+
+public:
+
+  // Fan-in and wait until the matching fan-out is called.
+  // The root thread which does not wait will return true.
+  // All other threads will return false during the fan-out.
+  KOKKOS_INLINE_FUNCTION bool team_fan_in() const
+    {
+      int n , j ;
+
+      // Wait for fan-in threads
+      for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
+        Impl::spinwait_while_equal<int>( m_team_base[j]->state() , ThreadsExec::Active );
+      }
+
+      // If not root then wait for release
+      if ( m_team_rank_rev ) {
+        m_exec->state() = ThreadsExec::Rendezvous ;
+        Impl::spinwait_while_equal<int>( m_exec->state() , ThreadsExec::Rendezvous );
+      }
+
+      return ! m_team_rank_rev ;
+    }
+
+  KOKKOS_INLINE_FUNCTION void team_fan_out() const
+    {
+      int n , j ;
+      for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
+        m_team_base[j]->state() = ThreadsExec::Active ;
+      }
+    }
+
+public:
+
+  KOKKOS_INLINE_FUNCTION static int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space & team_shmem() const
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space & team_scratch(int) const
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space & thread_scratch(int) const
+    { return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const
+    {
+      team_fan_in();
+      team_fan_out();
+    }
+
+  template<class ValueType>
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast(ValueType& value, const int& thread_id) const
+  {
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { }
+#else
+    // Make sure there is enough scratch space:
+    typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
+                         , ValueType , void >::type type ;
+
+    if ( m_team_base ) {
+      type * const local_value = ((type*) m_team_base[0]->scratch_memory());
+      if(team_rank() == thread_id) *local_value = value;
+      memory_fence();
+      team_barrier();
+      value = *local_value;
+    }
+#endif
+  }
+
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !Kokkos::is_reducer< Type >::value , Type>::type
+  team_reduce( const Type & value ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return Type(); }
+#else
+    {
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(Type) < TEAM_REDUCE_SIZE , Type , void >::type type ;
+
+      if ( 0 == m_exec ) return value ;
+
+      *((volatile type*) m_exec->scratch_memory() ) = value ;
+
+      memory_fence();
+
+      type & accum = *((type *) m_team_base[0]->scratch_memory() );
+
+      if ( team_fan_in() ) {
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          accum += *((type *) m_team_base[i]->scratch_memory() );
+        }
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      return accum ;
+    }
+#endif
+
+    template< typename ReducerType >
+    KOKKOS_INLINE_FUNCTION
+    typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
+  #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    team_reduce( const ReducerType & ) const
+      {}
+  #else
+    team_reduce( const ReducerType & reducer ) const
+    {
+      typedef typename ReducerType::value_type value_type;
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
+                           , value_type , void >::type type ;
+
+      if ( 0 == m_exec ) return ;
+
+      type * const local_value = ((type*) m_exec->scratch_memory());
+
+      // Set this thread's contribution
+      *local_value = reducer.reference() ;
+
+      // Fence to make sure the base team member has access:
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
+        type * const team_value = ((type*) m_team_base[0]->scratch_memory());
+
+        // Join to the team value:
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          reducer.join( *team_value , *((type*) m_team_base[i]->scratch_memory()) );
+        }
+
+        // Team base thread may "lap" member threads so copy out to their local value.
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          *((type*) m_team_base[i]->scratch_memory()) = *team_value ;
+        }
+
+        // Fence to make sure all team members have access
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      // Value was changed by the team base
+      reducer.reference() = *((type volatile const *) local_value);
+    }
+  #endif
+
+  template< class ValueType, class JoinOp >
+  KOKKOS_INLINE_FUNCTION ValueType
+    team_reduce( const ValueType & value
+               , const JoinOp & op_in ) const
+  #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return ValueType(); }
+  #else
+    {
+      typedef ValueType value_type;
+      const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
+  #endif
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
+                           , value_type , void >::type type ;
+
+      if ( 0 == m_exec ) return value ;
+
+      type * const local_value = ((type*) m_exec->scratch_memory());
+
+      // Set this thread's contribution
+      *local_value = value ;
+
+      // Fence to make sure the base team member has access:
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
+        type * const team_value = ((type*) m_team_base[0]->scratch_memory());
+
+        // Join to the team value:
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          op.join( *team_value , *((type*) m_team_base[i]->scratch_memory()) );
+        }
+
+        // Team base thread may "lap" member threads so copy out to their local value.
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          *((type*) m_team_base[i]->scratch_memory()) = *team_value ;
+        }
+
+        // Fence to make sure all team members have access
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      // Value was changed by the team base
+      return *((type volatile const *) local_value);
+    }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   */
+  template< typename ArgType >
+  KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return ArgType(); }
+#else
+    {
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
+
+      if ( 0 == m_exec ) return type(0);
+
+      volatile type * const work_value  = ((type*) m_exec->scratch_memory());
+
+      *work_value = value ;
+
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
+        // m_team_base[0]                 == highest ranking team member
+        // m_team_base[ m_team_size - 1 ] == lowest ranking team member
+        //
+        // 1) copy from lower to higher rank, initialize lowest rank to zero
+        // 2) prefix sum from lowest to highest rank, skipping lowest rank
+
+        type accum = 0 ;
+
+        if ( global_accum ) {
+          for ( int i = m_team_size ; i-- ; ) {
+            type & val = *((type*) m_team_base[i]->scratch_memory());
+            accum += val ;
+          }
+          accum = atomic_fetch_add( global_accum , accum );
+        }
+
+        for ( int i = m_team_size ; i-- ; ) {
+          type & val = *((type*) m_team_base[i]->scratch_memory());
+          const type offset = accum ;
+          accum += val ;
+          val = offset ;
+        }
+
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      return *work_value ;
+    }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template< typename ArgType >
+  KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value ) const
+    { return this-> template team_scan<ArgType>( value , 0 ); }
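+
+  // Worked example (illustrative): with team_size()==4 and per-thread values
+  // {3,1,4,2} ordered by team_rank(), team_scan(value) returns the exclusive
+  // prefix sums {0,3,4,8}; the highest rank recovers the reduction total as
+  // team_scan(value) + value == 8 + 2 == 10.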
+
+
+  //----------------------------------------
+  // Private for the driver
+
+  template< class ... Properties >
+  ThreadsExecTeamMember( Impl::ThreadsExec * exec
+                       , const TeamPolicyInternal< Kokkos::Threads , Properties ... > & team
+                       , const int shared_size )
+    : m_exec( exec )
+    , m_team_base(0)
+    , m_team_shared(0,0)
+    , m_team_shared_size( shared_size )
+    , m_team_size(team.team_size())
+    , m_team_rank(0)
+    , m_team_rank_rev(0)
+    , m_league_size(0)
+    , m_league_end(0)
+    , m_league_rank(0)
+    , m_chunk_size( team.chunk_size() )
+    , m_league_chunk_end(0)
+    , m_team_alloc( team.team_alloc())
+   {
+      if ( team.league_size() ) {
+        // Execution is using device-team interface:
+
+        const int pool_rank_rev = m_exec->pool_size() - ( m_exec->pool_rank() + 1 );
+        const int team_rank_rev = pool_rank_rev % team.team_alloc();
+        const size_t pool_league_size     = m_exec->pool_size() / team.team_alloc() ;
+        const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc() ;
+        if(pool_league_rank_rev >= pool_league_size) {
+          m_invalid_thread = 1;
+          return;
+        }
+        const size_t pool_league_rank     = pool_league_size - ( pool_league_rank_rev + 1 );
+
+        const int pool_num_teams       = m_exec->pool_size()/team.team_alloc();
+        const int chunk_size           = team.chunk_size()>0?team.chunk_size():team.team_iter();
+        const int chunks_per_team      = ( team.league_size() + chunk_size*pool_num_teams-1 ) / (chunk_size*pool_num_teams);
+              int league_iter_end      = team.league_size() - pool_league_rank_rev * chunks_per_team * chunk_size;
+              int league_iter_begin    = league_iter_end - chunks_per_team * chunk_size;
+        if (league_iter_begin < 0)     league_iter_begin = 0;
+        if (league_iter_end>team.league_size()) league_iter_end = team.league_size();
+
+        if ((team.team_alloc()>m_team_size)?
+            (team_rank_rev >= m_team_size):
+            (m_exec->pool_size() - pool_num_teams*m_team_size > m_exec->pool_rank())
+           )
+          m_invalid_thread = 1;
+        else
+          m_invalid_thread = 0;
+
+        // May be using fewer threads per team than a multiple of threads per core,
+        // some threads will idle.
+
+        if ( team_rank_rev < team.team_size() && !m_invalid_thread) {
+
+          m_team_base        = m_exec->pool_base() + team.team_alloc() * pool_league_rank_rev ;
+          m_team_size        = team.team_size() ;
+          m_team_rank        = team.team_size() - ( team_rank_rev + 1 );
+          m_team_rank_rev    = team_rank_rev ;
+          m_league_size      = team.league_size();
+
+          m_league_rank      = ( team.league_size() *  pool_league_rank    ) / pool_league_size ;
+          m_league_end       = ( team.league_size() * (pool_league_rank+1) ) / pool_league_size ;
+
+          set_team_shared();
+        }
+
+        if ( (m_team_rank_rev == 0) && (m_invalid_thread == 0) ) {
+          m_exec->set_work_range(m_league_rank,m_league_end,m_chunk_size);
+          m_exec->reset_steal_target(m_team_size);
+        }
+        if(std::is_same<typename TeamPolicyInternal<Kokkos::Threads, Properties ...>::schedule_type::type,Kokkos::Dynamic>::value) {
+          m_exec->barrier();
+        }
+      }
+      else
+      { m_invalid_thread = 1; }
+    }
+
+  ThreadsExecTeamMember()
+    : m_exec(0)
+    , m_team_base(0)
+    , m_team_shared(0,0)
+    , m_team_shared_size(0)
+    , m_team_size(1)
+    , m_team_rank(0)
+    , m_team_rank_rev(0)
+    , m_league_size(1)
+    , m_league_end(0)
+    , m_league_rank(0)
+    , m_chunk_size(0)
+    , m_league_chunk_end(0)
+    , m_invalid_thread(0)
+    , m_team_alloc(0)
+    {}
+
+  inline
+  ThreadsExec & threads_exec_team_base() const { return m_team_base ? **m_team_base : *m_exec ; }
+
+  bool valid_static() const
+    { return m_league_rank < m_league_end ; }
+
+  void next_static()
+    {
+      if ( m_league_rank < m_league_end ) {
+        // Make sure all stores are complete before entering the barrier
+        memory_fence();
+        team_barrier();
+        set_team_shared();
+      }
+      m_league_rank++;
+    }
+
+  bool valid_dynamic() {
+
+    if(m_invalid_thread)
+      return false;
+    if ((m_league_rank < m_league_chunk_end) && (m_league_rank < m_league_size)) {
+      return true;
+    }
+
+    if (  m_team_rank_rev == 0 ) {
+      m_team_base[0]->get_work_index(m_team_alloc);
+    }
+    team_barrier();
+
+    long work_index = m_team_base[0]->team_work_index();
+
+    m_league_rank = work_index * m_chunk_size;
+    m_league_chunk_end = (work_index +1 ) * m_chunk_size;
+
+    if(m_league_chunk_end > m_league_size) m_league_chunk_end = m_league_size;
+
+    if((m_league_rank>=0) && (m_league_rank < m_league_chunk_end))
+      return true;
+    return false;
+  }
+
+  void next_dynamic() {
+    if(m_invalid_thread)
+      return;
+
+    if ( m_league_rank < m_league_chunk_end ) {
+      // Make sure all stores are complete before entering the barrier
+      memory_fence();
+      team_barrier();
+      set_team_shared();
+    }
+    m_league_rank++;
+  }
+
+  void set_league_shmem( const int arg_league_rank
+                       , const int arg_league_size
+                       , const int arg_shmem_size
+                       )
+    {
+      m_league_rank = arg_league_rank ;
+      m_league_size = arg_league_size ;
+      m_team_shared_size = arg_shmem_size ;
+      set_team_shared();
+    }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::Threads , Properties ... >: public PolicyTraits<Properties ...>
+{
+private:
+
+  int m_league_size ;
+  int m_team_size ;
+  int m_team_alloc ;
+  int m_team_iter ;
+
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
+
+  int m_chunk_size;
+
+  inline
+  void init( const int league_size_request
+           , const int team_size_request )
+   {
+      const int pool_size  = traits::execution_space::thread_pool_size(0);
+      const int max_host_team_size =  Impl::HostThreadTeamData::max_team_members;
+      const int team_max   = pool_size<max_host_team_size?pool_size:max_host_team_size;
+      const int team_grain = traits::execution_space::thread_pool_size(2);
+
+      m_league_size = league_size_request ;
+
+      m_team_size = team_size_request < team_max ?
+                    team_size_request : team_max ;
+
+      // Round team size up to a multiple of 'team_grain'
+      const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
+      const int team_count      = pool_size / team_size_grain ;
+
+      // Constraint : pool_size = m_team_alloc * team_count
+      m_team_alloc = pool_size / team_count ;
+
+      // Maximum number of iterations each team will take:
+      m_team_iter  = ( m_league_size + team_count - 1 ) / team_count ;
+
+      set_auto_chunk_size();
+   }
+
+
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicyInternal      execution_policy ;
+
+  typedef PolicyTraits<Properties ... > traits;
+
+  TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
+    m_league_size = p.m_league_size;
+    m_team_size = p.m_team_size;
+    m_team_alloc = p.m_team_alloc;
+    m_team_iter = p.m_team_iter;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
+    m_chunk_size = p.m_chunk_size;
+    return *this;
+  }
+
+  //----------------------------------------
+
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & ) {
+      int pool_size = traits::execution_space::thread_pool_size(1);
+      int max_host_team_size =  Impl::HostThreadTeamData::max_team_members;
+      return pool_size<max_host_team_size?pool_size:max_host_team_size;
+    }
+
+
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & )
+    { return traits::execution_space::thread_pool_size(2); }
+
+
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType &, const int& )
+    { return traits::execution_space::thread_pool_size(2); }
+
+  //----------------------------------------
+
+  inline int team_size() const { return m_team_size ; }
+  inline int team_alloc() const { return m_team_alloc ; }
+  inline int league_size() const { return m_league_size ; }
+  inline size_t scratch_size(const int& level, int team_size_ = -1 ) const {
+    if(team_size_ < 0)
+      team_size_ = m_team_size;
+    return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
+  }
+
+  inline int team_iter() const { return m_team_iter ; }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal( typename traits::execution_space &
+            , int league_size_request
+            , int team_size_request
+            , int vector_length_request = 1 )
+    : m_league_size(0)
+    , m_team_size(0)
+    , m_team_alloc(0)
+    , m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
+    , m_chunk_size(0)
+    { init(league_size_request,team_size_request); (void) vector_length_request; }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal( typename traits::execution_space &
+            , int league_size_request
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int /* vector_length_request */ = 1 )
+    : m_league_size(0)
+    , m_team_size(0)
+    , m_team_alloc(0)
+    , m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
+    , m_chunk_size(0)
+    { init(league_size_request,traits::execution_space::thread_pool_size(2)); }
+
+  TeamPolicyInternal( int league_size_request
+            , int team_size_request
+            , int /* vector_length_request */ = 1 )
+    : m_league_size(0)
+    , m_team_size(0)
+    , m_team_alloc(0)
+    , m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
+    , m_chunk_size(0)
+    { init(league_size_request,team_size_request); }
+
+  TeamPolicyInternal( int league_size_request
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int /* vector_length_request */ = 1 )
+    : m_league_size(0)
+    , m_team_size(0)
+    , m_team_alloc(0)
+    , m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
+    , m_chunk_size(0)
+    { init(league_size_request,traits::execution_space::thread_pool_size(2)); }
+
+  inline int chunk_size() const { return m_chunk_size ; }
+
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
+    TeamPolicyInternal p = *this;
+    p.m_chunk_size = chunk_size_;
+    return p;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
+    TeamPolicyInternal p = *this;
+    p.m_team_scratch_size[level] = per_team.value;
+    return p;
+  };
+
+  /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal p = *this;
+    p.m_thread_scratch_size[level] = per_thread.value;
+    return p;
+  };
+
+  /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal p = *this;
+    p.m_team_scratch_size[level] = per_team.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
+    return p;
+  };
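+
+  /** Usage sketch (illustrative): the setters return a policy by value, so
+   *  they are typically chained when launching.  'league_size' and 'functor'
+   *  are assumptions for the sketch.
+   *
+   *    Kokkos::TeamPolicy< Kokkos::Threads > policy( league_size , Kokkos::AUTO );
+   *    Kokkos::parallel_for(
+   *      policy.set_chunk_size( 32 )
+   *            .set_scratch_size( 0 , Kokkos::PerTeam( 1024 ) , Kokkos::PerThread( 128 ) ) ,
+   *      functor );
+   */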
+
+protected:
+  /** \brief set chunk_size to a discrete value*/
+  inline TeamPolicyInternal internal_set_chunk_size(typename traits::index_type chunk_size_) {
+    m_chunk_size = chunk_size_;
+    return *this;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerTeamValue& per_team) {
+    m_team_scratch_size[level] = per_team.value;
+    return *this;
+  };
+
+  /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerThreadValue& per_thread) {
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  };
+
+  /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
+  inline TeamPolicyInternal internal_set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) {
+    m_team_scratch_size[level] = per_team.value;
+    m_thread_scratch_size[level] = per_thread.value;
+    return *this;
+  };
+
+private:
+  /** \brief finalize chunk_size if it was set to AUTO*/
+  inline void set_auto_chunk_size() {
+
+    int concurrency = traits::execution_space::thread_pool_size(0)/m_team_alloc;
+    if( concurrency==0 ) concurrency=1;
+
+    if(m_chunk_size > 0) {
+      if(!Impl::is_integral_power_of_two( m_chunk_size ))
+        Kokkos::abort("TeamPolicy blocking granularity must be power of two" );
+    }
+
+    int new_chunk_size = 1;
+    while(new_chunk_size*100*concurrency < m_league_size)
+      new_chunk_size *= 2;
+    if(new_chunk_size < 128) {
+      new_chunk_size = 1;
+      while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) )
+        new_chunk_size*=2;
+    }
+    m_chunk_size = new_chunk_size;
+  }
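+
+  // Worked example (illustrative): with thread_pool_size(0)==16 and
+  // m_team_alloc==4 the concurrency is 4.  For m_league_size==2000 the first
+  // loop stops at new_chunk_size==8 (8*100*4 >= 2000); since 8 < 128 the
+  // second loop re-runs with the factor 40 and yields m_chunk_size==16.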
+
+public:
+
+  typedef Impl::ThreadsExecTeamMember member_type ;
+
+  friend class Impl::ThreadsExecTeamMember ;
+};
+
+} /*namespace Impl */
+} /* namespace Kokkos */
+
+
+namespace Kokkos {
+
+template< typename iType >
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember >
+TeamThreadRange( const Impl::ThreadsExecTeamMember& thread, const iType& count )
+{
+  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember >( thread, count );
+}
+
+template< typename iType1, typename iType2 >
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
+                                       Impl::ThreadsExecTeamMember>
+TeamThreadRange( const Impl::ThreadsExecTeamMember& thread, const iType1 & begin, const iType2 & end )
+{
+  typedef typename std::common_type< iType1, iType2 >::type iType;
+  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember >( thread, iType(begin), iType(end) );
+}
+
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >
+  ThreadVectorRange(const Impl::ThreadsExecTeamMember& thread, const iType& count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >(thread,count);
+}
+
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember> PerTeam(const Impl::ThreadsExecTeamMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember> PerThread(const Impl::ThreadsExecTeamMember& thread) {
+  return Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>(thread);
+}
+} // namespace Kokkos
+
+namespace Kokkos {
+
+  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+   *
+   * The range i=0..N-1 is mapped to all threads of the calling thread team.
+   * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda) {
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
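+
+// Usage sketch (illustrative): inside a TeamPolicy functor the nested
+// parallel_for distributes the inner range j=0..m-1 over the team's threads.
+// 'nteams', 'team_size' and 'm' are assumptions for the sketch.
+//
+//   typedef Kokkos::TeamPolicy< Kokkos::Threads >::member_type member_type ;
+//   Kokkos::parallel_for( Kokkos::TeamPolicy< Kokkos::Threads >( nteams , team_size ) ,
+//     KOKKOS_LAMBDA( const member_type & member ) {
+//       Kokkos::parallel_for( Kokkos::TeamThreadRange( member , m ) ,
+//         [&]( const int j ) { /* work for (member.league_rank(), j) */ } );
+//     });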
+
+/** \brief  Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< !Kokkos::is_reducer< ValueType >::value >::type
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, ValueType& result) {
+
+  result = ValueType();
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+
+  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
+}
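+
+// Usage sketch (illustrative): a per-team row sum in which each thread
+// accumulates a partial sum over a subset of columns and team_reduce
+// combines them.  'A' (a rank-2 view) and 'ncols' are assumptions.
+//
+//   double row_sum = 0 ;
+//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , ncols ) ,
+//     [&]( const int j , double & partial )
+//       { partial += A( member.league_rank() , j ); } , row_sum );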
+
+template< typename iType, class Lambda, typename ReducerType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
+parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, const ReducerType& reducer) {
+
+  reducer.init(reducer.reference());
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,reducer.reference());
+  }
+
+  loop_boundaries.thread.team_reduce(reducer);
+}
+
+/** \brief  Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+
+  init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
+}
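+
+// Usage sketch (illustrative): the JoinType overload accepts an arbitrary
+// binary join, here a maximum of absolute values so that the zero-initialized
+// temporaries are a valid neutral element.  'x' (a rank-1 view), 'n' and
+// std::fabs (from <cmath>) are assumptions.
+//
+//   double team_amax = 0 ;
+//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n ) ,
+//     [&]( const int i , double & val )
+//       { const double a = std::fabs( x( i ) ); if ( a > val ) val = a ; } ,
+//     [&]( double & dst , const double & src ) { if ( src > dst ) dst = src ; } ,
+//     team_amax );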
+
+} //namespace Kokkos
+
+
+namespace Kokkos {
+/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread.
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
+    loop_boundaries, const Lambda& lambda) {
+  #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+  #pragma ivdep
+  #endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< !Kokkos::is_reducer< ValueType >::value >::type
+parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
+      loop_boundaries, const Lambda & lambda, ValueType& result) {
+  result = ValueType();
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+}
+
+template< typename iType, class Lambda, typename ReducerType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
+parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
+      loop_boundaries, const Lambda & lambda, const ReducerType& reducer) {
+  reducer.init(reducer.reference());
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,reducer.reference());
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& result ) {
+
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+}
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
+ *          for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
+ * Depending on the target execution space the operator might be called twice: once with final=false
+ * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
+ * "i" needs to be added to val no matter whether final==true or not. In a serial execution
+ * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
+ * to the final sum value over all vector lanes.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
+      loop_boundaries, const FunctorType & lambda) {
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
+  typedef typename ValueTraits::value_type value_type ;
+
+  value_type scan_val = value_type();
+
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,scan_val,true);
+  }
+}
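+
+// Usage sketch (illustrative): an exclusive prefix sum over a vector range,
+// writing offsets only on the 'final' pass.  'counts' and 'offsets'
+// (rank-1 views) and 'n' are assumptions.
+//
+//   Kokkos::parallel_scan( Kokkos::ThreadVectorRange( member , n ) ,
+//     [&]( const int i , int & partial , const bool final ) {
+//       if ( final ) { offsets( i ) = partial ; }
+//       partial += counts( i );
+//     });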
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda) {
+  lambda();
+}
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda) {
+  if(single_struct.team_member.team_rank()==0) lambda();
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  lambda(val);
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  if(single_struct.team_member.team_rank()==0) {
+    lambda(val);
+  }
+  single_struct.team_member.team_broadcast(val,0);
+}
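+
+// Usage sketch (illustrative): PerTeam single runs the lambda on one thread
+// of the team and, for the ValueType overload, broadcasts the result so that
+// every team member observes the same value.  'queue_pop' is a placeholder.
+//
+//   int next = 0 ;
+//   Kokkos::single( Kokkos::PerTeam( member ) ,
+//     [&]( int & v ) { v = queue_pop(); } , next );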
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+#endif
+#endif /* #define KOKKOS_THREADSTEAM_HPP */
+
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..731c692968dcd29b329cef26fc83e1e834c4fb54
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
@@ -0,0 +1,931 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADS_PARALLEL_HPP
+#define KOKKOS_THREADS_PARALLEL_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_THREADS )
+
+#include <vector>
+#include <iostream>
+
+#include <Kokkos_Parallel.hpp>
+
+#include <impl/Kokkos_StaticAssert.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/* ParallelFor Kokkos::Threads with RangePolicy */
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Traits ... >
+                 , Kokkos::Threads
+                 >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+  typedef typename Policy::work_tag    WorkTag ;
+  typedef typename Policy::WorkRange   WorkRange ;
+  typedef typename Policy::member_type Member ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend )
+    {
+      #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
+          defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
+      #pragma ivdep
+      #endif
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( i );
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend )
+    {
+      const TagType t{} ;
+      #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
+          defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
+      #pragma ivdep
+      #endif
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( t , i );
+      }
+    }
+
+  static void exec( ThreadsExec & exec , const void * arg )
+  {
+    exec_schedule<typename Policy::schedule_type::type>(exec,arg);
+  }
+
+  template<class Schedule>
+  static
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
+  exec_schedule( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelFor & self = * ((const ParallelFor *) arg );
+
+    WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
+
+    ParallelFor::template exec_range< WorkTag >
+      ( self.m_functor , range.begin() , range.end() );
+
+    exec.fan_in();
+  }
+
+  template<class Schedule>
+  static
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
+  exec_schedule( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelFor & self = * ((const ParallelFor *) arg );
+
+    WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
+
+    exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
+    exec.reset_steal_target();
+    exec.barrier();
+
+    long work_index = exec.get_work_index();
+
+    while(work_index != -1) {
+      const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
+      const Member end = ( begin + self.m_policy.chunk_size() < self.m_policy.end() )
+                         ? ( begin + self.m_policy.chunk_size() ) : self.m_policy.end() ;
+
+      ParallelFor::template exec_range< WorkTag >
+        ( self.m_functor , begin , end );
+      work_index = exec.get_work_index();
+    }
+
+    exec.fan_in();
+  }
+
+public:
+
+  inline
+  void execute() const
+    {
+      ThreadsExec::start( & ParallelFor::exec , this );
+      ThreadsExec::fence();
+    }
+
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    {}
+};
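+
+/* A minimal usage sketch for the specialization above (assuming an initialized
+ * Threads backend and Views x and y of length n):
+ *
+ *   Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Threads >( 0 , n ),
+ *     KOKKOS_LAMBDA( const int i ) { y(i) = 2.0 * x(i); } );
+ *
+ * Requesting Kokkos::Schedule< Kokkos::Dynamic > in the policy selects the
+ * work-stealing exec_schedule path above instead of the static partitioning.
+ */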
+
+
+// MDRangePolicy impl
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::MDRangePolicy< Traits ... >
+                 , Kokkos::Threads
+                 >
+{
+private:
+  typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
+  typedef typename MDRangePolicy::impl_range_policy         Policy ;
+
+  typedef typename MDRangePolicy::work_tag                  WorkTag ;
+
+  typedef typename Policy::WorkRange   WorkRange ;
+  typedef typename Policy::member_type Member ;
+
+  typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
+
+  const FunctorType   m_functor ;
+  const MDRangePolicy m_mdr_policy ;
+  const Policy        m_policy ;  // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
+
+  inline static
+  void
+  exec_range( const MDRangePolicy & mdr_policy 
+            , const FunctorType & functor
+            , const Member ibeg , const Member iend )
+    {
+      #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
+          defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
+      #pragma ivdep
+      #endif
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        iterate_type( mdr_policy, functor )( i );
+      }
+    }
+
+  static void exec( ThreadsExec & exec , const void * arg )
+  {
+    exec_schedule<typename Policy::schedule_type::type>(exec,arg);
+  }
+
+  template<class Schedule>
+  static
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
+  exec_schedule( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelFor & self = * ((const ParallelFor *) arg );
+
+    WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
+
+    ParallelFor::exec_range
+      ( self.m_mdr_policy, self.m_functor , range.begin() , range.end() );
+
+    exec.fan_in();
+  }
+
+  template<class Schedule>
+  static
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
+  exec_schedule( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelFor & self = * ((const ParallelFor *) arg );
+
+    WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
+
+    exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
+    exec.reset_steal_target();
+    exec.barrier();
+
+    long work_index = exec.get_work_index();
+
+    while(work_index != -1) {
+      const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
+      const Member end = ( begin + self.m_policy.chunk_size() < self.m_policy.end() )
+                         ? ( begin + self.m_policy.chunk_size() ) : self.m_policy.end() ;
+
+      ParallelFor::exec_range
+        ( self.m_mdr_policy, self.m_functor , begin , end );
+      work_index = exec.get_work_index();
+    }
+
+    exec.fan_in();
+  }
+
+public:
+
+  inline
+  void execute() const
+    {
+      ThreadsExec::start( & ParallelFor::exec , this );
+      ThreadsExec::fence();
+    }
+
+  ParallelFor( const FunctorType & arg_functor
+             , const MDRangePolicy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_mdr_policy( arg_policy )
+    , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
+    {}
+};
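+
+/* A minimal usage sketch for the MDRangePolicy specialization above (assuming
+ * an initialized Threads backend and a rank-2 View a of extents n0 x n1):
+ *
+ *   typedef Kokkos::MDRangePolicy< Kokkos::Threads , Kokkos::Rank<2> > mdrange ;
+ *   Kokkos::parallel_for( mdrange( {{ 0 , 0 }} , {{ n0 , n1 }} ),
+ *     KOKKOS_LAMBDA( const int i , const int j ) { a(i,j) = 0.0; } );
+ *
+ * Each tile of the iteration space is dispatched as one work item of the inner
+ * RangePolicy( 0 , num_tiles ) and expanded by HostIterateTile.
+ */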
+
+//----------------------------------------------------------------------------
+/* ParallelFor Kokkos::Threads with TeamPolicy */
+
+template< class FunctorType , class ... Properties >
+class ParallelFor< FunctorType
+                 , Kokkos::TeamPolicy< Properties ... >
+                 , Kokkos::Threads
+                 >
+{
+private:
+
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Threads, Properties ... >  Policy ;
+  typedef typename Policy::work_tag                    WorkTag ;
+  typedef typename Policy::member_type                 Member ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+  const int          m_shared ;
+
+  template< class TagType , class Schedule>
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value
+  && std::is_same<Schedule,Kokkos::Static>::value >::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      for ( ; member.valid_static() ; member.next_static() ) {
+        functor( member );
+      }
+    }
+
+  template< class TagType , class Schedule>
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value
+  && std::is_same<Schedule,Kokkos::Static>::value >::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      const TagType t{} ;
+      for ( ; member.valid_static() ; member.next_static() ) {
+        functor( t , member );
+      }
+    }
+
+  template< class TagType , class Schedule>
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value
+  && std::is_same<Schedule,Kokkos::Dynamic>::value >::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
+        functor( member );
+      }
+    }
+
+  template< class TagType , class Schedule>
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value
+                          && std::is_same<Schedule,Kokkos::Dynamic>::value >::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      const TagType t{} ;
+      for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
+        functor( t , member );
+      }
+    }
+
+  static void exec( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelFor & self = * ((const ParallelFor *) arg );
+
+    ParallelFor::exec_team< WorkTag , typename Policy::schedule_type::type >
+      ( self.m_functor , Member( & exec , self.m_policy , self.m_shared ) );
+
+    exec.barrier();
+    exec.fan_in();
+  }
+
+public:
+
+  inline
+  void execute() const
+    {
+      ThreadsExec::resize_scratch( 0 , Policy::member_type::team_reduce_size() + m_shared );
+
+      ThreadsExec::start( & ParallelFor::exec , this );
+
+      ThreadsExec::fence();
+    }
+
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    { }
+};
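+
+/* A minimal usage sketch for the TeamPolicy specialization above (assuming an
+ * initialized Threads backend and a rank-2 View a with league_size rows of
+ * length m):
+ *
+ *   typedef Kokkos::TeamPolicy< Kokkos::Threads > policy ;
+ *   Kokkos::parallel_for( policy( league_size , Kokkos::AUTO ),
+ *     KOKKOS_LAMBDA( const policy::member_type & team ) {
+ *       const int row = team.league_rank();
+ *       Kokkos::parallel_for( Kokkos::TeamThreadRange( team , m ),
+ *         [&]( const int j ) { a( row , j ) = 1.0; } );
+ *     });
+ */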
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/* ParallelReduce with Kokkos::Threads and RangePolicy */
+
+template< class FunctorType , class ReducerType, class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::RangePolicy< Traits ... >
+                    , ReducerType
+                    , Kokkos::Threads
+                    >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+
+  typedef typename Policy::work_tag    WorkTag ;
+  typedef typename Policy::WorkRange   WorkRange ;
+  typedef typename Policy::member_type Member ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, WorkTag, void>::type WorkTagFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTagFwd > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTagFwd > ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+  const ReducerType   m_reducer ;
+  const pointer_type m_result_ptr ;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member & ibeg , const Member & iend
+            , reference_type update )
+    {
+      #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
+          defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
+      #pragma ivdep
+      #endif
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( i , update );
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member & ibeg , const Member & iend
+            , reference_type update )
+    {
+      const TagType t{} ;
+      #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
+          defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
+      #pragma ivdep
+      #endif
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( t , i , update );
+      }
+    }
+
+  static void
+  exec( ThreadsExec & exec , const void * arg ) {
+    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
+  }
+
+  template<class Schedule>
+  static
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
+  exec_schedule( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+    const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
+
+    ParallelReduce::template exec_range< WorkTag >
+      ( self.m_functor , range.begin() , range.end()
+      , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
+
+    exec.template fan_in_reduce< ReducerTypeFwd , WorkTagFwd >( ReducerConditional::select(self.m_functor , self.m_reducer) );
+  }
+
+  template<class Schedule>
+  static
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
+  exec_schedule( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+    const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
+
+    exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
+    exec.reset_steal_target();
+    exec.barrier();
+
+    long work_index = exec.get_work_index();
+    reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() );
+    while(work_index != -1) {
+      const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
+      const Member end = ( begin + self.m_policy.chunk_size() < self.m_policy.end() )
+                         ? ( begin + self.m_policy.chunk_size() ) : self.m_policy.end() ;
+      ParallelReduce::template exec_range< WorkTag >
+        ( self.m_functor , begin , end
+        , update );
+      work_index = exec.get_work_index();
+    }
+
+    exec.template fan_in_reduce< ReducerTypeFwd , WorkTagFwd >( ReducerConditional::select(self.m_functor , self.m_reducer) );
+  }
+
+public:
+
+  inline
+  void execute() const
+    {
+      ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
+
+      ThreadsExec::start( & ParallelReduce::exec , this );
+
+      ThreadsExec::fence();
+
+      if ( m_result_ptr ) {
+
+        const pointer_type data =
+          (pointer_type) ThreadsExec::root_reduce_scratch();
+
+        const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+        for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
+      }
+    }
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const Policy       & arg_policy ,
+                  const HostViewType & arg_result_view ,
+                  typename std::enable_if<
+                               Kokkos::is_view< HostViewType >::value &&
+                              !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result_view.ptr_on_device() )
+    {
+      static_assert( Kokkos::is_view< HostViewType >::value
+        , "Kokkos::Threads reduce result must be a View" );
+
+      static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
+        , "Kokkos::Threads reduce result must be a View in HostSpace" );
+    }
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::Threads must be a Kokkos::View in HostSpace" );*/
+    }
+
+};
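+
+/* A minimal usage sketch for the specialization above (assuming an initialized
+ * Threads backend and a View x of length n):
+ *
+ *   double sum = 0.0;
+ *   Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Threads >( 0 , n ),
+ *     KOKKOS_LAMBDA( const int i , double & partial ) { partial += x(i); }
+ *     , sum );
+ *
+ * Passing a reducer object (e.g. Kokkos::Sum<double>(sum)) instead of the plain
+ * scalar exercises the ReducerType branch of this class.
+ */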
+
+
+// MDRangePolicy impl
+template< class FunctorType , class ReducerType, class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::MDRangePolicy< Traits ... >
+                    , ReducerType
+                    , Kokkos::Threads
+                    >
+{
+private:
+
+  typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
+  typedef typename MDRangePolicy::impl_range_policy Policy ;
+
+  typedef typename MDRangePolicy::work_tag    WorkTag ;
+  typedef typename Policy::WorkRange   WorkRange ;
+  typedef typename Policy::member_type Member ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, WorkTag, void>::type WorkTagFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTagFwd > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTagFwd > ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::value_type      value_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
+                                                              , FunctorType
+                                                              , WorkTag
+                                                              , reference_type
+                                                              >;
+
+  const FunctorType   m_functor ;
+  const MDRangePolicy m_mdr_policy ;
+  const Policy        m_policy ;  // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+
+  inline static
+  void
+  exec_range( const MDRangePolicy & mdr_policy
+            , const FunctorType & functor
+            , const Member & ibeg , const Member & iend
+            , reference_type update )
+    {
+      #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
+          defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
+      #pragma ivdep
+      #endif
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        iterate_type( mdr_policy, functor, update )( i );
+      }
+    }
+
+  static void
+  exec( ThreadsExec & exec , const void * arg ) {
+    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
+  }
+
+  template<class Schedule>
+  static
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
+  exec_schedule( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+    const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
+
+    ParallelReduce::exec_range
+      ( self.m_mdr_policy, self.m_functor , range.begin() , range.end()
+      , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
+
+    exec.template fan_in_reduce< ReducerTypeFwd , WorkTagFwd >( ReducerConditional::select(self.m_functor , self.m_reducer) );
+  }
+
+  template<class Schedule>
+  static
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
+  exec_schedule( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+    const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
+
+    exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
+    exec.reset_steal_target();
+    exec.barrier();
+
+    long work_index = exec.get_work_index();
+    reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() );
+    while(work_index != -1) {
+      const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
+      const Member end = ( begin + self.m_policy.chunk_size() < self.m_policy.end() )
+                         ? ( begin + self.m_policy.chunk_size() ) : self.m_policy.end() ;
+      ParallelReduce::exec_range
+        ( self.m_mdr_policy, self.m_functor , begin , end
+        , update );
+      work_index = exec.get_work_index();
+    }
+
+    exec.template fan_in_reduce< ReducerTypeFwd , WorkTagFwd >( ReducerConditional::select(self.m_functor , self.m_reducer) );
+  }
+
+public:
+
+  inline
+  void execute() const
+    {
+      ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
+
+      ThreadsExec::start( & ParallelReduce::exec , this );
+
+      ThreadsExec::fence();
+
+      if ( m_result_ptr ) {
+
+        const pointer_type data =
+          (pointer_type) ThreadsExec::root_reduce_scratch();
+
+        const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+        for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
+      }
+    }
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const MDRangePolicy       & arg_policy ,
+                  const HostViewType & arg_result_view ,
+                  typename std::enable_if<
+                               Kokkos::is_view< HostViewType >::value &&
+                              !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_mdr_policy( arg_policy )
+    , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result_view.ptr_on_device() )
+    {
+      static_assert( Kokkos::is_view< HostViewType >::value
+        , "Kokkos::Threads reduce result must be a View" );
+
+      static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
+        , "Kokkos::Threads reduce result must be a View in HostSpace" );
+    }
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , MDRangePolicy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_mdr_policy(  arg_policy )
+    , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::Threads must be a Kokkos::View in HostSpace" );*/
+    }
+
+};
+
+
+//----------------------------------------------------------------------------
+/* ParallelReduce with Kokkos::Threads and TeamPolicy */
+
+template< class FunctorType , class ReducerType, class ... Properties >
+class ParallelReduce< FunctorType
+                    , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
+                    , Kokkos::Threads
+                    >
+{
+private:
+
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Threads, Properties ... >              Policy ;
+  typedef typename Policy::work_tag                                WorkTag ;
+  typedef typename Policy::member_type                             Member ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef typename Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, WorkTag, void>::type WorkTagFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTagFwd > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTagFwd > ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+  const ReducerType  m_reducer ;
+  const pointer_type m_result_ptr ;
+  const int          m_shared ;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_team( const FunctorType & functor , Member member , reference_type update )
+    {
+      for ( ; member.valid_static() ; member.next_static() ) {
+        functor( member , update );
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_team( const FunctorType & functor , Member member , reference_type update )
+    {
+      const TagType t{} ;
+      for ( ; member.valid_static() ; member.next_static() ) {
+        functor( t , member , update );
+      }
+    }
+
+  static void exec( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+
+    ParallelReduce::template exec_team< WorkTag >
+      ( self.m_functor , Member( & exec , self.m_policy , self.m_shared )
+      , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
+
+    exec.template fan_in_reduce< ReducerTypeFwd , WorkTagFwd >( ReducerConditional::select(self.m_functor , self.m_reducer) );
+  }
+
+public:
+
+  inline
+  void execute() const
+    {
+      ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , Policy::member_type::team_reduce_size() + m_shared );
+
+      ThreadsExec::start( & ParallelReduce::exec , this );
+
+      ThreadsExec::fence();
+
+      if ( m_result_ptr ) {
+
+        const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
+
+        const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+        for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
+      }
+    }
+
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const Policy       & arg_policy ,
+                  const ViewType     & arg_result ,
+                  typename std::enable_if<
+                    Kokkos::is_view< ViewType >::value &&
+                    !Kokkos::is_reducer_type<ReducerType>::value
+                    ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result.ptr_on_device() )
+    , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    {}
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+    , Policy       arg_policy
+    , const ReducerType& reducer )
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr(  reducer.view().data() )
+  , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+  {
+  /*static_assert( std::is_same< typename ViewType::memory_space
+                          , Kokkos::HostSpace >::value
+  , "Reduction result on Kokkos::Threads must be a Kokkos::View in HostSpace" );*/
+  }
+};
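+
+/* A minimal usage sketch for the TeamPolicy reduction above (assuming an
+ * initialized Threads backend and a rank-2 View a with league_size rows of
+ * length m):
+ *
+ *   typedef Kokkos::TeamPolicy< Kokkos::Threads > policy ;
+ *   double total = 0.0;
+ *   Kokkos::parallel_reduce( policy( league_size , Kokkos::AUTO ),
+ *     KOKKOS_LAMBDA( const policy::member_type & team , double & partial ) {
+ *       double row_sum = 0.0;
+ *       Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team , m ),
+ *         [&]( const int j , double & s ) { s += a( team.league_rank() , j ); }
+ *         , row_sum );
+ *       Kokkos::single( Kokkos::PerTeam( team ),
+ *         [&]() { partial += row_sum ; } );
+ *     }
+ *     , total );
+ */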
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/* ParallelScan with Kokkos::Threads and RangePolicy */
+
+template< class FunctorType , class ... Traits >
+class ParallelScan< FunctorType
+                  , Kokkos::RangePolicy< Traits ... >
+                  , Kokkos::Threads
+                  >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+  typedef typename Policy::WorkRange                               WorkRange ;
+  typedef typename Policy::work_tag                                WorkTag ;
+  typedef typename Policy::member_type                             Member ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member & ibeg , const Member & iend
+            , reference_type update , const bool final )
+    {
+      #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
+          defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
+      #pragma ivdep
+      #endif
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( i , update , final );
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member & ibeg , const Member & iend
+            , reference_type update , const bool final )
+    {
+      const TagType t{} ;
+      #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
+          defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
+      #pragma ivdep
+      #endif
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( t , i , update , final );
+      }
+    }
+
+  static void exec( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelScan & self = * ((const ParallelScan *) arg );
+
+    const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
+
+    reference_type update =
+      ValueInit::init( self.m_functor , exec.reduce_memory() );
+
+    ParallelScan::template exec_range< WorkTag >
+      ( self.m_functor , range.begin(), range.end(), update, false );
+
+    //  exec.template scan_large<FunctorType,WorkTag>( self.m_functor );
+    exec.template scan_small<FunctorType,WorkTag>( self.m_functor );
+
+    ParallelScan::template exec_range< WorkTag >
+      ( self.m_functor , range.begin(), range.end(), update, true );
+
+    exec.fan_in();
+  }
+
+public:
+
+  inline
+  void execute() const
+    {
+      ThreadsExec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
+      ThreadsExec::start( & ParallelScan::exec , this );
+      ThreadsExec::fence();
+    }
+
+  ParallelScan( const FunctorType & arg_functor
+              , const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    { }
+};
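+
+/* A minimal usage sketch for the scan specialization above (assuming an
+ * initialized Threads backend and Views x and y of length n):
+ *
+ *   Kokkos::parallel_scan( Kokkos::RangePolicy< Kokkos::Threads >( 0 , n ),
+ *     KOKKOS_LAMBDA( const int i , double & partial , const bool final ) {
+ *       const double in = x(i);
+ *       if ( final ) { y(i) = partial ; }   // exclusive prefix sum
+ *       partial += in ;
+ *     });
+ *
+ * The two exec_range passes above correspond to the (final == false) counting
+ * sweep and the (final == true) write-back sweep of this interface.
+ */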
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
+#endif /* #define KOKKOS_THREADS_PARALLEL_HPP */
+
diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..934d2db2ca83ccbd2dc1062ab71c48c14f1f8542
--- /dev/null
+++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp
@@ -0,0 +1,117 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADS_WORKGRAPHPOLICY_HPP
+#define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType ,
+                   Kokkos::WorkGraphPolicy< Traits ... > ,
+                   Kokkos::Threads
+                 >
+{
+private:
+
+  typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
+
+  typedef ParallelFor<FunctorType,
+                      Kokkos::WorkGraphPolicy<Traits ...>,
+                      Kokkos::Threads> Self ;
+
+  Policy       m_policy ;
+  FunctorType  m_functor ;
+
+  template< class TagType >
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_one( const std::int32_t w ) const noexcept
+    { m_functor( w ); }
+
+  template< class TagType >
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_one( const std::int32_t w ) const noexcept
+    { const TagType t{}; m_functor( t , w ); }
+
+  inline void exec_one_thread() const noexcept 
+    {
+      // Spin until COMPLETED_TOKEN.
+      // END_TOKEN indicates no work is currently available.
+      
+      for ( std::int32_t w = Policy::END_TOKEN ;
+            Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
+        if ( Policy::END_TOKEN != w ) {
+          exec_one< typename Policy::work_tag >( w );
+          m_policy.completed_work(w);
+        }
+      }
+    }
+
+  static inline void thread_main( ThreadsExec&, const void* arg ) noexcept
+    {
+      const Self& self = *(static_cast<const Self*>(arg));
+      self.exec_one_thread();
+    }
+
+public:
+
+  inline
+  void execute()
+  {
+    ThreadsExec::start( & Self::thread_main, this );
+    ThreadsExec::fence();
+  }
+
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy      & arg_policy )
+    : m_policy( arg_policy )
+    , m_functor( arg_functor )
+    {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP */
diff --git a/packages/kokkos/core/src/impl/CMakeLists.txt b/packages/kokkos/core/src/impl/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c543194de3993015f6940506c0ff51da157f2084
--- /dev/null
+++ b/packages/kokkos/core/src/impl/CMakeLists.txt
@@ -0,0 +1,18 @@
+
+SET(HEADERS "")
+SET(SOURCES "")
+
+FILE(GLOB HEADERS *.hpp)
+FILE(GLOB SOURCES *.cpp)
+
+TRIBITS_ADD_LIBRARY(
+    kokkoscore_impl
+    NOINSTALLHEADERS ${HEADERS}
+    SOURCES ${SOURCES}
+    DEPLIBS 
+    )
+
+SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
+
+INSTALL(FILES ${HEADERS} DESTINATION ${TRILINOS_INCDIR}/impl/)
+
diff --git a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..258fd0787288e7cf7f5a73e95f2fc3fe217f3950
--- /dev/null
+++ b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
@@ -0,0 +1,2897 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HOST_EXP_ITERATE_TILE_HPP
+#define KOKKOS_HOST_EXP_ITERATE_TILE_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_ENABLE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
+#define KOKKOS_MDRANGE_IVDEP
+#endif
+
+#ifdef KOKKOS_MDRANGE_IVDEP
+ #define KOKKOS_ENABLE_IVDEP_MDRANGE _Pragma("ivdep")
+#else
+ #define KOKKOS_ENABLE_IVDEP_MDRANGE
+#endif
+
+#include <iostream>
+#include <algorithm>
+#include <cstdio>
+
+namespace Kokkos { namespace Impl {
+
+// Temporary, for testing new loop macros
+#define KOKKOS_ENABLE_NEW_LOOP_MACROS 1
+
+
+#define LOOP_1L(type, tile) \
+  KOKKOS_ENABLE_IVDEP_MDRANGE \
+  for( type i0=0; i0<static_cast<type>(tile[0]); ++i0)
+
+#define LOOP_2L(type, tile) \
+  for( type i1=0; i1<static_cast<type>(tile[1]); ++i1) \
+  LOOP_1L(type, tile)
+
+#define LOOP_3L(type, tile) \
+  for( type i2=0; i2<static_cast<type>(tile[2]); ++i2) \
+  LOOP_2L(type, tile)
+
+#define LOOP_4L(type, tile) \
+  for( type i3=0; i3<static_cast<type>(tile[3]); ++i3) \
+  LOOP_3L(type, tile)
+
+#define LOOP_5L(type, tile) \
+  for( type i4=0; i4<static_cast<type>(tile[4]); ++i4) \
+  LOOP_4L(type, tile)
+
+#define LOOP_6L(type, tile) \
+  for( type i5=0; i5<static_cast<type>(tile[5]); ++i5) \
+  LOOP_5L(type, tile)
+
+#define LOOP_7L(type, tile) \
+  for( type i6=0; i6<static_cast<type>(tile[6]); ++i6) \
+  LOOP_6L(type, tile)
+
+#define LOOP_8L(type, tile) \
+  for( type i7=0; i7<static_cast<type>(tile[7]); ++i7) \
+  LOOP_7L(type, tile)
+
+
+#define LOOP_1R(type, tile) \
+  KOKKOS_ENABLE_IVDEP_MDRANGE \
+  for ( type i0=0; i0<static_cast<type>(tile[0]); ++i0 )
+
+#define LOOP_2R(type, tile) \
+  LOOP_1R(type, tile) \
+  for ( type i1=0; i1<static_cast<type>(tile[1]); ++i1 )
+
+#define LOOP_3R(type, tile) \
+  LOOP_2R(type, tile) \
+  for ( type i2=0; i2<static_cast<type>(tile[2]); ++i2 )
+
+#define LOOP_4R(type, tile) \
+  LOOP_3R(type, tile) \
+  for ( type i3=0; i3<static_cast<type>(tile[3]); ++i3 )
+
+#define LOOP_5R(type, tile) \
+  LOOP_4R(type, tile) \
+  for ( type i4=0; i4<static_cast<type>(tile[4]); ++i4 )
+
+#define LOOP_6R(type, tile) \
+  LOOP_5R(type, tile) \
+  for ( type i5=0; i5<static_cast<type>(tile[5]); ++i5 )
+
+#define LOOP_7R(type, tile) \
+  LOOP_6R(type, tile) \
+  for ( type i6=0; i6<static_cast<type>(tile[6]); ++i6 )
+
+#define LOOP_8R(type, tile) \
+  LOOP_7R(type, tile) \
+  for ( type i7=0; i7<static_cast<type>(tile[7]); ++i7 )
+
+
+#define LOOP_ARGS_1 i0 + m_offset[0]
+#define LOOP_ARGS_2 LOOP_ARGS_1, i1 + m_offset[1]
+#define LOOP_ARGS_3 LOOP_ARGS_2, i2 + m_offset[2]
+#define LOOP_ARGS_4 LOOP_ARGS_3, i3 + m_offset[3]
+#define LOOP_ARGS_5 LOOP_ARGS_4, i4 + m_offset[4]
+#define LOOP_ARGS_6 LOOP_ARGS_5, i5 + m_offset[5]
+#define LOOP_ARGS_7 LOOP_ARGS_6, i6 + m_offset[6]
+#define LOOP_ARGS_8 LOOP_ARGS_7, i7 + m_offset[7]
+
+
+// New Loop Macros...
+// parallel_for, non-tagged
+#define APPLY( func, ... ) \
+  func( __VA_ARGS__ );
+
+// LayoutRight
+// d = 0 to start
+#define LOOP_R_1( func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    APPLY( func, __VA_ARGS__, i0 + m_offset[d] )              \
+  }
+
+#define LOOP_R_2( func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    LOOP_R_1( func, type, m_offset, extent, d+1 , __VA_ARGS__, i1 + m_offset[d] ) \
+  }
+
+#define LOOP_R_3( func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    LOOP_R_2( func, type, m_offset, extent, d+1 , __VA_ARGS__, i2 + m_offset[d] ) \
+  }
+
+#define LOOP_R_4( func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    LOOP_R_3( func, type, m_offset, extent, d+1 , __VA_ARGS__, i3 + m_offset[d] ) \
+  }
+
+#define LOOP_R_5( func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    LOOP_R_4( func, type, m_offset, extent, d+1 , __VA_ARGS__, i4 + m_offset[d] ) \
+  }
+
+#define LOOP_R_6( func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    LOOP_R_5( func, type, m_offset, extent, d+1 , __VA_ARGS__, i5 + m_offset[d] ) \
+  }
+
+#define LOOP_R_7( func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    LOOP_R_6( func, type, m_offset, extent, d+1 , __VA_ARGS__, i6 + m_offset[d] ) \
+  }
+
+#define LOOP_R_8( func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    LOOP_R_7( func, type, m_offset, extent, d+1 , __VA_ARGS__, i7 + m_offset[d] ) \
+  }
+
+//LayoutLeft
+// d = rank-1 to start
+#define LOOP_L_1( func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    APPLY( func, i0 + m_offset[d] , __VA_ARGS__ )              \
+  }
+
+#define LOOP_L_2( func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    LOOP_L_1( func, type, m_offset, extent, d-1, i1 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_3( func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    LOOP_L_2( func, type, m_offset, extent, d-1, i2 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_4( func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    LOOP_L_3( func, type, m_offset, extent, d-1, i3 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_5( func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    LOOP_L_4( func, type, m_offset, extent, d-1, i4 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_6( func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    LOOP_L_5( func, type, m_offset, extent, d-1, i5 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_7( func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    LOOP_L_6( func, type, m_offset, extent, d-1, i6 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_8( func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    LOOP_L_7( func, type, m_offset, extent, d-1, i7 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// Left vs Right
+// TODO: rank not necessary to pass through, can hardcode the values
+#define LOOP_LAYOUT_1( func, type, is_left, m_offset, extent, rank )  \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
+    APPLY( func, i0 + m_offset[0] )              \
+  }
+
+#define LOOP_LAYOUT_2( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[rank-1]); ++i1) {   \
+      LOOP_L_1( func, type, m_offset, extent, rank-2, i1 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
+      LOOP_R_1( func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
+    } \
+  }
+
+#define LOOP_LAYOUT_3( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[rank-1]); ++i2) {   \
+      LOOP_L_2( func, type, m_offset, extent, rank-2, i2 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
+      LOOP_R_2( func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
+    } \
+  }
+
+#define LOOP_LAYOUT_4( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[rank-1]); ++i3) {   \
+      LOOP_L_3( func, type, m_offset, extent, rank-2, i3 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
+      LOOP_R_3( func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
+    } \
+  }
+
+#define LOOP_LAYOUT_5( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[rank-1]); ++i4) {   \
+      LOOP_L_4( func, type, m_offset, extent, rank-2, i4 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
+      LOOP_R_4( func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
+    } \
+  }
+
+#define LOOP_LAYOUT_6( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[rank-1]); ++i5) {   \
+      LOOP_L_5( func, type, m_offset, extent, rank-2, i5 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
+      LOOP_R_5( func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
+    } \
+  }
+
+#define LOOP_LAYOUT_7( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[rank-1]); ++i6) {   \
+      LOOP_L_6( func, type, m_offset, extent, rank-2, i6 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
+      LOOP_R_6( func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
+    } \
+  }
+
+#define LOOP_LAYOUT_8( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[rank-1]); ++i7) {   \
+      LOOP_L_7( func, type, m_offset, extent, rank-2, i7 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
+      LOOP_R_7( func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
+    } \
+  }
+
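+// For a rank-2 tile, LOOP_LAYOUT_2 above expands roughly to (LayoutRight branch)
+//
+//   for( type i1 = 0; i1 < extent[0]; ++i1 )
+//     for( type i0 = 0; i0 < extent[1]; ++i0 )
+//       func( i1 + m_offset[0] , i0 + m_offset[1] );
+//
+// i.e. the last dimension is the innermost (fastest) loop, while the LayoutLeft
+// branch reverses the nesting so dimension 0 is innermost.
+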
+// Partial vs Full Tile
+#define TILE_LOOP_1( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_1( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_1( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_2( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_2( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_2( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_3( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_3( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_3( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_4( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_4( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_4( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_5( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_5( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_5( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_6( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_6( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_6( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_7( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_7( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_7( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_8( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_8( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_8( func, type, is_left, m_offset, extent_partial, rank ) }
+
+
+// parallel_reduce, non-tagged
+// Reduction version
+#define APPLY_REDUX( val, func, ... ) \
+  func( __VA_ARGS__, val );
+
+// LayoutRight
+// d = 0 to start
+#define LOOP_R_1_REDUX( val, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    APPLY_REDUX( val, func, __VA_ARGS__, i0 + m_offset[d] )              \
+  }
+
+#define LOOP_R_2_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    LOOP_R_1_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i1 + m_offset[d] ) \
+  }
+
+#define LOOP_R_3_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    LOOP_R_2_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i2 + m_offset[d] ) \
+  }
+
+#define LOOP_R_4_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    LOOP_R_3_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i3 + m_offset[d] ) \
+  }
+
+#define LOOP_R_5_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    LOOP_R_4_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i4 + m_offset[d] ) \
+  }
+
+#define LOOP_R_6_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    LOOP_R_5_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i5 + m_offset[d] ) \
+  }
+
+#define LOOP_R_7_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    LOOP_R_6_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i6 + m_offset[d] ) \
+  }
+
+#define LOOP_R_8_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    LOOP_R_7_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i7 + m_offset[d] ) \
+  }
+
+//LayoutLeft
+// d = rank-1 to start
+#define LOOP_L_1_REDUX( val, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    APPLY_REDUX( val, func, i0 + m_offset[d] , __VA_ARGS__ )              \
+  }
+
+#define LOOP_L_2_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    LOOP_L_1_REDUX( val, func, type, m_offset, extent, d-1, i1 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_3_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    LOOP_L_2_REDUX( val, func, type, m_offset, extent, d-1, i2 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_4_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    LOOP_L_3_REDUX( val, func, type, m_offset, extent, d-1, i3 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_5_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    LOOP_L_4_REDUX( val, func, type, m_offset, extent, d-1, i4 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_6_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    LOOP_L_5_REDUX( val, func, type, m_offset, extent, d-1, i5 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_7_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    LOOP_L_6_REDUX( val, func, type, m_offset, extent, d-1, i6 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_8_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    LOOP_L_7_REDUX( val, func, type, m_offset, extent, d-1, i7 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// Left vs Right
+#define LOOP_LAYOUT_1_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
+    APPLY_REDUX( val, func, i0 + m_offset[0] )              \
+  }
+
+#define LOOP_LAYOUT_2_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[rank-1]); ++i1) {   \
+      LOOP_L_1_REDUX( val, func, type, m_offset, extent, rank-2, i1 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
+      LOOP_R_1_REDUX( val, func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
+    } \
+  }
+
+#define LOOP_LAYOUT_3_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[rank-1]); ++i2) {   \
+      LOOP_L_2_REDUX( val, func, type, m_offset, extent, rank-2, i2 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
+      LOOP_R_2_REDUX( val, func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
+    } \
+  }
+
+#define LOOP_LAYOUT_4_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[rank-1]); ++i3) {   \
+      LOOP_L_3_REDUX( val, func, type, m_offset, extent, rank-2, i3 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
+      LOOP_R_3_REDUX( val, func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
+    } \
+  }
+
+#define LOOP_LAYOUT_5_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[rank-1]); ++i4) {   \
+      LOOP_L_4_REDUX( val, func, type, m_offset, extent, rank-2, i4 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
+      LOOP_R_4_REDUX( val, func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
+    } \
+  }
+
+#define LOOP_LAYOUT_6_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[rank-1]); ++i5) {   \
+      LOOP_L_5_REDUX( val, func, type, m_offset, extent, rank-2, i5 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
+      LOOP_R_5_REDUX( val, func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
+    } \
+  }
+
+#define LOOP_LAYOUT_7_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[rank-1]); ++i6) {   \
+      LOOP_L_6_REDUX( val, func, type, m_offset, extent, rank-2, i6 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
+      LOOP_R_6_REDUX( val, func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
+    } \
+  }
+
+#define LOOP_LAYOUT_8_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[rank-1]); ++i7) {   \
+      LOOP_L_7_REDUX( val, func, type, m_offset, extent, rank-2, i7 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
+      LOOP_R_7_REDUX( val, func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
+    } \
+  }
+
+// Partial vs Full Tile
+#define TILE_LOOP_1_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_1_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_1_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_2_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_2_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_2_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_3_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_3_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_3_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_4_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_4_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_4_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_5_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_5_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_5_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_6_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_6_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_6_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_7_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_7_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_7_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_8_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_8_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_8_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+// end New Loop Macros
+
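+// Illustrative expansion (a sketch, not part of the macro machinery): for a
+// rank-2 LayoutRight reduction, LOOP_LAYOUT_2_REDUX expands to roughly
+//
+//   for( type i1 = 0; i1 < extent[0]; ++i1 )
+//     for( type i0 = 0; i0 < extent[1]; ++i0 )
+//       func( i1 + m_offset[0], i0 + m_offset[1], val );
+//
+// i.e. the *_REDUX variants differ from the parallel_for macros only in that
+// the reduction value val is appended as the last functor argument.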
+
+// tagged macros
+#define TAGGED_APPLY( tag, func, ... ) \
+  func( tag, __VA_ARGS__ );
+
+// LayoutRight
+// d = 0 to start
+#define TAGGED_LOOP_R_1( tag, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    TAGGED_APPLY( tag, func, __VA_ARGS__, i0 + m_offset[d] )              \
+  }
+
+#define TAGGED_LOOP_R_2( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    TAGGED_LOOP_R_1( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i1 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_3( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    TAGGED_LOOP_R_2( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i2 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_4( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    TAGGED_LOOP_R_3( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i3 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_5( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    TAGGED_LOOP_R_4( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i4 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_6( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    TAGGED_LOOP_R_5( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i5 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_7( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    TAGGED_LOOP_R_6( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i6 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_8( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    TAGGED_LOOP_R_7( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i7 + m_offset[d] ) \
+  }
+
+// LayoutLeft
+// d = rank-1 to start
+#define TAGGED_LOOP_L_1( tag, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    TAGGED_APPLY( tag, func, i0 + m_offset[d] , __VA_ARGS__ )              \
+  }
+
+#define TAGGED_LOOP_L_2( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    TAGGED_LOOP_L_1( tag, func, type, m_offset, extent, d-1, i1 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_3( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    TAGGED_LOOP_L_2( tag, func, type, m_offset, extent, d-1, i2 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_4( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    TAGGED_LOOP_L_3( tag, func, type, m_offset, extent, d-1, i3 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_5( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    TAGGED_LOOP_L_4( tag, func, type, m_offset, extent, d-1, i4 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_6( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    TAGGED_LOOP_L_5( tag, func, type, m_offset, extent, d-1, i5 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_7( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    TAGGED_LOOP_L_6( tag, func, type, m_offset, extent, d-1, i6 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_8( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    TAGGED_LOOP_L_7( tag, func, type, m_offset, extent, d-1, i7 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// Left vs Right
+// TODO: rank does not need to be passed through; the values could be hardcoded
+#define TAGGED_LOOP_LAYOUT_1( tag, func, type, is_left, m_offset, extent, rank )  \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
+    TAGGED_APPLY( tag, func, i0 + m_offset[0] )              \
+  }
+
+#define TAGGED_LOOP_LAYOUT_2( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[rank-1]); ++i1) {   \
+      TAGGED_LOOP_L_1( tag, func, type, m_offset, extent, rank-2, i1 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
+      TAGGED_LOOP_R_1( tag, func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
+    } \
+  }
+
+#define TAGGED_LOOP_LAYOUT_3( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[rank-1]); ++i2) {   \
+      TAGGED_LOOP_L_2( tag, func, type, m_offset, extent, rank-2, i2 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
+      TAGGED_LOOP_R_2( tag, func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
+    } \
+  }
+
+#define TAGGED_LOOP_LAYOUT_4( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[rank-1]); ++i3) {   \
+      TAGGED_LOOP_L_3( tag, func, type, m_offset, extent, rank-2, i3 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
+      TAGGED_LOOP_R_3( tag, func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
+    } \
+  }
+
+#define TAGGED_LOOP_LAYOUT_5( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[rank-1]); ++i4) {   \
+      TAGGED_LOOP_L_4( tag, func, type, m_offset, extent, rank-2, i4 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
+      TAGGED_LOOP_R_4( tag, func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
+    } \
+  }
+
+#define TAGGED_LOOP_LAYOUT_6( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[rank-1]); ++i5) {   \
+      TAGGED_LOOP_L_5( tag, func, type, m_offset, extent, rank-2, i5 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
+      TAGGED_LOOP_R_5( tag, func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
+    } \
+  }
+
+#define TAGGED_LOOP_LAYOUT_7( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[rank-1]); ++i6) {   \
+      TAGGED_LOOP_L_6( tag, func, type, m_offset, extent, rank-2, i6 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
+      TAGGED_LOOP_R_6( tag, func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
+    } \
+  }
+
+#define TAGGED_LOOP_LAYOUT_8( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[rank-1]); ++i7) {   \
+      TAGGED_LOOP_L_7( tag, func, type, m_offset, extent, rank-2, i7 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
+      TAGGED_LOOP_R_7( tag, func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
+    } \
+  }
+
+// Partial vs Full Tile
+#define TAGGED_TILE_LOOP_1( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_1( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_1( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_2( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_2( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_2( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_3( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_3( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_3( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_4( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_4( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_4( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_5( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_5( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_5( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_6( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_6( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_6( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_7( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_7( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_7( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_8( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_8( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_8( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
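+// Illustrative note (a sketch): the TAGGED_* variants prepend a
+// default-constructed work tag as the first functor argument, so for rank 2
+// the call site is roughly  func( Tag(), i_outer, i_inner );  assuming the
+// functor declares  operator()( Tag, index_type, index_type ) const.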
+
+// parallel_reduce, tagged
+// Reduction version
+#define TAGGED_APPLY_REDUX( val, tag, func, ... ) \
+  func( tag, __VA_ARGS__, val );
+
+// LayoutRight
+// d = 0 to start
+#define TAGGED_LOOP_R_1_REDUX( val, tag, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    TAGGED_APPLY_REDUX( val, tag, func, __VA_ARGS__, i0 + m_offset[d] )              \
+  }
+
+#define TAGGED_LOOP_R_2_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    TAGGED_LOOP_R_1_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i1 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_3_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    TAGGED_LOOP_R_2_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i2 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_4_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    TAGGED_LOOP_R_3_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i3 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_5_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    TAGGED_LOOP_R_4_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i4 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_6_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    TAGGED_LOOP_R_5_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i5 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_7_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    TAGGED_LOOP_R_6_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i6 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_8_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    TAGGED_LOOP_R_7_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i7 + m_offset[d] ) \
+  }
+
+// LayoutLeft
+// d = rank-1 to start
+#define TAGGED_LOOP_L_1_REDUX( val, tag, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    TAGGED_APPLY_REDUX( val, tag, func, i0 + m_offset[d] , __VA_ARGS__ )              \
+  }
+
+#define TAGGED_LOOP_L_2_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    TAGGED_LOOP_L_1_REDUX( val, tag, func, type, m_offset, extent, d-1, i1 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_3_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    TAGGED_LOOP_L_2_REDUX( val, tag, func, type, m_offset, extent, d-1, i2 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_4_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    TAGGED_LOOP_L_3_REDUX( val, tag, func, type, m_offset, extent, d-1, i3 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_5_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    TAGGED_LOOP_L_4_REDUX( val, tag, func, type, m_offset, extent, d-1, i4 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_6_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    TAGGED_LOOP_L_5_REDUX( val, tag, func, type, m_offset, extent, d-1, i5 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_7_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    TAGGED_LOOP_L_6_REDUX( val, tag, func, type, m_offset, extent, d-1, i6 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_8_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    TAGGED_LOOP_L_7_REDUX( val, tag, func, type, m_offset, extent, d-1, i7 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// Left vs Right
+#define TAGGED_LOOP_LAYOUT_1_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
+    TAGGED_APPLY_REDUX( val, tag, func, i0 + m_offset[0] )              \
+  }
+
+#define TAGGED_LOOP_LAYOUT_2_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[rank-1]); ++i1) {   \
+      TAGGED_LOOP_L_1_REDUX( val, tag, func, type, m_offset, extent, rank-2, i1 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
+      TAGGED_LOOP_R_1_REDUX( val, tag, func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
+    } \
+  }
+
+#define TAGGED_LOOP_LAYOUT_3_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[rank-1]); ++i2) {   \
+      TAGGED_LOOP_L_2_REDUX( val, tag, func, type, m_offset, extent, rank-2, i2 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
+      TAGGED_LOOP_R_2_REDUX( val, tag, func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
+    } \
+  }
+
+#define TAGGED_LOOP_LAYOUT_4_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[rank-1]); ++i3) {   \
+      TAGGED_LOOP_L_3_REDUX( val, tag, func, type, m_offset, extent, rank-2, i3 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
+      TAGGED_LOOP_R_3_REDUX( val, tag, func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
+    } \
+  }
+
+#define TAGGED_LOOP_LAYOUT_5_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[rank-1]); ++i4) {   \
+      TAGGED_LOOP_L_4_REDUX( val, tag, func, type, m_offset, extent, rank-2, i4 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
+      TAGGED_LOOP_R_4_REDUX( val, tag, func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
+    } \
+  }
+
+#define TAGGED_LOOP_LAYOUT_6_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[rank-1]); ++i5) {   \
+      TAGGED_LOOP_L_5_REDUX( val, tag, func, type, m_offset, extent, rank-2, i5 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
+      TAGGED_LOOP_R_5_REDUX( val, tag, func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
+    } \
+  }
+
+#define TAGGED_LOOP_LAYOUT_7_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[rank-1]); ++i6) {   \
+      TAGGED_LOOP_L_6_REDUX( val, tag, func, type, m_offset, extent, rank-2, i6 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
+      TAGGED_LOOP_R_6_REDUX( val, tag, func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
+    } \
+  }
+
+#define TAGGED_LOOP_LAYOUT_8_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[rank-1]); ++i7) {   \
+      TAGGED_LOOP_L_7_REDUX( val, tag, func, type, m_offset, extent, rank-2, i7 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
+      TAGGED_LOOP_R_7_REDUX( val, tag, func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
+    } \
+  }
+
+// Partial vs Full Tile
+#define TAGGED_TILE_LOOP_1_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_1_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_1_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_2_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_2_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_2_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_3_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_3_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_3_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_4_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_4_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_4_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_5_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_5_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_5_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_6_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_6_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_6_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_7_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_7_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_7_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_8_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_8_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_8_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+// end tagged macros
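+// Illustrative note (a sketch): the tagged reduction macros combine the two
+// conventions above, i.e. the call site is roughly
+//   func( Tag(), i..., val );
+// with the work tag first and the reduction value last.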
+
+
+// Structs for calling loops
+template < int Rank, bool IsLeft, typename IType, typename Tagged, typename Enable = void >
+struct Tile_Loop_Type;
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<1, IsLeft, IType, void, void >
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_1( func, IType, IsLeft, cond, offset, a, b, 1 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_1_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 1 );
+  }
+};
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<2, IsLeft, IType, void, void>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_2( func, IType, IsLeft, cond, offset, a, b, 2 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_2_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 2 );
+  }
+};
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<3, IsLeft, IType, void, void>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_3( func, IType, IsLeft, cond, offset, a, b, 3 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_3_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 3 );
+  }
+};
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<4, IsLeft, IType, void, void>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_4( func, IType, IsLeft, cond, offset, a, b, 4 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_4_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 4 );
+  }
+};
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<5, IsLeft, IType, void, void>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_5( func, IType, IsLeft, cond, offset, a, b, 5 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_5_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 5 );
+  }
+};
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<6, IsLeft, IType, void, void>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_6( func, IType, IsLeft, cond, offset, a, b, 6 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_6_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 6 );
+  }
+};
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<7, IsLeft, IType, void, void>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_7( func, IType, IsLeft, cond, offset, a, b, 7 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_7_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 7 );
+  }
+};
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<8, IsLeft, IType, void, void>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_8( func, IType, IsLeft, cond, offset, a, b, 8 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_8_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 8 );
+  }
+};
+
+// tagged versions
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<1, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type >
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_1( Tagged(), func, IType, IsLeft, cond, offset, a, b, 1 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_1_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 1 );
+  }
+};
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<2, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_2( Tagged(), func, IType, IsLeft, cond, offset, a, b, 2 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_2_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 2 );
+  }
+};
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<3, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_3( Tagged(), func, IType, IsLeft, cond, offset, a, b, 3 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_3_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 3 );
+  }
+};
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<4, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_4( Tagged(), func, IType, IsLeft, cond, offset, a, b, 4 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_4_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 4 );
+  }
+};
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<5, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_5( Tagged(), func, IType, IsLeft, cond, offset, a, b, 5 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_5_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 5 );
+  }
+};
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<6, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_6( Tagged(), func, IType, IsLeft, cond, offset, a, b, 6 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_6_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 6 );
+  }
+};
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<7, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_7( Tagged(), func, IType, IsLeft, cond, offset, a, b, 7 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_7_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 7 );
+  }
+};
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_8( Tagged(), func, IType, IsLeft, cond, offset, a, b, 8 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_8_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 8 );
+  }
+};
+// end Structs for calling loops
+
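+// Usage sketch (hypothetical values, for illustration only): a rank-3
+// LayoutRight parallel_for tile is driven through
+//   Tile_Loop_Type<3, /*IsLeft=*/false, index_type, void>
+//     ::apply( functor, is_full_tile, tile_offset, full_extents, partial_extents );
+// which selects TILE_LOOP_3 and, via LOOP_LAYOUT_3, the appropriate
+// right- or left-ordered loop nest over the full or partial tile extents.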
+
+template <typename T>
+using is_void_type = std::is_same< T , void >;
+
+template <typename T>
+struct is_type_array : std::false_type 
+{
+  using value_type = T;
+};
+
+template <typename T>
+struct is_type_array< T[] > : std::true_type
+{
+  using value_type = T;
+};
+
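+// Illustrative note: is_type_array distinguishes scalar from array reductions,
+// e.g. is_type_array<double>::value is false while is_type_array<double[]>::value
+// is true; in both cases the nested value_type is the element type (double),
+// so the array extent is stripped in the array case.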
+
+template < typename RP
+         , typename Functor
+         , typename Tag = void
+         , typename ValueType = void
+         , typename Enable = void
+         >
+struct HostIterateTile;
+
+// For ParallelFor
+template < typename RP
+         , typename Functor
+         , typename Tag
+         , typename ValueType
+         >
+struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< is_void_type<ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using point_type = typename RP::point_type;
+
+  using value_type = ValueType;
+
+  inline
+  HostIterateTile( RP const& rp, Functor const& func )
+    : m_rp(rp)
+    , m_func(func)
+  {
+  }
+
+  inline
+  bool check_iteration_bounds( point_type& partial_tile , point_type& offset ) const {
+    bool is_full_tile = true;
+
+      for ( int i = 0; i < RP::rank; ++i ) {
+        if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) {
+            partial_tile[i] = m_rp.m_tile[i] ;
+        }
+        else {
+          is_full_tile = false ;
+            partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
+                            : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
+                            : (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
+        }
+      }
+
+    return is_full_tile ;
+  } // end check bounds
+
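+  // Illustrative example (hypothetical numbers): with m_lower[i] = 0,
+  // m_upper[i] = 10, m_tile[i] = 4 and a tile starting at offset[i] = 8, the
+  // check above sets partial_tile[i] = 10 - 8 = 2, i.e. the trailing tile
+  // only iterates indices 8..9 in that dimension.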
+
+  template <int Rank>
+  struct RankTag
+  {
+    typedef RankTag type;
+    enum { value = (int)Rank };
+  };
+
+#if KOKKOS_ENABLE_NEW_LOOP_MACROS
+  template <typename IType>
+  inline
+  void
+  operator()(IType tile_idx) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
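+    // Illustrative example (hypothetical numbers): for rank 2 with
+    // m_tile_end = {3, 5} and outer_direction == Right, tile_idx = 7 decodes
+    // to tile coordinates (1, 2), so each m_offset[i] becomes
+    // tile_coordinate[i] * m_tile[i] + m_lower[i].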
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
+
+  }
+
+#else
+  template <typename IType>
+  inline
+  void
+  operator()(IType tile_idx) const
+  { operator_impl( tile_idx , RankTag<RP::rank>() ); }
+  // Added due to a compiler error when using SFINAE to choose the operator based on rank with CUDA+Serial
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<2> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_2L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_2L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_2R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_2R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 2
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<3> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_3L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_3L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_3R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_3R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 3
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<4> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_4L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_4L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_4R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_4R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 4
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<5> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_5L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_5L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_5R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_5R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 5
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<6> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_6L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_6L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_6R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_6R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 6
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<7> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_7L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_7L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_7R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_7R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 7
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<8> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_8L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_8L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_8R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_8R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 8
+#endif
+
+
+    template <typename... Args>
+    typename std::enable_if<( sizeof...(Args) == RP::rank && std::is_same<Tag,void>::value), void>::type
+    apply(Args &&... args) const
+    {
+      m_func(args...);
+    }
+
+    template <typename... Args>
+    typename std::enable_if<( sizeof...(Args) == RP::rank && !std::is_same<Tag,void>::value), void>::type
+    apply(Args &&... args) const
+    {
+      m_func( m_tag, args...);
+    }
+
+
+  RP         const& m_rp;
+  Functor    const& m_func;
+  typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
+};
+
+
+// For ParallelReduce
+// ValueType is a scalar: used for reductions
+template < typename RP
+         , typename Functor
+         , typename Tag
+         , typename ValueType
+         >
+struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void_type<ValueType >::value && !is_type_array<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using point_type = typename RP::point_type;
+
+  using value_type = ValueType;
+
+  inline
+  HostIterateTile( RP const& rp, Functor const& func, value_type & v )
+    : m_rp(rp) // CUDA 7.0 does not accept brace initialization here...
+    , m_func(func)
+    , m_v(v) // use with the non-void ValueType specialization
+  {
+// Errors occur when braces rather than parentheses are used for initialization (with CUDA 7.0):
+//      /home/ndellin/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp:1216:98: error: too many braces around initializer for ‘int’ [-fpermissive]
+//      /home/ndellin/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp:1216:98: error: aggregate value used where an integer was expected
+  }
+
+  inline
+  bool check_iteration_bounds( point_type& partial_tile , point_type& offset ) const {
+    bool is_full_tile = true;
+
+      for ( int i = 0; i < RP::rank; ++i ) {
+        if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) {
+            partial_tile[i] = m_rp.m_tile[i] ;
+        }
+        else {
+          is_full_tile = false ;
+            partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
+                            : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
+                            : (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
+        }
+      }
+
+    return is_full_tile ;
+  } // end check bounds
+
+
+  template <int Rank>
+  struct RankTag
+  {
+    typedef RankTag type;
+    enum { value = (int)Rank };
+  };
+
+
+#if KOKKOS_ENABLE_NEW_LOOP_MACROS
+  template <typename IType>
+  inline
+  void
+  operator()(IType tile_idx) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_v, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
+
+  }
+
+#else
+  template <typename IType>
+  inline
+  void
+  operator()(IType tile_idx) const
+  { operator_impl( tile_idx , RankTag<RP::rank>() ); }
+  // added due to compiler error when using sfinae to choose operator based on rank
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<2> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_2L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_2L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_2R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_2R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 2
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<3> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_3L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_3L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_3R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_3R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 3
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<4> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_4L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_4L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_4R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_4R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 4
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<5> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_5L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_5L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_5R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_5R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 5
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<6> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_6L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_6L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_6R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_6R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 6
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<7> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_7L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_7L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_7R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_7R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 7
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<8> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_8L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_8L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_8R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_8R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 8
+#endif
+
+
+    template <typename... Args>
+    typename std::enable_if<( sizeof...(Args) == RP::rank && std::is_same<Tag,void>::value), void>::type
+    apply(Args &&... args) const
+    {
+      m_func(args... , m_v);
+    }
+
+    template <typename... Args>
+    typename std::enable_if<( sizeof...(Args) == RP::rank && !std::is_same<Tag,void>::value), void>::type
+    apply(Args &&... args) const
+    {
+      m_func( m_tag, args... , m_v);
+    }
+
+
+  RP         const& m_rp;
+  Functor    const& m_func;
+  value_type  & m_v;
+  typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
+
+};
+
+
+// For ParallelReduce
+// Extra specialization for array reductions
+// ValueType[]: For array reductions
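+// (Here m_v points to the caller-provided array of reduction values and, as in
+//  the scalar case, is passed as the final argument of each functor call.)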
+template < typename RP
+         , typename Functor
+         , typename Tag
+         , typename ValueType
+         >
+struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void_type<ValueType >::value && is_type_array<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using point_type = typename RP::point_type;
+
+  using value_type = typename is_type_array<ValueType>::value_type; // strip away the 'array-ness' [], only underlying type remains
+
+  inline
+  HostIterateTile( RP const& rp, Functor const& func, value_type *v ) // v should be an array; treat it as a pointer here since its size is neither known nor needed
+    : m_rp(rp) //Cuda 7.0 does not like braces...
+    , m_func(func)
+    , m_v(v) // use with non-void ValueType struct
+  {}
+
+  inline
+  bool check_iteration_bounds( point_type& partial_tile , point_type& offset ) const {
+    bool is_full_tile = true;
+
+      for ( int i = 0; i < RP::rank; ++i ) {
+        if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) {
+            partial_tile[i] = m_rp.m_tile[i] ;
+        }
+        else {
+          is_full_tile = false ;
+            partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
+                            : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
+                            : (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
+        }
+      }
+
+    return is_full_tile ;
+  } // end check bounds
+
+
+  template <int Rank>
+  struct RankTag
+  {
+    typedef RankTag type;
+    enum { value = (int)Rank };
+  };
+
+
+#if KOKKOS_ENABLE_NEW_LOOP_MACROS
+  template <typename IType>
+  inline
+  void
+  operator()(IType tile_idx) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_v, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
+
+  }
+
+#else
+  template <typename IType>
+  inline
+  void
+  operator()(IType tile_idx) const
+  { operator_impl( tile_idx , RankTag<RP::rank>() ); }
+  // added due to compiler error when using sfinae to choose operator based on rank
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<2> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_2L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_2L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_2R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_2R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 2
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<3> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_3L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_3L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_3R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_3R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 3
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<4> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_4L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_4L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_4R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_4R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 4
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<5> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_5L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_5L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_5R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_5R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 5
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<6> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_6L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_6L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_6R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_6R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 6
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<7> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_7L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_7L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_7R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_7R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 7
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<8> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_8L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_8L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_8R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_8R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 8
+#endif
+
+
+    template <typename... Args>
+    typename std::enable_if<( sizeof...(Args) == RP::rank && std::is_same<Tag,void>::value), void>::type
+    apply(Args &&... args) const
+    {
+      m_func(args... , m_v);
+    }
+
+    template <typename... Args>
+    typename std::enable_if<( sizeof...(Args) == RP::rank && !std::is_same<Tag,void>::value), void>::type
+    apply(Args &&... args) const
+    {
+      m_func( m_tag, args... , m_v);
+    }
+
+
+  RP         const& m_rp;
+  Functor    const& m_func;
+  value_type * m_v;
+  typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
+
+};
+
+
+// ------------------------------------------------------------------ //
+
+// MDFunctor - wraps the range_policy and functor to pass to IterateTile
+// Used for md_parallel_{for,reduce} with Serial, Threads, OpenMP
+// Cuda uses DeviceIterateTile directly within md_parallel_for
+// TODO Once md_parallel_{for,reduce} removed, this can be removed
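+//
+// Illustrative sketch only (not part of this header): a host backend's
+// md_parallel_for conceptually builds
+//   MDFunctor< MDRangePolicy<...>, UserFunctor, void > closure( policy, functor );
+// and then launches an ordinary 1-D parallel_for over the tile indices, so each
+// closure(tile_idx) call iterates one tile through HostIterateTile.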
+
+namespace Experimental { 
+
+// ParallelReduce - scalar reductions
+template < typename MDRange, typename Functor, typename ValueType = void >
+struct MDFunctor
+{
+  using range_policy = MDRange;
+  using functor_type = Functor;
+  using value_type   = ValueType;
+  using work_tag     = typename range_policy::work_tag;
+  using index_type   = typename range_policy::index_type;
+  using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
+                                                             , Functor
+                                                             , work_tag
+                                                             , value_type
+                                                             >;
+
+
+  inline
+  MDFunctor( MDRange const& range, Functor const& f )
+    : m_range( range )
+    , m_func( f )
+  {}
+
+  inline
+  MDFunctor( MDFunctor const& ) = default;
+
+  inline
+  MDFunctor& operator=( MDFunctor const& ) = default;
+
+  inline
+  MDFunctor( MDFunctor && ) = default;
+
+  inline
+  MDFunctor& operator=( MDFunctor && ) = default;
+
+  inline
+  void operator()(index_type t, value_type & v) const
+  {
+    iterate_type(m_range, m_func, v)(t);
+  }
+
+  MDRange   m_range;
+  Functor   m_func;
+};
+
+
+// ParallelReduce - array reductions 
+template < typename MDRange, typename Functor, typename ValueType >
+struct MDFunctor< MDRange, Functor, ValueType[] >
+{
+  using range_policy = MDRange;
+  using functor_type = Functor;
+  using value_type   = ValueType[];
+  using work_tag     = typename range_policy::work_tag;
+  using index_type   = typename range_policy::index_type;
+  using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
+                                                             , Functor
+                                                             , work_tag
+                                                             , value_type
+                                                             >;
+
+
+  inline
+  MDFunctor( MDRange const& range, Functor const& f )
+    : m_range( range )
+    , m_func( f )
+    , value_count( f.value_count )
+  {}
+
+  inline
+  MDFunctor( MDFunctor const& ) = default;
+
+  inline
+  MDFunctor& operator=( MDFunctor const& ) = default;
+
+  inline
+  MDFunctor( MDFunctor && ) = default;
+
+  inline
+  MDFunctor& operator=( MDFunctor && ) = default;
+
+  // FIXME Init and Join, as defined in m_func, are not working through the MDFunctor
+  // Best path forward is to eliminate the need for MDFunctor and use MDRangePolicy directly within Parallel{For,Reduce}?
+  inline
+  void operator()(index_type t, value_type v) const
+  {
+    iterate_type(m_range, m_func, v)(t);
+  }
+
+  MDRange   m_range;
+  Functor   m_func;
+  size_t    value_count;
+};
+
+
+// ParallelFor
+template < typename MDRange, typename Functor >
+struct MDFunctor< MDRange, Functor, void >
+{
+  using range_policy = MDRange;
+  using functor_type = Functor;
+  using work_tag     = typename range_policy::work_tag;
+  using index_type   = typename range_policy::index_type;
+  using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
+                                                             , Functor
+                                                             , work_tag
+                                                             , void
+                                                             >;
+
+
+  inline
+  MDFunctor( MDRange const& range, Functor const& f )
+    : m_range( range )
+    , m_func( f )
+  {}
+
+  inline
+  MDFunctor( MDFunctor const& ) = default;
+
+  inline
+  MDFunctor& operator=( MDFunctor const& ) = default;
+
+  inline
+  MDFunctor( MDFunctor && ) = default;
+
+  inline
+  MDFunctor& operator=( MDFunctor && ) = default;
+
+  inline
+  void operator()(index_type t) const
+  {
+    iterate_type(m_range, m_func)(t);
+  }
+
+  MDRange m_range;
+  Functor m_func;
+};
+
+} // end namespace Experimental
+#undef KOKKOS_ENABLE_NEW_LOOP_MACROS
+
+} } //end namespace Kokkos::Impl
+
+#endif
diff --git a/packages/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp b/packages/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0bfc5841dfb0487419c20c9849cb00f68db3cef9
--- /dev/null
+++ b/packages/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Deprecated file for backward compatibility
+
+#include <impl/Kokkos_ViewMapping.hpp>
diff --git a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d27c2e13061fc33d3bec1f1b9d3955448db70a1a
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
@@ -0,0 +1,229 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_ANALYZE_POLICY_HPP
+#define KOKKOS_IMPL_ANALYZE_POLICY_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Concepts.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+namespace Kokkos { namespace Impl {
+
+template < typename ExecutionSpace   = void
+         , typename Schedule         = void
+         , typename WorkTag          = void
+         , typename IndexType        = void
+         , typename IterationPattern = void
+         , typename LaunchBounds     = void
+         >
+struct PolicyTraitsBase
+{
+  using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, 
+               IterationPattern, LaunchBounds>;
+
+  using execution_space   = ExecutionSpace;
+  using schedule_type     = Schedule;
+  using work_tag          = WorkTag;
+  using index_type        = IndexType;
+  using iteration_pattern = IterationPattern;
+  using launch_bounds     = LaunchBounds;
+};
+
+
+template <typename PolicyBase, typename ExecutionSpace>
+struct SetExecutionSpace
+{
+  static_assert( is_void<typename PolicyBase::execution_space>::value
+               , "Kokkos Error: More than one execution space given" );
+  using type = PolicyTraitsBase< ExecutionSpace
+                               , typename PolicyBase::schedule_type
+                               , typename PolicyBase::work_tag
+                               , typename PolicyBase::index_type
+                               , typename PolicyBase::iteration_pattern
+                               , typename PolicyBase::launch_bounds
+                               >;
+};
+
+template <typename PolicyBase, typename Schedule>
+struct SetSchedule
+{
+  static_assert( is_void<typename PolicyBase::schedule_type>::value
+               , "Kokkos Error: More than one schedule type given" );
+  using type = PolicyTraitsBase< typename PolicyBase::execution_space
+                               , Schedule
+                               , typename PolicyBase::work_tag
+                               , typename PolicyBase::index_type
+                               , typename PolicyBase::iteration_pattern
+                               , typename PolicyBase::launch_bounds
+                               >;
+};
+
+template <typename PolicyBase, typename WorkTag>
+struct SetWorkTag
+{
+  static_assert( is_void<typename PolicyBase::work_tag>::value
+               , "Kokkos Error: More than one work tag given" );
+  using type = PolicyTraitsBase< typename PolicyBase::execution_space
+                               , typename PolicyBase::schedule_type
+                               , WorkTag
+                               , typename PolicyBase::index_type
+                               , typename PolicyBase::iteration_pattern
+                               , typename PolicyBase::launch_bounds
+                               >;
+};
+
+template <typename PolicyBase, typename IndexType>
+struct SetIndexType
+{
+  static_assert( is_void<typename PolicyBase::index_type>::value
+               , "Kokkos Error: More than one index type given" );
+  using type = PolicyTraitsBase< typename PolicyBase::execution_space
+                               , typename PolicyBase::schedule_type
+                               , typename PolicyBase::work_tag
+                               , IndexType
+                               , typename PolicyBase::iteration_pattern
+                               , typename PolicyBase::launch_bounds
+                               >;
+};
+
+
+template <typename PolicyBase, typename IterationPattern>
+struct SetIterationPattern
+{
+  static_assert( is_void<typename PolicyBase::iteration_pattern>::value
+               , "Kokkos Error: More than one iteration_pattern given" );
+  using type = PolicyTraitsBase< typename PolicyBase::execution_space
+                               , typename PolicyBase::schedule_type
+                               , typename PolicyBase::work_tag
+                               , typename PolicyBase::index_type
+                               , IterationPattern
+                               , typename PolicyBase::launch_bounds
+                               >;
+};
+
+
+template <typename PolicyBase, typename LaunchBounds>
+struct SetLaunchBounds
+{
+  static_assert( is_void<typename PolicyBase::launch_bounds>::value
+               , "Kokkos Error: More than one launch_bounds given" );
+  using type = PolicyTraitsBase< typename PolicyBase::execution_space
+                               , typename PolicyBase::schedule_type
+                               , typename PolicyBase::work_tag
+                               , typename PolicyBase::index_type
+                               , typename PolicyBase::iteration_pattern
+                               , LaunchBounds
+                               >;
+};
+
+
+template <typename Base, typename... Traits>
+struct AnalyzePolicy;
+
+template <typename Base, typename T, typename... Traits>
+struct AnalyzePolicy<Base, T, Traits...> : public
+  AnalyzePolicy<
+      typename std::conditional< is_execution_space<T>::value  , SetExecutionSpace<Base,T>
+    , typename std::conditional< is_schedule_type<T>::value    , SetSchedule<Base,T>
+    , typename std::conditional< is_index_type<T>::value       , SetIndexType<Base,T>
+    , typename std::conditional< std::is_integral<T>::value    , SetIndexType<Base, IndexType<T> >
+    , typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T>
+    , typename std::conditional< is_launch_bounds<T>::value    , SetLaunchBounds<Base,T>
+    , SetWorkTag<Base,T>
+    >::type >::type >::type >::type >::type>::type::type
+  , Traits...
+  >
+{};
+
+template <typename Base>
+struct AnalyzePolicy<Base>
+{
+  using execution_space = typename std::conditional< is_void< typename Base::execution_space >::value
+                                                   , DefaultExecutionSpace
+                                                   , typename Base::execution_space
+                                                   >::type;
+
+  using schedule_type = typename std::conditional< is_void< typename Base::schedule_type >::value
+                                                 , Schedule< Static >
+                                                 , typename Base::schedule_type
+                                                 >::type;
+
+  using work_tag = typename Base::work_tag;
+
+  using index_type = typename std::conditional< is_void< typename Base::index_type >::value
+                                              , IndexType< typename execution_space::size_type >
+                                              , typename Base::index_type
+                                              >::type
+                                               ::type // nasty hack to make index_type into an integral_type
+                                              ;       // instead of the wrapped IndexType<T> for backwards compatibility
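+                                              // (illustration: when Base::index_type is IndexType<long>, the inner ::type
+                                              //  selects IndexType<long> and the outer ::type unwraps it to plain 'long')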
+
+  using iteration_pattern = typename std::conditional< is_void< typename Base::iteration_pattern >::value
+                                                     , void // TODO set default iteration pattern
+                                                     , typename Base::iteration_pattern
+                                                     >::type;
+
+  using launch_bounds = typename std::conditional< is_void< typename Base::launch_bounds >::value
+                                                     , LaunchBounds<>
+                                                     , typename Base::launch_bounds
+                                                     >::type;
+
+  using type = PolicyTraitsBase< execution_space
+                               , schedule_type
+                               , work_tag
+                               , index_type
+                               , iteration_pattern
+                               , launch_bounds
+                               >;
+};
+
+template <typename... Traits>
+struct PolicyTraits
+  : public AnalyzePolicy< PolicyTraitsBase<>, Traits... >::type
+{};
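+
+// A hedged usage illustration (not part of this header): the trait arguments are
+// order independent, e.g.
+//   using P = PolicyTraits< Kokkos::IndexType<int>, Kokkos::Serial >;
+// resolves P::execution_space to Kokkos::Serial and P::index_type to int, while
+// any unset slot falls back to the defaults chosen in AnalyzePolicy<Base> above.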
+
+}} // namespace Kokkos::Impl
+
+
+#endif //KOKKOS_IMPL_ANALYZE_POLICY_HPP
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Assembly.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Assembly.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c94309162219adf1250e84f00aa38d3e600d8866
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Assembly.hpp
@@ -0,0 +1,115 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_ASSEMBLY_HPP )
+#define KOKKOS_ATOMIC_ASSEMBLY_HPP
+namespace Kokkos {
+
+namespace Impl {
+  struct cas128_t
+  {
+    uint64_t lower;
+    uint64_t upper;
+
+    KOKKOS_INLINE_FUNCTION
+    cas128_t () {
+      lower = 0;
+      upper = 0;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    cas128_t (const cas128_t& a) {
+      lower = a.lower;
+      upper = a.upper;
+    }
+    KOKKOS_INLINE_FUNCTION
+    cas128_t (volatile cas128_t* a) {
+      lower = a->lower;
+      upper = a->upper;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    bool operator != (const cas128_t& a) const {
+      return (lower != a.lower) || upper!=a.upper;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator = (const cas128_t& a) {
+      lower = a.lower;
+      upper = a.upper;
+    }
+    KOKKOS_INLINE_FUNCTION
+    void operator = (const cas128_t& a) volatile {
+      lower = a.lower;
+      upper = a.upper;
+    }
+  }
+  __attribute__ (( __aligned__( 16 ) ));
+
+
+  #if defined( KOKKOS_ENABLE_ASM ) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
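+  // 'lock cmpxchg16b' compares RDX:RAX (cmp) with the 16-byte memory operand:
+  // on a match it stores RCX:RBX (swap) and sets ZF, otherwise it loads the
+  // observed memory value into RDX:RAX.  Either way 'cmp' ends up holding the
+  // previous contents of *ptr, which is what cas128() returns.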
+  inline cas128_t cas128( volatile cas128_t * ptr, cas128_t cmp,  cas128_t swap )
+  {
+      bool swapped = false;
+      __asm__ __volatile__
+      (
+       "lock cmpxchg16b %1\n\t"
+       "setz %0"
+       : "=q" ( swapped )
+       , "+m" ( *ptr )
+       , "+d" ( cmp.upper )
+       , "+a" ( cmp.lower )
+       : "c" ( swap.upper )
+       , "b" ( swap.lower )
+       , "q" ( swapped )
+     );
+      return cmp;
+  }
+  #endif
+
+}
+}
+
+#endif
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ad115dd8ffc96efc25a9ac2fd316d13a0b58dfc3
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
@@ -0,0 +1,313 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+#include <xmmintrin.h>
+#endif
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
+#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
+
+#if defined(KOKKOS_ENABLE_CUDA)
+#include<Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
+#endif
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+// Cuda native CAS supports int, unsigned int, and unsigned long long int (non-standard type).
+// Must cast away 'volatile' for the CAS call.
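+// For other 4- and 8-byte types the overloads below reinterpret the bytes through
+// int / unsigned long long int before calling atomicCAS; types of any other size
+// fall back to a per-address lock (see the final overload in this block).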
+
+#if defined( KOKKOS_ENABLE_CUDA )
+
+#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+__inline__ __device__
+int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
+{ return atomicCAS((int*)dest,compare,val); }
+
+__inline__ __device__
+unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val)
+{ return atomicCAS((unsigned int*)dest,compare,val); }
+
+__inline__ __device__
+unsigned long long int atomic_compare_exchange( volatile unsigned long long int * const dest ,
+                                                const unsigned long long int compare ,
+                                                const unsigned long long int val )
+{ return atomicCAS((unsigned long long int*)dest,compare,val); }
+
+template < typename T >
+__inline__ __device__
+T atomic_compare_exchange( volatile T * const dest , const T & compare ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+  const int tmp = atomicCAS( (int*) dest , *((int*)&compare) , *((int*)&val) );
+  return *((T*)&tmp);
+}
+
+template < typename T >
+__inline__ __device__
+T atomic_compare_exchange( volatile T * const dest , const T & compare ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
+{
+  typedef unsigned long long int type ;
+  const type tmp = atomicCAS( (type*) dest , *((type*)&compare) , *((type*)&val) );
+  return *((T*)&tmp);
+}
+
+template < typename T >
+__inline__ __device__
+T atomic_compare_exchange( volatile T * const dest , const T & compare ,
+    typename Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+             , const T >::type& val )
+{
+  T return_val;
+  // This is a way to (hopefully) avoid deadlock in a warp
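+  // Only the lanes that acquire the per-address lock perform the update in a
+  // given pass; KOKKOS_IMPL_CUDA_BALLOT gathers which lanes are active and which
+  // have finished, so the loop exits only once every active lane is done and
+  // divergent lanes cannot deadlock on each other.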
+  int done = 0;
+  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
+  unsigned int done_active = 0;
+  while (active!=done_active) {
+    if(!done) {
+      if( Impl::lock_address_cuda_space( (void*) dest ) ) {
+        return_val = *dest;
+        if( return_val == compare )
+          *dest = val;
+        Impl::unlock_address_cuda_space( (void*) dest );
+        done = 1;
+      }
+    }
+    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
+  }
+  return return_val;
+}
+#endif
+#endif
+
+//----------------------------------------------------------------------------
+// GCC native CAS supports int, long, unsigned int, unsigned long.
+// Intel native CAS supports int and long with the same interface as GCC.
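+// __sync_val_compare_and_swap( ptr, expected, desired ) stores 'desired' only if
+// *ptr equals 'expected', and in every case returns the value *ptr held before
+// the call.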
+#if !defined(KOKKOS_ENABLE_ROCM_ATOMICS)
+#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
+
+inline
+int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_val_compare_and_swap(dest,compare,val);
+}
+
+inline
+long atomic_compare_exchange( volatile long * const dest, const long compare, const long val )
+{ 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_val_compare_and_swap(dest,compare,val);
+}
+
+#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
+
+// GCC supports unsigned
+
+inline
+unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val )
+{ return __sync_val_compare_and_swap(dest,compare,val); }
+
+inline
+unsigned long atomic_compare_exchange( volatile unsigned long * const dest ,
+                                       const unsigned long compare ,
+                                       const unsigned long val )
+{ return __sync_val_compare_and_swap(dest,compare,val); }
+
+#endif
+
+template < typename T >
+inline
+T atomic_compare_exchange( volatile T * const dest, const T & compare,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+  union U {
+    int i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } tmp ;
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  tmp.i = __sync_val_compare_and_swap( (int*) dest , *((int*)&compare) , *((int*)&val) );
+  return tmp.t ;
+}
+
+template < typename T >
+inline
+T atomic_compare_exchange( volatile T * const dest, const T & compare,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(long) , const T & >::type val )
+{
+  union U {
+    long i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } tmp ;
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  tmp.i = __sync_val_compare_and_swap( (long*) dest , *((long*)&compare) , *((long*)&val) );
+  return tmp.t ;
+}
+
+#if defined( KOKKOS_ENABLE_ASM) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
+template < typename T >
+inline
+T atomic_compare_exchange( volatile T * const dest, const T & compare,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) != sizeof(long) &&
+                                    sizeof(T) == sizeof(Impl::cas128_t), const T & >::type val )
+{
+  union U {
+    Impl::cas128_t i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } tmp ;
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  tmp.i = Impl::cas128( (Impl::cas128_t*) dest , *((Impl::cas128_t*)&compare) , *((Impl::cas128_t*)&val) );
+  return tmp.t ;
+}
+#endif
+
+template < typename T >
+inline
+T atomic_compare_exchange( volatile T * const dest , const T compare ,
+    typename Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+            #if defined(KOKKOS_ENABLE_ASM) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
+               && ( sizeof(T) != 16 )
+            #endif
+             , const T >::type& val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  while( !Impl::lock_address_host_space( (void*) dest ) );
+  T return_val = *dest;
+  if( return_val == compare ) {
+    // Don't use the following line of code here:
+    //
+    //const T tmp = *dest = val;
+    //
+    // Instead, put each assignment in its own statement.  This is
+    // because the overload of T::operator= for volatile *this should
+    // return void, not volatile T&.  See Kokkos #177:
+    //
+    // https://github.com/kokkos/kokkos/issues/177
+    *dest = val;
+    const T tmp = *dest;
+    #ifndef KOKKOS_COMPILER_CLANG
+    (void) tmp;
+    #endif
+  }
+  Impl::unlock_address_host_space( (void*) dest );
+  return return_val;
+}
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ENABLE_OPENMP_ATOMICS )
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange( volatile T * const dest, const T compare, const T val )
+{
+  T retval;
+#pragma omp critical
+  {
+    retval = dest[0];
+    if ( retval == compare )
+        dest[0] = val;
+  }
+  return retval;
+}
+
+#elif defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange( volatile T * const dest_v, const T compare, const T val )
+{
+  T* dest = const_cast<T*>(dest_v);
+  T retval = *dest;
+  if (retval == compare) *dest = val;
+  return retval;
+}
+
+#endif
+#endif
+#endif // !defined ROCM_ATOMICS
+
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
+{
+  return compare == atomic_compare_exchange(dest, compare, val);
+}
+//----------------------------------------------------------------------------
+
+} // namespace Kokkos
+
+#endif
+
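Editorial note: for reference, a minimal sketch of how the compare-and-swap loops used throughout the overloads above compose into a higher-level atomic update. The helper name `atomic_max_sketch` and its surroundings are illustrative only (not part of this patch), and it assumes an arithmetic `T` with `operator<` and that `<Kokkos_Core.hpp>` (which pulls in these atomic headers) is included.

#include <Kokkos_Core.hpp>

template <typename T>
T atomic_max_sketch(volatile T* dest, T val) {
  T observed = *dest;
  while (observed < val) {
    // Try to install val; atomic_compare_exchange returns the value seen at dest.
    const T prev = Kokkos::atomic_compare_exchange(dest, observed, val);
    if (prev == observed) break;  // exchange succeeded
    observed = prev;              // lost the race; re-test against the newer value
  }
  return observed;  // last value observed before the (possible) successful exchange
}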
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0f3dc52fcec33ab994c301b21f5906bc2a832b15
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp
@@ -0,0 +1,155 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+#include <xmmintrin.h>
+#endif
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_DECREMENT_HPP )
+#define KOKKOS_ATOMIC_DECREMENT_HPP
+
+#include "impl/Kokkos_Atomic_Fetch_Sub.hpp"
+
+namespace Kokkos {
+
+// Atomic decrement
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<char>(volatile char* a) {
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) a, _MM_HINT_ET0 );
+#endif
+  __asm__ __volatile__(
+      "lock decb %0"
+      : /* no output registers */
+      : "m" (a[0])
+      : "memory"
+    );
+#elif defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+  char* a_nv = const_cast<char*>(a);
+  --(*a_nv);
+#else
+  Kokkos::atomic_fetch_sub(a, char(1));
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<short>(volatile short* a) {
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) a, _MM_HINT_ET0 );
+#endif
+  __asm__ __volatile__(
+      "lock decw %0"
+      : /* no output registers */
+      : "m" (a[0])
+      : "memory"
+    );
+#elif defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+  short* a_nv = const_cast<short*>(a);
+  --(*a_nv);
+#else
+  Kokkos::atomic_fetch_sub(a, short(1));
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<int>(volatile int* a) {
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) a, _MM_HINT_ET0 );
+#endif
+  __asm__ __volatile__(
+      "lock decl %0"
+      : /* no output registers */
+      : "m" (a[0])
+      : "memory"
+    );
+#elif defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+  int* a_nv = const_cast<int*>(a);
+  --(*a_nv);
+#else
+  Kokkos::atomic_fetch_sub(a, int(1));
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<long long int>(volatile long long int* a) {
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) a, _MM_HINT_ET0 );
+#endif
+  __asm__ __volatile__(
+      "lock decq %0"
+      : /* no output registers */
+      : "m" (a[0])
+      : "memory"
+    );
+#elif defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+  long long int* a_nv = const_cast<long long int*>(a);
+  --(*a_nv);
+#else
+  using T = long long int;
+  Kokkos::atomic_fetch_sub(a, T(1));
+#endif
+}
+
+template<typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement(volatile T* a) {
+#if defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+  T* a_nv = const_cast<T*>(a);
+  --(*a_nv);
+#else
+  Kokkos::atomic_fetch_sub(a, T(1));
+#endif
+}
+
+} // End of namespace Kokkos
+#endif
+
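Editorial note: a minimal usage sketch (the `consume_token` helper and the shared budget are hypothetical). `atomic_decrement` is the right choice when the previous value is not needed, since the x86 path above can then issue a plain `lock dec` instead of a full fetch-and-subtract.

void consume_token(volatile int* budget) {
  // Old value is discarded, so the cheaper decrement form is sufficient.
  Kokkos::atomic_decrement(budget);
}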
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..801a8091ddcba27ca09be73951cca8ae00e7d2ca
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
@@ -0,0 +1,415 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+#include <xmmintrin.h>
+#endif
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP )
+#define KOKKOS_ATOMIC_EXCHANGE_HPP
+
+#if defined(KOKKOS_ENABLE_CUDA)
+#include<Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
+#endif
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_CUDA )
+#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+
+__inline__ __device__
+int atomic_exchange( volatile int * const dest , const int val )
+{
+  // return __iAtomicExch( (int*) dest , val );
+  return atomicExch( (int*) dest , val );
+}
+
+__inline__ __device__
+unsigned int atomic_exchange( volatile unsigned int * const dest , const unsigned int val )
+{
+  // return __uAtomicExch( (unsigned int*) dest , val );
+  return atomicExch( (unsigned int*) dest , val );
+}
+
+__inline__ __device__
+unsigned long long int atomic_exchange( volatile unsigned long long int * const dest , const unsigned long long int val )
+{
+  // return __ullAtomicExch( (unsigned long long*) dest , val );
+  return atomicExch( (unsigned long long*) dest , val );
+}
+
+/** \brief  Atomic exchange for any type with compatible size */
+template< typename T >
+__inline__ __device__
+T atomic_exchange(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+  // int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) );
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  int tmp = atomicExch( ((int*)dest) , *((int*)&val) );
+  return *((T*)&tmp);
+}
+
+template< typename T >
+__inline__ __device__
+T atomic_exchange(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
+{
+  typedef unsigned long long int type ;
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  // type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) );
+  type tmp = atomicExch( ((type*)dest) , *((type*)&val) );
+  return *((T*)&tmp);
+}
+
+template < typename T >
+__inline__ __device__
+T atomic_exchange( volatile T * const dest ,
+    typename Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+             , const T >::type& val )
+{
+  T return_val;
+  // This is a way to (hopefully) avoid deadlock within a warp
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  int done = 0;
+  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
+  unsigned int done_active = 0;
+  while (active!=done_active) {
+    if(!done) {
+      if( Impl::lock_address_cuda_space( (void*) dest ) ) {
+        return_val = *dest;
+        *dest = val;
+        Impl::unlock_address_cuda_space( (void*) dest );
+        done = 1;
+      }
+    }
+    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
+  }
+  return return_val;
+}
+/** \brief  Atomic exchange for any type with compatible size */
+template< typename T >
+__inline__ __device__
+void atomic_assign(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+  // (void) __ullAtomicExch( (int*) dest , *((int*)&val) );
+  (void) atomicExch( ((int*)dest) , *((int*)&val) );
+}
+
+template< typename T >
+__inline__ __device__
+void atomic_assign(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
+{
+  typedef unsigned long long int type ;
+  // (void) __ullAtomicExch( (type*) dest , *((type*)&val) );
+  (void) atomicExch( ((type*)dest) , *((type*)&val) );
+}
+
+template< typename T >
+__inline__ __device__
+void atomic_assign(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) != sizeof(unsigned long long int)
+                                  , const T & >::type val )
+{
+  (void) atomic_exchange(dest,val);
+}
+
+#endif
+#endif
+
+//----------------------------------------------------------------------------
+
+#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
+
+template< typename T >
+inline
+T atomic_exchange( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
+                                  , const T & >::type val )
+{
+  typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  const type v = *((type*)&val); // Extract to be sure the value doesn't change
+
+  type assumed ;
+
+  union U {
+    T val_T ;
+    type val_type ;
+    inline U() {};
+  } old ;
+
+  old.val_T = *dest ;
+
+  do {
+    assumed = old.val_type ;
+    old.val_type = __sync_val_compare_and_swap( (volatile type *) dest , assumed , v );
+  } while ( assumed != old.val_type );
+
+  return old.val_T ;
+}
+
+#if defined(KOKKOS_ENABLE_ASM) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
+template< typename T >
+inline
+T atomic_exchange( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
+                                  , const T & >::type val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  union U {
+    Impl::cas128_t i ;
+    T t ;
+    inline U() {};
+  } assume , oldval , newval ;
+
+  oldval.t = *dest ;
+  newval.t = val;
+
+  do {
+    assume.i = oldval.i ;
+    oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+#endif
+
+//----------------------------------------------------------------------------
+
+template < typename T >
+inline
+T atomic_exchange( volatile T * const dest ,
+    typename Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+              #if defined(KOKKOS_ENABLE_ASM) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
+               && ( sizeof(T) != 16 )
+              #endif
+                 , const T >::type& val )
+{
+  while( !Impl::lock_address_host_space( (void*) dest ) );
+  T return_val = *dest;
+  // Don't use the following line of code here:
+  //
+  //const T tmp = *dest = val;
+  //
+  // Instead, put each assignment in its own statement.  This is
+  // because the overload of T::operator= for volatile *this should
+  // return void, not volatile T&.  See Kokkos #177:
+  //
+  // https://github.com/kokkos/kokkos/issues/177
+  *dest = val;
+  const T tmp = *dest;
+  #ifndef KOKKOS_COMPILER_CLANG
+  (void) tmp;
+  #endif
+  Impl::unlock_address_host_space( (void*) dest );
+  return return_val;
+}
+
+template< typename T >
+inline
+void atomic_assign( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
+                                  , const T & >::type val )
+{
+  typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  const type v = *((type*)&val); // Extract to be sure the value doesn't change
+
+  type assumed ;
+
+  union U {
+    T val_T ;
+    type val_type ;
+    inline U() {};
+  } old ;
+
+  old.val_T = *dest ;
+
+  do {
+    assumed = old.val_type ;
+    old.val_type = __sync_val_compare_and_swap( (volatile type *) dest , assumed , v );
+  } while ( assumed != old.val_type );
+}
+
+#if defined( KOKKOS_ENABLE_ASM ) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
+template< typename T >
+inline
+void atomic_assign( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
+                                  , const T & >::type val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  union U {
+    Impl::cas128_t i ;
+    T t ;
+    inline U() {};
+  } assume , oldval , newval ;
+
+  oldval.t = *dest ;
+  newval.t = val;
+  do {
+    assume.i = oldval.i ;
+    oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i);
+  } while ( assume.i != oldval.i );
+}
+#endif
+
+template < typename T >
+inline
+void atomic_assign( volatile T * const dest ,
+    typename Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+              #if defined(KOKKOS_ENABLE_ASM) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
+               && ( sizeof(T) != 16 )
+              #endif
+                 , const T >::type& val )
+{
+  while( !Impl::lock_address_host_space( (void*) dest ) );
+  // This is likely an aggregate type with a defined
+  // 'volatile T & operator = ( const T & ) volatile'
+  // member.  The volatile return value implicitly defines a
+  // dereference that some compilers (gcc 4.7.2) warn is being ignored.
+  // Suppress warning by casting return to void.
+  //(void)( *dest = val );
+  *dest = val;
+
+  Impl::unlock_address_host_space( (void*) dest );
+}
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ENABLE_OPENMP_ATOMICS )
+
+template < typename T >
+inline
+T atomic_exchange( volatile T * const dest , const T val )
+{
+  T retval;
+//#pragma omp atomic capture
+  #pragma omp critical
+  {
+    retval = dest[0];
+    dest[0] = val;
+  }
+  return retval;
+}
+
+template < typename T >
+inline
+void atomic_assign( volatile T * const dest , const T val )
+{
+//#pragma omp atomic
+  #pragma omp critical
+  {
+    dest[0] = val;
+  }
+}
+
+#elif defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+
+template < typename T >
+inline
+T atomic_exchange( volatile T * const dest_v , const T val )
+{
+  T* dest = const_cast<T*>(dest_v);
+  T retval = *dest;
+  *dest = val;
+  return retval;
+}
+
+template < typename T >
+inline
+void atomic_assign( volatile T * const dest_v , const T val )
+{
+  T* dest = const_cast<T*>(dest_v);
+  *dest = val;
+}
+
+#endif
+#endif
+} // namespace Kokkos
+
+#endif
+
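Editorial note: an illustrative sketch (the `try_claim` helper is hypothetical, not part of this patch). `atomic_exchange` returns the previous value, which makes it the natural primitive for a one-shot claim: only the caller that observes the old value 0 wins.

bool try_claim(volatile int* flag) {
  // Returns true exactly once across all concurrent callers.
  return Kokkos::atomic_exchange(flag, 1) == 0;
}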
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..55521e0c53c762715725519292c57dc91f50f9f3
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
@@ -0,0 +1,386 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+#include <xmmintrin.h>
+#endif
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
+#define KOKKOS_ATOMIC_FETCH_ADD_HPP
+
+#if defined(KOKKOS_ENABLE_CUDA)
+#include<Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
+#endif
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_CUDA )
+#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+
+// Native CUDA atomicAdd support for int, unsigned int, unsigned long long int,
+// and float (and double on compute capability 6.0 and newer)
+
+__inline__ __device__
+int atomic_fetch_add( volatile int * const dest , const int val )
+{ return atomicAdd((int*)dest,val); }
+
+__inline__ __device__
+unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
+{ return atomicAdd((unsigned int*)dest,val); }
+
+__inline__ __device__
+unsigned long long int atomic_fetch_add( volatile unsigned long long int * const dest ,
+                                         const unsigned long long int val )
+{ return atomicAdd((unsigned long long int*)dest,val); }
+
+__inline__ __device__
+float atomic_fetch_add( volatile float * const dest , const float val )
+{ return atomicAdd((float*)dest,val); }
+
+#if ( 600 <= __CUDA_ARCH__ )
+__inline__ __device__
+double atomic_fetch_add( volatile double * const dest , const double val )
+{ return atomicAdd((double*)dest,val); }
+#endif
+
+template < typename T >
+__inline__ __device__
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union U {
+    int i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } assume , oldval , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t + val ;
+    oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+template < typename T >
+__inline__ __device__
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+  union U {
+    unsigned long long int i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } assume , oldval , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t + val ;
+    oldval.i = atomicCAS( (unsigned long long int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+//----------------------------------------------------------------------------
+
+template < typename T >
+__inline__ __device__
+T atomic_fetch_add( volatile T * const dest ,
+    typename Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+             , const T >::type& val )
+{
+  T return_val;
+  // This is a way to (hopefully) avoid deadlock within a warp
+  int done = 0;
+  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
+  unsigned int done_active = 0;
+  while (active!=done_active) {
+    if(!done) {
+      bool locked = Impl::lock_address_cuda_space( (void*) dest );
+      if( locked ) {
+        return_val = *dest;
+        *dest = return_val + val;
+        Impl::unlock_address_cuda_space( (void*) dest );
+        done = 1;
+      }
+    }
+    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
+  }
+  return return_val;
+}
+#endif
+#endif
+//----------------------------------------------------------------------------
+#if !defined(KOKKOS_ENABLE_ROCM_ATOMICS)
+#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
+
+#if defined( KOKKOS_ENABLE_ASM ) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
+inline
+int atomic_fetch_add( volatile int * dest , const int val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) 
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  int original = val;
+
+  __asm__ __volatile__(
+  	"lock xadd %1, %0"
+        : "+m" (*dest), "+r" (original)
+        : "m" (*dest), "r" (original)
+        : "memory"
+        );
+
+  return original;
+}
+#else
+inline
+int atomic_fetch_add( volatile int * const dest , const int val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_add(dest, val);
+}
+#endif
+
+inline
+long int atomic_fetch_add( volatile long int * const dest , const long int val )
+{ 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_add(dest,val);
+}
+
+#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
+
+inline
+unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_add(dest,val);
+}
+
+inline
+unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val )
+{ 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_add(dest,val);
+}
+
+#endif
+
+template < typename T >
+inline
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union U {
+    int i ;
+    T t ;
+    inline U() {};
+  } assume , oldval , newval ;
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t + val ;
+    oldval.i = __sync_val_compare_and_swap( (int*) dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+template < typename T >
+inline
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(long) , const T >::type val )
+{
+  union U {
+    long i ;
+    T t ;
+    inline U() {};
+  } assume , oldval , newval ;
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t + val ;
+    oldval.i = __sync_val_compare_and_swap( (long*) dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+#if defined( KOKKOS_ENABLE_ASM ) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
+template < typename T >
+inline
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) != sizeof(long) &&
+                                    sizeof(T) == sizeof(Impl::cas128_t) , const T >::type val )
+{
+  union U {
+    Impl::cas128_t i ;
+    T t ;
+    inline U() {};
+  } assume , oldval , newval ;
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t + val ;
+    oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+#endif
+
+//----------------------------------------------------------------------------
+
+template < typename T >
+inline
+T atomic_fetch_add( volatile T * const dest ,
+    typename Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+              #if defined(KOKKOS_ENABLE_ASM) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
+               && ( sizeof(T) != 16 )
+              #endif
+                 , const T >::type& val )
+{
+  while( !Impl::lock_address_host_space( (void*) dest ) );
+  T return_val = *dest;
+
+  // Don't use the following line of code here:
+  //
+  //const T tmp = *dest = return_val + val;
+  //
+  // Instead, put each assignment in its own statement.  This is
+  // because the overload of T::operator= for volatile *this should
+  // return void, not volatile T&.  See Kokkos #177:
+  //
+  // https://github.com/kokkos/kokkos/issues/177
+  *dest = return_val + val;
+  const T tmp = *dest;
+  (void) tmp;
+  Impl::unlock_address_host_space( (void*) dest );
+
+  return return_val;
+}
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ENABLE_OPENMP_ATOMICS )
+
+template< typename T >
+T atomic_fetch_add( volatile T * const dest , const T val )
+{
+  T retval;
+#pragma omp atomic capture
+  {
+    retval = dest[0];
+    dest[0] += val;
+  }
+  return retval;
+}
+
+#elif defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+
+template< typename T >
+T atomic_fetch_add( volatile T * const dest_v , const T val )
+{
+  T* dest = const_cast<T*>(dest_v);
+  T retval = *dest;
+  *dest += val;
+  return retval;
+}
+
+#endif
+#endif
+#endif // !defined ROCM_ATOMICS
+//----------------------------------------------------------------------------
+
+// Simpler version of atomic_fetch_add without the fetch
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_add(volatile T * const dest, const T src) {
+  atomic_fetch_add(dest,src);
+}
+
+}
+#endif
+
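Editorial note: a small usage sketch (the `append` helper, buffer layout, and names are assumptions, not part of this patch). Because `atomic_fetch_add` returns the pre-increment value, it can be used to reserve a private slot in a shared buffer.

int append(volatile int* count, double* buffer, int capacity, double value) {
  const int idx = Kokkos::atomic_fetch_add(count, 1);  // idx is owned exclusively by this caller
  if (idx < capacity) buffer[idx] = value;
  return idx;
}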
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..044cbdf79ae74fe304de8fea159c7a18188289f0
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
@@ -0,0 +1,162 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+#include <xmmintrin.h>
+#endif
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP )
+#define KOKKOS_ATOMIC_FETCH_AND_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_CUDA )
+#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+
+// Native CUDA atomicAnd support for int, unsigned int, and unsigned long long int
+
+__inline__ __device__
+int atomic_fetch_and( volatile int * const dest , const int val )
+{ return atomicAnd((int*)dest,val); }
+
+__inline__ __device__
+unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
+{ return atomicAnd((unsigned int*)dest,val); }
+
+#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
+__inline__ __device__
+unsigned long long int atomic_fetch_and( volatile unsigned long long int * const dest ,
+                                         const unsigned long long int val )
+{ return atomicAnd((unsigned long long int*)dest,val); }
+#endif
+#endif
+#endif
+//----------------------------------------------------------------------------
+#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
+
+inline
+int atomic_fetch_and( volatile int * const dest , const int val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_and(dest,val);
+}
+
+inline
+long int atomic_fetch_and( volatile long int * const dest , const long int val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_and(dest,val);
+}
+
+#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
+
+inline
+unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
+{ 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_and(dest,val);
+}
+
+inline
+unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_and(dest,val);
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ENABLE_OPENMP_ATOMICS )
+
+template< typename T >
+T atomic_fetch_and( volatile T * const dest , const T val )
+{
+  T retval;
+#pragma omp atomic capture
+  {
+    retval = dest[0];
+    dest[0] &= val;
+  }
+  return retval;
+}
+
+#elif defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+
+template< typename T >
+T atomic_fetch_and( volatile T * const dest_v , const T val )
+{
+  T* dest = const_cast<T*>(dest_v);
+  T retval = *dest;
+  *dest &= val;
+  return retval;
+}
+
+#endif
+#endif
+//----------------------------------------------------------------------------
+
+// Simpler version of atomic_fetch_and without the fetch
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_and(volatile T * const dest, const T src) {
+  (void)atomic_fetch_and(dest,src);
+}
+
+}
+
+#endif
+
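Editorial note: an illustrative sketch (the `clear_flag` helper is hypothetical). `atomic_fetch_and` clears bits in a shared mask and returns the previous mask, so the caller can tell whether the bit was actually set beforehand.

bool clear_flag(volatile unsigned int* mask, unsigned int flag_bit) {
  // True if the bit was set prior to this call.
  return (Kokkos::atomic_fetch_and(mask, ~flag_bit) & flag_bit) != 0u;
}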
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0b8cbb1d8ce0777cd029fc70b93675e29e65e63a
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
@@ -0,0 +1,162 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+#include <xmmintrin.h>
+#endif
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP )
+#define KOKKOS_ATOMIC_FETCH_OR_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_CUDA )
+#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+
+// Native CUDA atomicOr support for int, unsigned int, and unsigned long long int
+
+__inline__ __device__
+int atomic_fetch_or( volatile int * const dest , const int val )
+{ return atomicOr((int*)dest,val); }
+
+__inline__ __device__
+unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
+{ return atomicOr((unsigned int*)dest,val); }
+
+#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
+__inline__ __device__
+unsigned long long int atomic_fetch_or( volatile unsigned long long int * const dest ,
+                                         const unsigned long long int val )
+{ return atomicOr((unsigned long long int*)dest,val); }
+#endif
+#endif
+#endif
+//----------------------------------------------------------------------------
+#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
+
+inline
+int atomic_fetch_or( volatile int * const dest , const int val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_or(dest,val);
+}
+
+inline
+long int atomic_fetch_or( volatile long int * const dest , const long int val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_or(dest,val);
+}
+
+#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
+
+inline
+unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_or(dest,val);
+}
+
+inline
+unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_or(dest,val);
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ENABLE_OPENMP_ATOMICS )
+
+template< typename T >
+T atomic_fetch_or( volatile T * const dest , const T val )
+{
+  T retval;
+#pragma omp atomic capture
+  {
+    retval = dest[0];
+    dest[0] |= val;
+  }
+  return retval;
+}
+
+#elif defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+
+template< typename T >
+T atomic_fetch_or( volatile T * const dest_v , const T val )
+{
+  T* dest = const_cast<T*>(dest_v);
+  T retval = *dest;
+  *dest |= val;
+  return retval;
+}
+
+#endif
+#endif
+//----------------------------------------------------------------------------
+
+// Simpler version of atomic_fetch_or without the fetch
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_or(volatile T * const dest, const T src) {
+  (void)atomic_fetch_or(dest,src);
+}
+
+}
+
+#endif
+
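Editorial note: the complementary sketch for setting bits (again with hypothetical helper names). The value returned by `atomic_fetch_or` identifies the first caller to set a given flag.

bool set_flag_first(volatile unsigned int* mask, unsigned int flag_bit) {
  // True only for the caller that transitions the bit from 0 to 1.
  return (Kokkos::atomic_fetch_or(mask, flag_bit) & flag_bit) == 0u;
}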
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..28aca0aeed25090ec2c6bb7655c9ead3757b841d
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
@@ -0,0 +1,295 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+#include <xmmintrin.h>
+#endif
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP )
+#define KOKKOS_ATOMIC_FETCH_SUB_HPP
+
+#if defined(KOKKOS_ENABLE_CUDA)
+#include<Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
+#endif
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_CUDA )
+#if defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+
+// Native CUDA atomicSub support for int and unsigned int; other sizes use the CAS loops below
+
+__inline__ __device__
+int atomic_fetch_sub( volatile int * const dest , const int val )
+{ return atomicSub((int*)dest,val); }
+
+__inline__ __device__
+unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val )
+{ return atomicSub((unsigned int*)dest,val); }
+
+template < typename T >
+__inline__ __device__
+T atomic_fetch_sub( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union { int i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t - val ;
+    oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+template < typename T >
+__inline__ __device__
+T atomic_fetch_sub( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+  union { unsigned long long int i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t - val ;
+    oldval.i = atomicCAS( (unsigned long long int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+
+//----------------------------------------------------------------------------
+
+template < typename T >
+__inline__ __device__
+T atomic_fetch_sub( volatile T * const dest ,
+    typename Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+             , const T >::type& val )
+{
+  T return_val;
+  // This is a way to (hopefully) avoid deadlock within a warp
+  int done = 0;
+  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
+  unsigned int done_active = 0;
+  while (active!=done_active) {
+    if(!done) {
+      if( Impl::lock_address_cuda_space( (void*) dest ) ) {
+        return_val = *dest;
+        *dest = return_val - val;
+        Impl::unlock_address_cuda_space( (void*) dest );
+        done = 1;
+      }
+    }
+    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
+  }
+  return return_val;
+}
+#endif
+#endif
+//----------------------------------------------------------------------------
+#if !defined(KOKKOS_ENABLE_ROCM_ATOMICS)
+#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
+#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
+
+inline
+int atomic_fetch_sub( volatile int * const dest , const int val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_sub(dest,val);
+}
+
+inline
+long int atomic_fetch_sub( volatile long int * const dest , const long int val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_sub(dest,val);
+}
+
+#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
+
+inline
+unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_sub(dest,val);
+}
+
+inline
+unsigned long int atomic_fetch_sub( volatile unsigned long int * const dest , const unsigned long int val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_fetch_and_sub(dest,val);
+}
+
+#endif
+
+template < typename T >
+inline
+T atomic_fetch_sub( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union { int i ; T t ; } assume , oldval , newval ;
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t - val ;
+    oldval.i = __sync_val_compare_and_swap( (int*) dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+template < typename T >
+inline
+T atomic_fetch_sub( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(long) , const T >::type val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  union { long i ; T t ; } assume , oldval , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t - val ;
+    oldval.i = __sync_val_compare_and_swap( (long*) dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+
+//----------------------------------------------------------------------------
+
+template < typename T >
+inline
+T atomic_fetch_sub( volatile T * const dest ,
+    typename Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+             , const T >::type& val )
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
+  while( !Impl::lock_address_host_space( (void*) dest ) );
+  T return_val = *dest;
+  *dest = return_val - val;
+  Impl::unlock_address_host_space( (void*) dest );
+  return return_val;
+}
+
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ENABLE_OPENMP_ATOMICS )
+
+template< typename T >
+T atomic_fetch_sub( volatile T * const dest , const T val )
+{
+  T retval;
+#pragma omp atomic capture
+  {
+    retval = dest[0];
+    dest[0] -= val;
+  }
+  return retval;
+}
+
+#elif defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+
+template< typename T >
+T atomic_fetch_sub( volatile T * const dest_v , const T val )
+{
+  T* dest = const_cast<T*>(dest_v);
+  T retval = *dest;
+  *dest -= val;
+  return retval;
+}
+
+#endif
+#endif
+#endif // !defined ROCM_ATOMICS
+
+// Simpler version of atomic_fetch_sub without the fetch
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_sub(volatile T * const dest, const T src) {
+  atomic_fetch_sub(dest,src);
+}
+
+}
+
+#include<impl/Kokkos_Atomic_Assembly.hpp>
+#endif
+
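Editorial note: a minimal sketch of the usual countdown pattern (the `arrive_and_check_last` helper is hypothetical). The caller that takes a pending-count from 1 to 0 sees the previous value 1 and can safely run any final cleanup.

bool arrive_and_check_last(volatile int* pending) {
  return Kokkos::atomic_fetch_sub(pending, 1) == 1;
}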
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6140d45896f5041bfb2c31b76e5c192e1179c77f
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
@@ -0,0 +1,439 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_GENERIC_HPP )
+#define KOKKOS_ATOMIC_GENERIC_HPP
+#include <Kokkos_Macros.hpp>
+
+#if defined(KOKKOS_ENABLE_CUDA)
+#include<Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
+#endif
+
+// Combination operators to be used in a compare-and-exchange based atomic operation
+namespace Kokkos {
+namespace Impl {
+
+template<class Scalar1, class Scalar2>
+struct MaxOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return (val1 > val2 ? val1 : val2);
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct MinOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return (val1 < val2 ? val1 : val2);
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct AddOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1+val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct SubOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1-val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct MulOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1*val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct DivOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1/val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct ModOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1%val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct AndOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1&val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct OrOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1|val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct XorOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1^val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct LShiftOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1<<val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct RShiftOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1>>val2;
+  }
+};
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+  union { unsigned long long int i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = Oper::apply(assume.t, val) ;
+    oldval.i = Kokkos::atomic_compare_exchange( (unsigned long long int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
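+
+// Illustrative sketch of the compare-and-swap retry loop used above, expressed
+// with std::atomic rather than the Kokkos internals (the name fetch_mul_sketch
+// and the use of std::atomic are assumptions for illustration only):
+//
+//   #include <atomic>
+//   double fetch_mul_sketch(std::atomic<double>& dest, double val) {
+//     double expected = dest.load();
+//     double desired;
+//     do {
+//       desired = expected * val;                        // Oper::apply(assume, val)
+//     } while (!dest.compare_exchange_weak(expected, desired));
+//     return expected;                                   // value before the update
+//   }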
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+  union { unsigned long long int i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = Oper::apply(assume.t, val) ;
+    oldval.i = Kokkos::atomic_compare_exchange( (unsigned long long int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return newval.t ;
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union { int i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = Oper::apply(assume.t, val) ;
+    oldval.i = Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int), const T >::type val )
+{
+  union { int i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = Oper::apply(assume.t, val) ;
+    oldval.i = Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return newval.t ;
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
+  typename Kokkos::Impl::enable_if<
+                ( sizeof(T) != 4 )
+             && ( sizeof(T) != 8 )
+          #if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
+             && ( sizeof(T) != 16 )
+          #endif
+           , const T >::type val )
+{
+
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+  while( !Impl::lock_address_host_space( (void*) dest ) );
+  T return_val = *dest;
+  *dest = Oper::apply(return_val, val);
+  Impl::unlock_address_host_space( (void*) dest );
+  return return_val;
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA)
+  // This is a way to (hopefully) avoid deadlock in a warp
+  T return_val;
+  int done = 0;
+  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
+  unsigned int done_active = 0;
+  while (active!=done_active) {
+    if(!done) {
+      if( Impl::lock_address_cuda_space( (void*) dest ) ) {
+        return_val = *dest;
+        *dest = Oper::apply(return_val, val);
+        Impl::unlock_address_cuda_space( (void*) dest );
+        done=1;
+      }
+    }
+    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
+  }
+  return return_val;
+#endif
+}
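+
+// Note on the CUDA path above: threads of a warp execute in lock-step, so a
+// naive per-thread spin on the same lock can deadlock.  The ballot-based loop
+// lets each thread keep retrying the lock acquisition while the warp as a
+// whole makes progress, and exits only once every initially active thread has
+// completed its update (active == done_active).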
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
+  typename Kokkos::Impl::enable_if<
+                ( sizeof(T) != 4 )
+             && ( sizeof(T) != 8 )
+          #if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
+             && ( sizeof(T) != 16 )
+          #endif
+           , const T >::type& val )
+{
+
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+  while( !Impl::lock_address_host_space( (void*) dest ) );
+  T return_val = Oper::apply(*dest, val);
+  *dest = return_val;
+  Impl::unlock_address_host_space( (void*) dest );
+  return return_val;
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA)
+  T return_val;
+  // This is a way to (hopefully) avoid deadlock in a warp
+  int done = 0;
+  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
+  unsigned int done_active = 0;
+  while (active!=done_active) {
+    if(!done) {
+      if( Impl::lock_address_cuda_space( (void*) dest ) ) {
+        return_val = Oper::apply(*dest, val);
+        *dest = return_val;
+        Impl::unlock_address_cuda_space( (void*) dest );
+        done=1;
+      }
+    }
+    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
+  }
+  return return_val;
+#endif
+}
+
+}
+}
+
+namespace Kokkos {
+
+// Fetch_Oper atomics: return value before operation
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_max(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::MaxOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_min(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::MinOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mul(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::MulOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_div(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::DivOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mod(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::ModOper<T,const T>(),dest,val);
+}
+
+#if !defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_and(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::AndOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_or(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::OrOper<T,const T>(),dest,val);
+}
+
+#endif
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_xor(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::XorOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_lshift(volatile T * const dest, const unsigned int val) {
+  return Impl::atomic_fetch_oper(Impl::LShiftOper<T,const unsigned int>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_rshift(volatile T * const dest, const unsigned int val) {
+  return Impl::atomic_fetch_oper(Impl::RShiftOper<T,const unsigned int>(),dest,val);
+}
+
+
+// Oper Fetch atomics: return value after operation
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_max_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::MaxOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_min_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::MinOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_mul_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::MulOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_div_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::DivOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_mod_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::ModOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_and_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::AndOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_or_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::OrOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_xor_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::XorOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_lshift_fetch(volatile T * const dest, const unsigned int val) {
+  return Impl::atomic_oper_fetch(Impl::LShiftOper<T,const unsigned int>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_rshift_fetch(volatile T * const dest, const unsigned int val) {
+  return Impl::atomic_oper_fetch(Impl::RShiftOper<T,const unsigned int>(),dest,val);
+}
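+
+// Illustrative usage sketch (a minimal sketch, assuming `x` points to a
+// device-accessible int):
+//
+//   int before = Kokkos::atomic_fetch_max(x, 42);  // value held before the max
+//   int after  = Kokkos::atomic_max_fetch(x, 42);  // value held after the max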
+
+} // namespace Kokkos
+#endif
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a1a8357b61313519865ca04a6c7d0624f26cfaec
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp
@@ -0,0 +1,153 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+#include <xmmintrin.h>
+#endif
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_INCREMENT_HPP )
+#define KOKKOS_ATOMIC_INCREMENT_HPP
+
+namespace Kokkos {
+
+// Atomic increment
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<char>(volatile char* a) {
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) a, _MM_HINT_ET0 );
+#endif
+  __asm__ __volatile__(
+      "lock incb %0"
+      : /* no output registers */
+      : "m" (a[0])
+      : "memory"
+    );
+#elif defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+  char* a_nv = const_cast<char*>(a);
+  ++(*a_nv);
+#else
+  Kokkos::atomic_fetch_add(a, char(1));
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<short>(volatile short* a) {
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) a, _MM_HINT_ET0 );
+#endif
+  __asm__ __volatile__(
+      "lock incw %0"
+      : /* no output registers */
+      : "m" (a[0])
+      : "memory"
+    );
+#elif defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+  short* a_nv = const_cast<short*>(a);
+  ++(*a_nv);
+#else
+  Kokkos::atomic_fetch_add(a, short(1));
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<int>(volatile int* a) {
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) a, _MM_HINT_ET0 );
+#endif
+  __asm__ __volatile__(
+      "lock incl %0"
+      : /* no output registers */
+      : "m" (a[0])
+      : "memory"
+    );
+#elif defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+  int* a_nv = const_cast<int*>(a);
+  ++(*a_nv);
+#else
+  Kokkos::atomic_fetch_add(a,int(1));
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<long long int>(volatile long long int* a) {
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) a, _MM_HINT_ET0 );
+#endif
+  __asm__ __volatile__(
+      "lock incq %0"
+      : /* no output registers */
+      : "m" (a[0])
+      : "memory"
+    );
+#elif defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+  long long int* a_nv = const_cast<long long int*>(a);
+  ++(*a_nv);
+#else
+  using T = long long int;
+  Kokkos::atomic_fetch_add(a,T(1));
+#endif
+}
+
+template<typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment(volatile T* a) {
+#if defined( KOKKOS_ENABLE_SERIAL_ATOMICS )
+  T* a_nv = const_cast<T*>(a);
+  ++(*a_nv);
+#else
+  Kokkos::atomic_fetch_add(a,T(1));
+#endif
+}
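+
+// Illustrative usage sketch (a minimal sketch, assuming Kokkos has been
+// initialized and `N` is some iteration count): incrementing a shared counter
+// from many threads.
+//
+//   Kokkos::View<int> count("count");
+//   Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int) {
+//     Kokkos::atomic_increment(&count());
+//   });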
+
+} // End of namespace Kokkos
+#endif
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f63f93060560bf43f94c7f71dde11e998becf48
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
@@ -0,0 +1,431 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_ATOMIC_VIEW_HPP
+#define KOKKOS_ATOMIC_VIEW_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Atomic.hpp>
+
+namespace Kokkos { namespace Impl {
+
+// The following tag is used to prevent an implicit constructor call when
+// trying to assign a literal int 0 ( = 0 );
+struct AtomicViewConstTag {};
+
+template<class ViewTraits>
+class AtomicDataElement {
+public:
+  typedef typename ViewTraits::value_type value_type;
+  typedef typename ViewTraits::const_value_type const_value_type;
+  typedef typename ViewTraits::non_const_value_type non_const_value_type;
+  volatile value_type* const ptr;
+
+  KOKKOS_INLINE_FUNCTION
+  AtomicDataElement(value_type* ptr_, AtomicViewConstTag ):ptr(ptr_){}
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator = (const_value_type& val) const {
+    *ptr = val;
+    return val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator = (volatile const_value_type& val) const {
+    *ptr = val;
+    return val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void inc() const {
+    Kokkos::atomic_increment(ptr);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void dec() const {
+    Kokkos::atomic_decrement(ptr);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ++ () const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,non_const_value_type(1));
+    return tmp+1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -- () const {
+    const_value_type tmp = Kokkos::atomic_fetch_sub(ptr,non_const_value_type(1));
+    return tmp-1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ++ (int) const {
+    return Kokkos::atomic_fetch_add(ptr,non_const_value_type(1));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -- (int) const {
+    return Kokkos::atomic_fetch_sub(ptr,non_const_value_type(1));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator += (const_value_type& val) const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val);
+    return tmp+val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator += (volatile const_value_type& val) const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val);
+    return tmp+val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -= (const_value_type& val) const {
+    const_value_type tmp = Kokkos::atomic_fetch_sub(ptr,val);
+    return tmp-val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -= (volatile const_value_type& val) const {
+    const_value_type tmp = Kokkos::atomic_fetch_sub(ptr,val);
+    return tmp-val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator *= (const_value_type& val) const {
+    return Kokkos::atomic_mul_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator *= (volatile const_value_type& val) const {
+    return Kokkos::atomic_mul_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator /= (const_value_type& val) const {
+    return Kokkos::atomic_div_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator /= (volatile const_value_type& val) const {
+    return Kokkos::atomic_div_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator %= (const_value_type& val) const {
+    return Kokkos::atomic_mod_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator %= (volatile const_value_type& val) const {
+    return Kokkos::atomic_mod_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator &= (const_value_type& val) const {
+    return Kokkos::atomic_and_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator &= (volatile const_value_type& val) const {
+    return Kokkos::atomic_and_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^= (const_value_type& val) const {
+    return Kokkos::atomic_xor_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^= (volatile const_value_type& val) const {
+    return Kokkos::atomic_xor_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator |= (const_value_type& val) const {
+    return Kokkos::atomic_or_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator |= (volatile const_value_type& val) const {
+    return Kokkos::atomic_or_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator <<= (const_value_type& val) const {
+    return Kokkos::atomic_lshift_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator <<= (volatile const_value_type& val) const {
+    return Kokkos::atomic_lshift_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >>= (const_value_type& val) const {
+    return Kokkos::atomic_rshift_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >>= (volatile const_value_type& val) const {
+    return Kokkos::atomic_rshift_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator + (const_value_type& val) const {
+    return *ptr+val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator + (volatile const_value_type& val) const {
+    return *ptr+val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator - (const_value_type& val) const {
+    return *ptr-val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator - (volatile const_value_type& val) const {
+    return *ptr-val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator * (const_value_type& val) const {
+    return *ptr*val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator * (volatile const_value_type& val) const {
+    return *ptr*val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator / (const_value_type& val) const {
+    return *ptr/val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator / (volatile const_value_type& val) const {
+    return *ptr/val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator % (const_value_type& val) const {
+    return *ptr%val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator % (volatile const_value_type& val) const {
+    return *ptr%val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ! () const {
+    return !*ptr;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator && (const_value_type& val) const {
+    return *ptr&&val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator && (volatile const_value_type& val) const {
+    return *ptr&&val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator || (const_value_type& val) const {
+    return *ptr||val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator || (volatile const_value_type& val) const {
+    return *ptr||val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator & (const_value_type& val) const {
+    return *ptr&val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator & (volatile const_value_type& val) const {
+    return *ptr&val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator | (const_value_type& val) const {
+    return *ptr|val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator | (volatile const_value_type& val) const {
+    return *ptr|val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^ (const_value_type& val) const {
+    return *ptr^val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^ (volatile const_value_type& val) const {
+    return *ptr^val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ~ () const {
+    return ~*ptr;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator << (const unsigned int& val) const {
+    return *ptr<<val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator << (volatile const unsigned int& val) const {
+    return *ptr<<val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >> (const unsigned int& val) const {
+    return *ptr>>val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >> (volatile const unsigned int& val) const {
+    return *ptr>>val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator == (const_value_type& val) const {
+    return *ptr == val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator == (volatile const_value_type& val) const {
+    return *ptr == val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (const_value_type& val) const {
+    return *ptr != val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (volatile const_value_type& val) const {
+    return *ptr != val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator >= (const_value_type& val) const {
+    return *ptr >= val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator >= (volatile const_value_type& val) const {
+    return *ptr >= val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator <= (const_value_type& val) const {
+    return *ptr <= val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator <= (volatile const_value_type& val) const {
+    return *ptr <= val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator < (const_value_type& val) const {
+    return *ptr < val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator < (volatile const_value_type& val) const {
+    return *ptr < val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator > (const_value_type& val) const {
+    return *ptr > val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator > (volatile const_value_type& val) const {
+    return *ptr > val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  operator const_value_type () const {
+    //return Kokkos::atomic_load(ptr);
+    return *ptr;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  operator volatile non_const_value_type () volatile const {
+    //return Kokkos::atomic_load(ptr);
+    return *ptr;
+  }
+};
+
+template<class ViewTraits>
+class AtomicViewDataHandle {
+public:
+  typename ViewTraits::value_type* ptr;
+
+  KOKKOS_INLINE_FUNCTION
+  AtomicViewDataHandle()
+    : ptr(NULL)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  AtomicViewDataHandle(typename ViewTraits::value_type* ptr_)
+    :ptr(ptr_)
+  {}
+
+  template<class iType>
+  KOKKOS_INLINE_FUNCTION
+  AtomicDataElement<ViewTraits> operator[] (const iType& i) const {
+    return AtomicDataElement<ViewTraits>(ptr+i,AtomicViewConstTag());
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  operator typename ViewTraits::value_type * () const { return ptr ; }
+
+};
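+
+// Illustrative usage sketch: a View declared with the Atomic memory trait
+// routes each element access through AtomicDataElement, so compound
+// assignments become atomic read-modify-writes (assumes `i` is a valid index):
+//
+//   Kokkos::View<double*> v("v", 100);
+//   Kokkos::View<double*, Kokkos::MemoryTraits<Kokkos::Atomic> > va = v;
+//   va(i) += 1.0;   // atomic add on v(i)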
+
+template<unsigned Size>
+struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars;
+
+template<>
+struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<4> {
+  typedef int type;
+};
+
+template<>
+struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
+  typedef int64_t type;
+};
+
+}} // namespace Kokkos::Impl
+
+#endif
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..eeec2d1f4b57fa7ac7a4b03008d921ebe7d6a1e8
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
@@ -0,0 +1,233 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_ATOMIC_WINDOWS_HPP
+#define KOKKOS_ATOMIC_WINDOWS_HPP
+
+#ifdef _WIN32
+
+#define NOMINMAX
+#include <winsock2.h>
+#include <Windows.h>
+
+namespace Kokkos {
+  namespace Impl {
+    _declspec(align(16))
+    struct cas128_t
+    {
+      LONGLONG lower;
+      LONGLONG upper;
+      KOKKOS_INLINE_FUNCTION
+        bool operator != (const cas128_t& a) const {
+        return (lower != a.lower) || upper != a.upper;
+      }
+    };
+  }
+
+  template < typename T >
+  KOKKOS_INLINE_FUNCTION
+    T atomic_compare_exchange(volatile T * const dest, const T & compare,
+    typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(LONG), const T & >::type val)
+  {
+    union U {
+      LONG i;
+      T t;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } tmp;
+
+    tmp.i = _InterlockedCompareExchange((LONG*)dest, *((LONG*)&val), *((LONG*)&compare));
+    return tmp.t;
+  }
+
+  template < typename T >
+  KOKKOS_INLINE_FUNCTION
+    T atomic_compare_exchange(volatile T * const dest, const T & compare,
+    typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(LONGLONG), const T & >::type val)
+  {
+    union U {
+      LONGLONG i;
+      T t;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } tmp;
+
+    tmp.i = _InterlockedCompareExchange64((LONGLONG*)dest, *((LONGLONG*)&val), *((LONGLONG*)&compare));
+    return tmp.t;
+  }
+
+  template < typename T >
+  KOKKOS_INLINE_FUNCTION
+    T atomic_compare_exchange(volatile T * const dest, const T & compare,
+    typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t), const T & >::type val)
+  {
+    union U {
+      Impl::cas128_t i;
+      T t;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } tmp, newval;
+    newval.t = val;
+    tmp.t = compare;
+    // _InterlockedCompareExchange128 writes the previous contents of *dest
+    // into the comparand buffer, so tmp holds the old value after the call.
+    _InterlockedCompareExchange128((LONGLONG*)dest, newval.i.upper, newval.i.lower, (LONGLONG*)&tmp.i);
+    return tmp.t;
+  }
+
+  template < typename T >
+  KOKKOS_INLINE_FUNCTION
+    T atomic_compare_exchange_strong(volatile T * const dest, const T & compare, const T & val)
+  {
+    return atomic_compare_exchange(dest,compare,val);
+  }
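+
+  // The read-modify-write operations below all follow the same
+  // compare-exchange retry loop.  A hypothetical atomic_fetch_xor for this
+  // header (shown only to illustrate the pattern, not provided here) would be:
+  //
+  //   template< typename T >
+  //   T atomic_fetch_xor(volatile T * const dest, const T val) {
+  //     T oldval = *dest;
+  //     T assume;
+  //     do {
+  //       assume = oldval;
+  //       oldval = atomic_compare_exchange(dest, assume, T(assume ^ val));
+  //     } while (assume != oldval);
+  //     return oldval;
+  //   }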
+
+  template< typename T >
+  T atomic_fetch_or(volatile T * const dest, const T val) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      T newval = val | oldval;
+      oldval = atomic_compare_exchange(dest, assume, newval);
+    } while (assume != oldval);
+
+    return oldval;
+  }
+
+  template< typename T >
+  T atomic_fetch_and(volatile T * const dest, const T val) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      T newval = val & oldval;
+      oldval = atomic_compare_exchange(dest, assume, newval);
+    } while (assume != oldval);
+
+    return oldval;
+  }
+
+  template< typename T >
+  T atomic_fetch_add(volatile T * const dest, const T val) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      T newval = val + oldval;
+      oldval = atomic_compare_exchange(dest, assume, newval);
+    } while (assume != oldval);
+
+    return oldval;
+  }
+
+  template< typename T >
+  T atomic_fetch_sub(volatile T * const dest, const T val) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      T newval = assume - val;
+      oldval = atomic_compare_exchange(dest, assume, newval);
+    } while (assume != oldval);
+
+    return oldval;
+  }
+
+  template< typename T >
+  T atomic_exchange(volatile T * const dest, const T val) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      oldval = atomic_compare_exchange(dest, assume, val);
+    } while (assume != oldval);
+
+    return oldval;
+  }
+
+  template< typename T >
+  void atomic_or(volatile T * const dest, const T val) {
+    atomic_fetch_or(dest, val);
+  }
+
+  template< typename T >
+  void atomic_and(volatile T * const dest, const T val) {
+    atomic_fetch_and(dest, val);
+  }
+
+  template< typename T >
+  void atomic_add(volatile T * const dest, const T val) {
+    atomic_fetch_add(dest, val);
+  }
+
+  template< typename T >
+  void atomic_sub(volatile T * const dest, const T val) {
+    atomic_fetch_sub(dest, val);
+  }
+
+  template< typename T >
+  void atomic_assign(volatile T * const dest, const T val) {
+    atomic_exchange(dest, val);
+  }
+
+  template< typename T >
+  T atomic_increment(volatile T * const dest) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      T newval = assume + 1;
+      oldval = atomic_compare_exchange(dest, assume, newval);
+    } while (assume != oldval);
+    return oldval;
+  }
+
+  template< typename T >
+  T atomic_decrement(volatile T * const dest) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      T newval = assume - 1;
+      oldval = atomic_compare_exchange(dest, assume, newval);
+    } while (assume != oldval);
+    return oldval;
+  }
+
+}
+#endif
+#endif
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_BitOps.hpp b/packages/kokkos/core/src/impl/Kokkos_BitOps.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8f461baea9d71ac1e728b752d5105a7ad1a3f3d3
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_BitOps.hpp
@@ -0,0 +1,180 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BITOPS_HPP
+#define KOKKOS_BITOPS_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <cstdint>
+#include <climits>
+
+#ifdef KOKKOS_COMPILER_INTEL
+#include<immintrin.h>
+#endif
+
+#if defined( __HCC_ACCELERATOR__ )
+#include <hc.hpp>
+#endif
+
+namespace Kokkos {
+namespace Impl {
+
+/**\brief  Find first zero bit.
+ *
+ *  If none then return -1 ;
+ */
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_first_zero( unsigned i ) noexcept
+{
+  enum : unsigned { full = ~0u };
+
+#if defined( __CUDA_ARCH__ )
+  return full != i ? __ffs( ~i ) - 1 : -1 ;
+#elif defined( __HCC_ACCELERATOR__ )
+  return full != i ? (int)hc::__firstbit_u32_u32(~i) : -1 ;
+#elif defined( KOKKOS_COMPILER_INTEL )
+  return full != i ? _bit_scan_forward( ~i ) : -1 ;
+#elif defined( KOKKOS_COMPILER_IBM )
+  return full != i ? __cnttz4( ~i ) : -1 ;
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return full != i ? _popcnt( i ^ (i+1) ) - 1 : -1 ;
+#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
+  return full != i ? __builtin_ffs( ~i ) - 1 : -1 ;
+#else
+  int offset = -1 ;
+  if ( full != i ) {
+    for ( offset = 0 ; i & ( 1 << offset ) ; ++offset );
+  }
+  return offset ;
+#endif
+}
+
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_scan_forward( unsigned i )
+{
+#if defined( __CUDA_ARCH__ )
+  return __ffs(i) - 1;
+#elif defined( __HCC_ACCELERATOR__ )
+  return  (int)hc::__firstbit_u32_u32(i);
+#elif defined( KOKKOS_COMPILER_INTEL )
+  return _bit_scan_forward(i);
+#elif defined( KOKKOS_COMPILER_IBM )
+  return __cnttz4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return i ? _popcnt(~i & (i-1)) : -1;
+#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
+  return __builtin_ffs(i) - 1;
+#else
+  int offset = -1;
+  if ( i ) {
+    for ( offset = 0 ; (i & ( 1 << offset ) ) == 0 ; ++offset );
+  }
+  return offset;
+#endif
+}
+
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_scan_reverse( unsigned i )
+{
+  enum { shift = static_cast<int>( sizeof(unsigned) * CHAR_BIT - 1 ) };
+#if defined( __CUDA_ARCH__ )
+  return shift - __clz(i);
+#elif defined( __HCC_ACCELERATOR__ )
+  return  (int)hc::__firstbit_u32_u32(i);
+#elif defined( KOKKOS_COMPILER_INTEL )
+  return _bit_scan_reverse(i);
+#elif defined( KOKKOS_COMPILER_IBM )
+  return shift - __cntlz4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return i ? shift - _leadz32(i) : 0 ;
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return shift - __builtin_clz(i);
+#else
+  int offset = 0;
+  if ( i ) {
+    for ( offset = shift ; (i & ( 1 << offset ) ) == 0 ; --offset );
+  }
+  return offset;
+#endif
+}
+
+/// Count the number of bits set.
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_count( unsigned i )
+{
+#if defined( __CUDA_ARCH__ )
+  return __popc(i);
+#elif defined( __HCC_ACCELERATOR__ )
+  return  (int)hc::__popcount_u32_b32(i);
+#elif defined ( __INTEL_COMPILER )
+  return _popcnt32(i);
+#elif defined( KOKKOS_COMPILER_IBM )
+  return __popcnt4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return _popcnt(i);
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return __builtin_popcount(i);
+#else
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
+  i = i - ( ( i >> 1 ) & ~0u / 3u );                             // temp
+  i = ( i & ~0u / 15u * 3u ) + ( ( i >> 2 ) & ~0u / 15u * 3u );  // temp
+  i = ( i + ( i >> 4 ) ) & ~0u / 255u * 15u;                     // temp
+
+  // count
+  return (int)( ( i * ( ~0u / 255u ) ) >> ( sizeof(unsigned) - 1 ) * CHAR_BIT );
+#endif
+}
+
+KOKKOS_INLINE_FUNCTION
+unsigned integral_power_of_two_that_contains( const unsigned N )
+{
+  const unsigned i = Kokkos::Impl::bit_scan_reverse( N );
+  return ( (1u << i) < N ) ? i + 1 : i ;
+}
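+
+// Worked examples (illustrative only):
+//   bit_first_zero(0x7u)                     == 3   // lowest clear bit of 0b0111
+//   bit_scan_forward(20u)                    == 2   // lowest set bit of 0b10100
+//   bit_scan_reverse(20u)                    == 4   // highest set bit
+//   bit_count(20u)                           == 2
+//   integral_power_of_two_that_contains(20u) == 5   // 2^5 = 32 >= 20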
+
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif // KOKKOS_BITOPS_HPP
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp b/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7b16a8a998ac385dd40c93174c7e017a8b855b40
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp
@@ -0,0 +1,125 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#else
+#include <unistd.h>
+#endif
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cerrno>
+
+namespace Kokkos {
+namespace Impl {
+
+//The following function (processors_per_node) is copied from here:
+// https://lists.gnu.org/archive/html/autoconf/2002-08/msg00126.html
+// Philip Willoughby
+
+int processors_per_node() {
+  int nprocs = -1;
+  int nprocs_max = -1;
+#ifdef _WIN32
+#ifndef _SC_NPROCESSORS_ONLN
+SYSTEM_INFO info;
+GetSystemInfo(&info);
+#define sysconf(a) info.dwNumberOfProcessors
+#define _SC_NPROCESSORS_ONLN
+#endif
+#endif
+#ifdef _SC_NPROCESSORS_ONLN
+  nprocs = sysconf(_SC_NPROCESSORS_ONLN);
+  if (nprocs < 1)
+  {
+    return -1;
+  }
+  nprocs_max = sysconf(_SC_NPROCESSORS_CONF);
+  if (nprocs_max < 1)
+  {
+    return -1;
+  }
+  return nprocs;
+#else
+  return -1;
+#endif
+}
+
+int mpi_ranks_per_node() {
+  char *str;
+  int ppn = 1;
+  //if ((str = getenv("SLURM_TASKS_PER_NODE"))) {
+  //  ppn = atoi(str);
+  //  if(ppn<=0) ppn = 1;
+  //}
+  if ((str = getenv("MV2_COMM_WORLD_LOCAL_SIZE"))) {
+    ppn = atoi(str);
+    if(ppn<=0) ppn = 1;
+  }
+  if ((str = getenv("OMPI_COMM_WORLD_LOCAL_SIZE"))) {
+    ppn = atoi(str);
+    if(ppn<=0) ppn = 1;
+  }
+  return ppn;
+}
+
+int mpi_local_rank_on_node() {
+  char *str;
+  int local_rank=0;
+  //if ((str = getenv("SLURM_LOCALID"))) {
+  //  local_rank = atoi(str);
+  //}
+  if ((str = getenv("MV2_COMM_WORLD_LOCAL_RANK"))) {
+    local_rank = atoi(str);
+  }
+  if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) {
+    local_rank = atoi(str);
+  }
+  return local_rank;
+}
+
+}
+}
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.hpp b/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2b200b057dbc2bc65ea3b7c2e6721f9594f73751
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_CPUDiscovery.hpp
@@ -0,0 +1,51 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+namespace Kokkos {
+namespace Impl {
+
+int processors_per_node();
+int mpi_ranks_per_node();
+int mpi_local_rank_on_node();
+
+}
+}
diff --git a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..68ca3b48f61723a0f6c05d5036e04d97f2dcb3b7
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
@@ -0,0 +1,110 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CLOCKTIC_HPP
+#define KOKKOS_CLOCKTIC_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <stdint.h>
+#include <chrono>
+
+namespace Kokkos {
+namespace Impl {
+
+/**\brief  Quick query of clock register tics
+ *
+ *  Primary use case is to, with low overhead,
+ *  obtain an integral value that consistently varies
+ *  across concurrent threads of execution within
+ *  a parallel algorithm.
+ *  This value is often used to "randomly" seed an
+ *  attempt to acquire an indexed resource (e.g., bit)
+ *  from an array of resources (e.g., bitset) such that
+ *  concurrent threads will have high likelihood of
+ *  having different index-seed values.
+ */
+KOKKOS_FORCEINLINE_FUNCTION
+uint64_t clock_tic(void) noexcept
+{
+#if defined( __CUDA_ARCH__ )
+
+  // Return value of 64-bit hi-res clock register.
+
+  return clock64();
+
+#elif defined(__HCC_ACCELERATOR__)
+    // Get clock register
+    return hc::__clock_u64();
+
+#elif defined( __i386__ ) || defined( __x86_64 )
+
+  // Return value of 64-bit hi-res clock register.
+
+  unsigned a = 0, d = 0;
+
+  __asm__ volatile( "rdtsc" : "=a" (a), "=d" (d) );
+
+  return ( (uint64_t) a ) | ( ( (uint64_t) d ) << 32 );
+
+#elif defined( __powerpc )     || defined( __powerpc__ ) || \
+      defined( __powerpc64__ ) || defined( __POWERPC__ ) || \
+      defined( __ppc__ )       || defined( __ppc64__ )
+
+  unsigned int cycles = 0;
+
+  asm volatile( "mftb %0" : "=r" (cycles) );
+
+  return (uint64_t) cycles;
+
+#else
+
+  return (uint64_t)
+    std::chrono::high_resolution_clock::now().time_since_epoch().count();
+
+#endif
+}
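+
+// Illustrative usage sketch (a minimal sketch, assuming `capacity` is the size
+// of some resource pool): deriving a per-thread starting index, as recommended
+// for the concurrent bitset acquire hint:
+//
+//   const uint32_t hint = (uint32_t)( Kokkos::Impl::clock_tic() % capacity );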
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif // KOKKOS_CLOCKTIC_HPP
diff --git a/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp b/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ca76c2ff72023a8112b1cfb9812fe977b857912d
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp
@@ -0,0 +1,357 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CONCURRENTBITSET_HPP
+#define KOKKOS_CONCURRENTBITSET_HPP
+
+#include <stdint.h>
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_BitOps.hpp>
+#include <impl/Kokkos_ClockTic.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+struct concurrent_bitset {
+public:
+
+  // 32 bits per integer value
+
+  enum : uint32_t { bits_per_int_lg2  = 5 };
+  enum : uint32_t { bits_per_int_mask = ( 1 << bits_per_int_lg2 ) - 1 };
+
+  // Buffer is uint32_t[ buffer_bound ]
+  //   [ uint32_t { state_header | used_count } , uint32_t bits[*] ]
+  //
+  //  Maximum bit count is 33 million (1u<<25):
+  //
+  //  - Maximum bit set size occupies 1 Mbyte
+  //
+  //  - State header can occupy bits [30-26]
+  //    which can be the bit_count_lg2
+  //
+  //  - Accept at least 33 million concurrent calls to 'acquire'
+  //    before risking an overflow race condition on a full bitset.
+
+  enum : uint32_t { max_bit_count_lg2 = 25 };
+  enum : uint32_t { max_bit_count     = 1u << max_bit_count_lg2 };
+  enum : uint32_t { state_shift = 26 };
+  enum : uint32_t { state_used_mask   = ( 1 << state_shift ) - 1 };
+  enum : uint32_t { state_header_mask = uint32_t(0x001f) << state_shift };
+
+  KOKKOS_INLINE_FUNCTION static constexpr
+  uint32_t buffer_bound_lg2( uint32_t const bit_bound_lg2 ) noexcept
+    {
+      return bit_bound_lg2 <= max_bit_count_lg2
+           ? 1 + ( 1u << ( bit_bound_lg2 > bits_per_int_lg2
+                         ? bit_bound_lg2 - bits_per_int_lg2 : 0 ) )
+           : 0 ;
+    }
+
+  /**\brief  Initialize bitset buffer */
+  KOKKOS_INLINE_FUNCTION static constexpr
+  uint32_t buffer_bound( uint32_t const bit_bound ) noexcept
+    {
+      return bit_bound <= max_bit_count
+           ? 1 + ( bit_bound >> bits_per_int_lg2 ) +
+             ( bit_bound & bits_per_int_mask ? 1 : 0 )
+           : 0 ;
+    }
+
+  /**\brief  Claim any bit within the bitset bound.
+   *
+   *  Return : ( which_bit , bit_count )
+   *
+   *  if success then
+   *    bit_count is the atomic-count of claimed > 0
+   *    which_bit is the claimed bit >= 0
+   *  else if attempt failed due to filled buffer
+   *    bit_count == which_bit == -1
+   *  else if attempt failed due to non-matching state_header
+   *    bit_count == which_bit == -2
+   *  else if attempt failed due to max_bit_count_lg2 < bit_bound_lg2
+   *                             or invalid state_header
+   *                             or (1u << bit_bound_lg2) <= bit
+   *    bit_count == which_bit == -3
+   *  endif
+   *
+   *  Recommended to have hint
+   *    bit = Kokkos::Impl::clock_tic() & ((1u<<bit_bound_lg2) - 1)
+   */
+  KOKKOS_INLINE_FUNCTION static
+  Kokkos::pair<int,int>
+  acquire_bounded_lg2( uint32_t volatile * const buffer
+                     , uint32_t const bit_bound_lg2
+                     , uint32_t bit = 0                /* optional hint */
+                     , uint32_t const state_header = 0 /* optional header */
+                     ) noexcept
+    {
+      typedef Kokkos::pair<int,int> type ;
+
+      const uint32_t bit_bound  = 1 << bit_bound_lg2 ;
+      const uint32_t word_count = bit_bound >> bits_per_int_lg2 ;
+
+      if ( ( max_bit_count_lg2 < bit_bound_lg2 ) ||
+           ( state_header & ~state_header_mask ) ||
+           ( bit_bound <= bit ) ) {
+        return type(-3,-3);
+      }
+
+      // Use potentially two fetch_adds to avoid a CAS loop.
+      // Could generate a "racing" failure-to-acquire
+      // when the set is full at the atomic_fetch_add(+1)
+      // and a release occurs before the atomic_fetch_add(-1).
+
+      const uint32_t state = (uint32_t)
+        Kokkos::atomic_fetch_add( (volatile int *) buffer , 1 );
+
+      const uint32_t state_error =
+        state_header != ( state & state_header_mask );
+
+      const uint32_t state_bit_used = state & state_used_mask ;
+
+      if ( state_error || ( bit_bound <= state_bit_used ) ) {
+        Kokkos::atomic_fetch_add( (volatile int *) buffer , -1 );
+        return state_error ? type(-2,-2) : type(-1,-1);
+      }
+
+      // Do not update bit until count is visible:
+
+      Kokkos::memory_fence();
+
+      // There is a zero bit available somewhere,
+      // now find the (first) available bit and set it.
+
+      while(1) {
+
+        const uint32_t word = bit >> bits_per_int_lg2 ;
+        const uint32_t mask = 1u << ( bit & bits_per_int_mask );
+        const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask);
+
+        if ( ! ( prev & mask ) ) {
+          // Successfully claimed 'bit' by
+          // atomically setting that bit.
+          return type( bit , state_bit_used + 1 );
+        }
+
+        // Failed race to set the selected bit
+        // Find a new bit to try.
+
+        const int j = Kokkos::Impl::bit_first_zero( prev );
+
+        if ( 0 <= j ) {
+          bit = ( word << bits_per_int_lg2 ) | uint32_t(j);
+        }
+        else {
+          bit =
+            ( (word+1) < word_count ? ((word+1) << bits_per_int_lg2) : 0 )
+            | ( bit & bits_per_int_mask );
+        }
+      }
+    }
+
+  /**\brief  Claim any bit within the bitset bound.
+   *
+   *  Return : ( which_bit , bit_count )
+   *
+   *  if success then
+   *    bit_count is the atomic count of claimed bits, > 0
+   *    which_bit is the claimed bit, >= 0
+   *  else if the attempt failed because the bitset is full
+   *    bit_count == which_bit == -1
+   *  else if the attempt failed due to a non-matching state_header
+   *    bit_count == which_bit == -2
+   *  else if the attempt failed because max_bit_count < bit_bound
+   *                             or the state_header is invalid
+   *                             or bit_bound <= bit
+   *    bit_count == which_bit == -3
+   *  endif
+   *
+   *  Recommended to have hint
+   *    bit = Kokkos::Impl::clock_tic() % bit_bound
+   */
+  KOKKOS_INLINE_FUNCTION static
+  Kokkos::pair<int,int>
+  acquire_bounded( uint32_t volatile * const buffer
+                 , uint32_t const bit_bound
+                 , uint32_t bit = 0                /* optional hint */
+                 , uint32_t const state_header = 0 /* optional header */
+                 ) noexcept
+    {
+      typedef Kokkos::pair<int,int> type ;
+
+      if ( ( max_bit_count < bit_bound ) ||
+           ( state_header & ~state_header_mask ) ||
+           ( bit_bound <= bit ) ) {
+        return type(-3,-3);
+      }
+
+      const uint32_t word_count = bit_bound >> bits_per_int_lg2 ;
+
+      // Use potentially two fetch_add operations to avoid a CAS loop.
+      // Could generate a "racing" failure-to-acquire
+      // when the bitset is full at the atomic_fetch_add(+1)
+      // and a release occurs before the atomic_fetch_add(-1).
+
+      const uint32_t state = (uint32_t)
+        Kokkos::atomic_fetch_add( (volatile int *) buffer , 1 );
+
+      const uint32_t state_error =
+        state_header != ( state & state_header_mask );
+
+      const uint32_t state_bit_used = state & state_used_mask ;
+
+      if ( state_error || ( bit_bound <= state_bit_used ) ) {
+        Kokkos::atomic_fetch_add( (volatile int *) buffer , -1 );
+        return state_error ? type(-2,-2) : type(-1,-1);
+      }
+
+      // Do not update bit until count is visible:
+
+      Kokkos::memory_fence();
+
+      // There is a zero bit available somewhere,
+      // now find the (first) available bit and set it.
+
+      while(1) {
+
+        const uint32_t word = bit >> bits_per_int_lg2 ;
+        const uint32_t mask = 1u << ( bit & bits_per_int_mask );
+        const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask);
+
+        if ( ! ( prev & mask ) ) {
+          // Successfully claimed 'bit' by
+          // atomically setting that bit.
+          return type( bit , state_bit_used + 1 );
+        }
+
+        // Failed race to set the selected bit
+        // Find a new bit to try.
+
+        const int j = Kokkos::Impl::bit_first_zero( prev );
+
+        if ( 0 <= j ) {
+          bit = (word << bits_per_int_lg2 ) | uint32_t(j);
+        }
+
+        if ( ( j < 0 ) || ( bit_bound <= bit ) ) {
+          bit =
+            ( (word+1) < word_count ? ((word+1) << bits_per_int_lg2) : 0 )
+            | ( bit & bits_per_int_mask );
+        }
+      }
+    }
+
+  /**\brief  Release a previously acquired bit.
+   *
+   *  Requires: 'bit' previously acquired and has not yet been released.
+   *
+   *  Returns:
+   *    0 <= used count after successful release
+   *    -1 bit was already released
+   *    -2 state_header error
+   */
+  KOKKOS_INLINE_FUNCTION static
+  int release( uint32_t volatile * const buffer
+             , uint32_t const bit
+             , uint32_t const state_header = 0 /* optional header */
+             ) noexcept
+    {
+      if ( state_header != ( state_header_mask & *buffer ) ) { return -2 ; }
+
+      const uint32_t mask = 1u << ( bit & bits_per_int_mask );
+      const uint32_t prev =
+        Kokkos::atomic_fetch_and( buffer + ( bit >> bits_per_int_lg2 ) + 1
+                                , ~mask
+                                );
+
+      if ( ! ( prev & mask ) ) { return -1 ; }
+
+      // Do not update count until bit clear is visible
+      Kokkos::memory_fence();
+
+      const int count =
+        Kokkos::atomic_fetch_add( (volatile int *) buffer , -1 );
+
+      return ( count & state_used_mask ) - 1 ;
+    }
+
+  /**\brief  Claim a specific bit within the bitset.
+   *
+   *  Requires: Bit within bounds and not already set.
+   *
+   *  Returns:
+   *    0 <= used count after successfully setting the bit
+   *    -1 bit was already set
+   *    -2 bit or state_header error
+   */
+  KOKKOS_INLINE_FUNCTION static
+  int set( uint32_t volatile * const buffer
+         , uint32_t const bit
+         , uint32_t const state_header = 0 /* optional header */
+         ) noexcept
+    {
+      if ( state_header != ( state_header_mask & *buffer ) ) { return -2 ; }
+
+      const uint32_t mask = 1u << ( bit & bits_per_int_mask );
+      const uint32_t prev =
+        Kokkos::atomic_fetch_or( buffer + ( bit >> bits_per_int_lg2 ) + 1
+                               , mask
+                               );
+
+      if ( ! ( prev & mask ) ) { return -1 ; }
+
+      // Do not update count until the bit set is visible
+      Kokkos::memory_fence();
+
+      const int count =
+        Kokkos::atomic_fetch_add( (volatile int *) buffer , -1 );
+
+      return ( count & state_used_mask ) - 1 ;
+    }
+};
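+
+// A minimal usage sketch of the interface above (illustrative only, not part
+// of the implementation).  The buffer is sized with buffer_bound(), a bit is
+// claimed with acquire_bounded() using the recommended clock_tic() hint, and
+// returned with release():
+//
+//   enum { N = 128 };
+//   uint32_t buffer[ concurrent_bitset::buffer_bound( N ) ] = { 0 };
+//
+//   Kokkos::pair<int,int> result =
+//     concurrent_bitset::acquire_bounded( buffer , N
+//                                       , Kokkos::Impl::clock_tic() % N );
+//
+//   if ( 0 <= result.first ) {
+//     /* ... use bit 'result.first' ... */
+//     concurrent_bitset::release( buffer , result.first );
+//   }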
+
+}} // namespace Kokkos::Impl
+
+#endif /* #ifndef KOKKOS_CONCURRENTBITSET_HPP */
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Core.cpp b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a0ef8dcacb2c58bfeb002379fa02cfbb1a8890ad
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
@@ -0,0 +1,839 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <cctype>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <cstdlib>
+#include <stack>
+
+//----------------------------------------------------------------------------
+
+namespace {
+bool g_is_initialized = false;
+bool g_show_warnings = true;
+std::stack<std::function<void()> > finalize_hooks;
+}
+
+namespace Kokkos { namespace Impl { namespace {
+
+bool is_unsigned_int(const char* str)
+{
+  const size_t len = strlen (str);
+  for (size_t i = 0; i < len; ++i) {
+    if (! isdigit (str[i])) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void initialize_internal(const InitArguments& args)
+{
+// This is an experimental setting.
+// For KNL in Flat mode this variable should be set so that
+// memkind allocates high-bandwidth memory correctly.
+#ifdef KOKKOS_ENABLE_HBWSPACE
+setenv("MEMKIND_HBW_NODES", "1", 0);
+#endif
+
+  if (args.disable_warnings) {
+    g_show_warnings = false;
+  }
+
+  // Protect declarations, to prevent "unused variable" warnings.
+#if defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS ) || defined( KOKKOS_ENABLE_OPENMPTARGET )
+  const int num_threads = args.num_threads;
+  const int use_numa = args.num_numa;
+#endif // defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS ) || defined( KOKKOS_ENABLE_OPENMPTARGET )
+#if defined( KOKKOS_ENABLE_CUDA ) || defined( KOKKOS_ENABLE_ROCM )
+  const int use_gpu = args.device_id;
+#endif // defined( KOKKOS_ENABLE_CUDA ) || defined( KOKKOS_ENABLE_ROCM )
+
+#if defined( KOKKOS_ENABLE_OPENMP )
+  if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
+      std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
+    if(use_numa>0) {
+      Kokkos::OpenMP::initialize(num_threads,use_numa);
+    }
+    else {
+      Kokkos::OpenMP::initialize(num_threads);
+    }
+  }
+  else {
+    //std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ;
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_THREADS )
+  if( std::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
+      std::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
+    if(num_threads>0) {
+      if(use_numa>0) {
+        Kokkos::Threads::initialize(num_threads,use_numa);
+      }
+      else {
+        Kokkos::Threads::initialize(num_threads);
+      }
+    } else {
+      Kokkos::Threads::initialize();
+    }
+    //std::cout << "Kokkos::initialize() fyi: Pthread enabled and initialized" << std::endl ;
+  }
+  else {
+    //std::cout << "Kokkos::initialize() fyi: Pthread enabled but not initialized" << std::endl ;
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_SERIAL )
+  // Prevent "unused variable" warning for 'args' input struct.  If
+  // Serial::initialize() ever needs to take arguments from the input
+  // struct, you may remove this line of code.
+  (void) args;
+
+  // Always initialize Serial if it is configure time enabled
+  Kokkos::Serial::initialize();
+#endif
+
+#if defined( KOKKOS_ENABLE_OPENMPTARGET )
+  if( Impl::is_same< Kokkos::Experimental::OpenMPTarget , Kokkos::DefaultExecutionSpace >::value ) {
+    if(num_threads>0) {
+      if(use_numa>0) {
+        Kokkos::Experimental::OpenMPTarget::initialize(num_threads,use_numa);
+      }
+      else {
+        Kokkos::Experimental::OpenMPTarget::initialize(num_threads);
+      }
+    } else {
+      Kokkos::Experimental::OpenMPTarget::initialize();
+    }
+    //std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ;
+  }
+  else {
+    //std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ;
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_CUDA )
+  if( std::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || 0 < use_gpu ) {
+    if (use_gpu > -1) {
+      Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( use_gpu ) );
+    }
+    else {
+      Kokkos::Cuda::initialize();
+    }
+    //std::cout << "Kokkos::initialize() fyi: Cuda enabled and initialized" << std::endl ;
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_ROCM )
+  if( std::is_same< Kokkos::Experimental::ROCm , Kokkos::DefaultExecutionSpace >::value || 0 < use_gpu ) {
+    if (use_gpu > -1) {
+      Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice( use_gpu ) );
+    }
+    else {
+      Kokkos::Experimental::ROCm::initialize();
+    }
+    std::cout << "Kokkos::initialize() fyi: ROCm enabled and initialized" << std::endl ;
+  }
+#endif
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+#endif
+    g_is_initialized = true;
+}
+
+void finalize_internal( const bool all_spaces = false )
+{
+
+  typename decltype(finalize_hooks)::size_type  numSuccessfulCalls = 0;
+  while(! finalize_hooks.empty()) {
+    auto f = finalize_hooks.top();
+    try {
+      f();
+    }
+    catch(...) {
+      std::cerr << "Kokkos::finalize: A finalize hook (set via "
+        "Kokkos::push_finalize_hook) threw an exception that it did not catch."
+        "  Per std::atexit rules, this results in std::terminate.  This is "
+        "finalize hook number " << numSuccessfulCalls << " (1-based indexing) "
+        "out of " << finalize_hooks.size() << " to call.  Remember that "
+        "Kokkos::finalize calls finalize hooks in reverse order from how they "
+        "were pushed." << std::endl;
+      std::terminate();
+    }
+    finalize_hooks.pop();
+    ++numSuccessfulCalls;
+  }
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+#endif
+
+#if defined( KOKKOS_ENABLE_CUDA )
+  if( std::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
+    if(Kokkos::Cuda::is_initialized())
+      Kokkos::Cuda::finalize();
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_ROCM )
+  if( std::is_same< Kokkos::Experimental::ROCm , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
+    if(Kokkos::Experimental::ROCm::is_initialized())
+      Kokkos::Experimental::ROCm::finalize();
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_OPENMPTARGET )
+  if( std::is_same< Kokkos::Experimental::OpenMPTarget , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
+    if(Kokkos::Experimental::OpenMPTarget::is_initialized())
+      Kokkos::Experimental::OpenMPTarget::finalize();
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_OPENMP )
+  if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
+      std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ||
+      all_spaces ) {
+    if(Kokkos::OpenMP::is_initialized())
+      Kokkos::OpenMP::finalize();
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_THREADS )
+  if( std::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
+      std::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ||
+      all_spaces ) {
+    if(Kokkos::Threads::is_initialized())
+      Kokkos::Threads::finalize();
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_SERIAL )
+  if(Kokkos::Serial::is_initialized())
+    Kokkos::Serial::finalize();
+#endif
+
+  g_is_initialized = false;
+  g_show_warnings = true;
+}
+
+void fence_internal()
+{
+
+#if defined( KOKKOS_ENABLE_CUDA )
+  if( std::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value ) {
+    Kokkos::Cuda::fence();
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_ROCM )
+  if( std::is_same< Kokkos::Experimental::ROCm , Kokkos::DefaultExecutionSpace >::value ) {
+    Kokkos::Experimental::ROCm::fence();
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_OPENMP )
+  if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
+      std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
+    Kokkos::OpenMP::fence();
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_THREADS )
+  if( std::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
+      std::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
+    Kokkos::Threads::fence();
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_SERIAL )
+  if( std::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||
+      std::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ) {
+    Kokkos::Serial::fence();
+  }
+#endif
+
+}
+
+bool check_arg(char const* arg, char const* expected) {
+  std::size_t arg_len = std::strlen(arg);
+  std::size_t exp_len = std::strlen(expected);
+  if (arg_len < exp_len) return false;
+  if (std::strncmp(arg, expected, exp_len) != 0) return false;
+  if (arg_len == exp_len) return true;
+  /* if expected is "--threads", ignore "--threads-for-application"
+     by checking this character          ---------^
+     to see if it continues to make a longer name */
+  if (std::isalnum(arg[exp_len]) || arg[exp_len] == '-' || arg[exp_len] == '_') {
+    return false;
+  }
+  return true;
+}
+
+bool check_int_arg(char const* arg, char const* expected, int* value) {
+  if (!check_arg(arg, expected)) return false;
+  std::size_t arg_len = std::strlen(arg);
+  std::size_t exp_len = std::strlen(expected);
+  bool okay = true;
+  if (arg_len == exp_len || arg[exp_len] != '=') okay = false;
+  char const* number = arg + exp_len + 1;
+  if (!Impl::is_unsigned_int(number) || strlen(number) == 0) okay = false;
+  *value = std::atoi(number);
+  if (!okay) {
+    std::ostringstream ss;
+    ss << "Error: expecting an '=INT' after command line argument '" << expected << "'";
+    ss << ". Raised by Kokkos::initialize(int narg, char* argc[]).";
+    Impl::throw_runtime_exception( ss.str() );
+  }
+  return true;
+}
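+
+// Illustrative examples of the matching rules implemented above (inputs are
+// hypothetical, not part of the library):
+//
+//   check_arg("--threads=4", "--threads")                -> true  ('=' terminates the name)
+//   check_arg("--threads-for-application", "--threads")  -> false (the name continues)
+//
+//   int n = 0;
+//   check_int_arg("--kokkos-threads=8", "--kokkos-threads", &n);  // returns true, n == 8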
+
+}}} // namespace Kokkos::Impl::{unnamed}
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+void initialize(int& narg, char* arg[])
+{
+    int num_threads = -1;
+    int numa = -1;
+    int device = -1;
+    bool disable_warnings = false;
+
+    int kokkos_threads_found = 0;
+    int kokkos_numa_found = 0;
+    int kokkos_device_found = 0;
+    int kokkos_ndevices_found = 0;
+
+    int iarg = 0;
+
+    while (iarg < narg) {
+      if (Impl::check_int_arg(arg[iarg], "--kokkos-threads", &num_threads)) {
+        for(int k=iarg;k<narg-1;k++) {
+          arg[k] = arg[k+1];
+        }
+        kokkos_threads_found=1;
+        narg--;
+      } else if (!kokkos_threads_found && Impl::check_int_arg(arg[iarg], "--threads", &num_threads)) {
+        iarg++;
+      } else if (Impl::check_int_arg(arg[iarg], "--kokkos-numa", &numa)) {
+        for(int k=iarg;k<narg-1;k++) {
+          arg[k] = arg[k+1];
+        }
+        kokkos_numa_found=1;
+        narg--;
+      } else if (!kokkos_numa_found && Impl::check_int_arg(arg[iarg], "--numa", &numa)) {
+        iarg++;
+      } else if (Impl::check_int_arg(arg[iarg], "--kokkos-device", &device)) {
+        for(int k=iarg;k<narg-1;k++) {
+          arg[k] = arg[k+1];
+        }
+        kokkos_device_found=1;
+        narg--;
+      } else if (!kokkos_device_found && Impl::check_int_arg(arg[iarg], "--device", &device)) {
+        iarg++;
+      } else if (Impl::check_arg(arg[iarg], "--kokkos-ndevices") || Impl::check_arg(arg[iarg], "--ndevices")) {
+
+        //Find the number of devices (expecting --ndevices=XX or --kokkos-ndevices=XX)
+        if (!((strncmp(arg[iarg],"--kokkos-ndevices=",18) == 0) || (strncmp(arg[iarg],"--ndevices=",11) == 0)))
+          Impl::throw_runtime_exception("Error: expecting an '=INT[,INT]' after command line argument '--ndevices/--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+        int ndevices=-1;
+        int skip_device = 9999;
+
+        char* num1 = strchr(arg[iarg],'=')+1;
+        char* num2 = strpbrk(num1,",");
+        int num1_len = num2==NULL?strlen(num1):num2-num1;
+        char* num1_only = new char[num1_len+1];
+        strncpy(num1_only,num1,num1_len);
+        num1_only[num1_len]=0;
+
+        if(!Impl::is_unsigned_int(num1_only) || (strlen(num1_only)==0)) {
+          Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+        }
+        if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found)
+          ndevices = atoi(num1_only);
+        delete [] num1_only;
+
+        if( num2 != NULL ) {
+          if(( !Impl::is_unsigned_int(num2+1) ) || (strlen(num2)==1) )
+            Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices=XX,'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+          if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found)
+            skip_device = atoi(num2+1);
+        }
+
+        if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found) {
+          char *str;
+          //if ((str = getenv("SLURM_LOCALID"))) {
+          //  int local_rank = atoi(str);
+          //  device = local_rank % ndevices;
+          //  if (device >= skip_device) device++;
+          //}
+          if ((str = getenv("MV2_COMM_WORLD_LOCAL_RANK"))) {
+            int local_rank = atoi(str);
+            device = local_rank % ndevices;
+            if (device >= skip_device) device++;
+          }
+          if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) {
+            int local_rank = atoi(str);
+            device = local_rank % ndevices;
+            if (device >= skip_device) device++;
+          }
+          if(device==-1) {
+            device = 0;
+            if (device >= skip_device) device++;
+          }
+        }
+
+        //Remove the --kokkos-ndevices argument from the list but leave --ndevices
+        if(strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) {
+          for(int k=iarg;k<narg-1;k++) {
+            arg[k] = arg[k+1];
+          }
+          kokkos_ndevices_found=1;
+          narg--;
+        } else {
+          iarg++;
+        }
+      } else if ( strcmp(arg[iarg],"--kokkos-disable-warnings") == 0) {
+        disable_warnings = true;
+        for(int k=iarg;k<narg-1;k++) {
+          arg[k] = arg[k+1];
+        }
+        narg--;
+      } else if ((strcmp(arg[iarg],"--kokkos-help") == 0) || (strcmp(arg[iarg],"--help") == 0)) {
+         std::cout << std::endl;
+         std::cout << "--------------------------------------------------------------------------------" << std::endl;
+         std::cout << "-------------Kokkos command line arguments--------------------------------------" << std::endl;
+         std::cout << "--------------------------------------------------------------------------------" << std::endl;
+         std::cout << "The following arguments exist also without prefix 'kokkos' (e.g. --help)." << std::endl;
+         std::cout << "The prefixed arguments will be removed from the list by Kokkos::initialize()," << std::endl;
+         std::cout << "the non-prefixed ones are not removed. Prefixed versions take precedence over " << std::endl;
+         std::cout << "non prefixed ones, and the last occurrence of an argument overwrites prior" << std::endl;
+         std::cout << "settings." << std::endl;
+         std::cout << std::endl;
+         std::cout << "--kokkos-help               : print this message" << std::endl;
+         std::cout << "--kokkos-disable-warnings   : disable kokkos warning messages" << std::endl;
+         std::cout << "--kokkos-threads=INT        : specify total number of threads or" << std::endl;
+         std::cout << "                              number of threads per NUMA region if " << std::endl;
+         std::cout << "                              used in conjunction with '--numa' option. " << std::endl;
+         std::cout << "--kokkos-numa=INT           : specify number of NUMA regions used by process." << std::endl;
+         std::cout << "--kokkos-device=INT         : specify device id to be used by Kokkos. " << std::endl;
+         std::cout << "--kokkos-ndevices=INT[,INT] : used when running MPI jobs. Specify number of" << std::endl;
+         std::cout << "                              devices per node to be used. Process to device" << std::endl;
+         std::cout << "                              mapping happens by obtaining the local MPI rank" << std::endl;
+         std::cout << "                              and assigning devices round-robin. The optional" << std::endl;
+         std::cout << "                              second argument allows for an existing device" << std::endl;
+         std::cout << "                              to be ignored. This is most useful on workstations" << std::endl;
+         std::cout << "                              with multiple GPUs of which one is used to drive" << std::endl;
+         std::cout << "                              screen output." << std::endl;
+         std::cout << std::endl;
+         std::cout << "--------------------------------------------------------------------------------" << std::endl;
+         std::cout << std::endl;
+
+         //Remove the --kokkos-help argument from the list but leave --help
+         if(strcmp(arg[iarg],"--kokkos-help") == 0) {
+           for(int k=iarg;k<narg-1;k++) {
+             arg[k] = arg[k+1];
+           }
+           narg--;
+         } else {
+           iarg++;
+         }
+      } else
+      iarg++;
+    }
+
+    InitArguments arguments{num_threads, numa, device, disable_warnings};
+    Impl::initialize_internal(arguments);
+}
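+
+// For illustration only (argument values are hypothetical): a typical
+// invocation exercising the parsing above could look like
+//
+//   ./app --kokkos-threads=16 --kokkos-ndevices=2,0 --kokkos-disable-warnings
+//
+// The '--kokkos-*' arguments are consumed here and removed from argv before
+// control returns to the application; the un-prefixed variants are parsed but
+// left in place.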
+
+void initialize(const InitArguments& arguments) {
+  Impl::initialize_internal(arguments);
+}
+
+void push_finalize_hook(std::function<void()> f)
+{
+  finalize_hooks.push(f);
+}
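+
+// A small usage sketch (illustrative, not part of the library): hooks pushed
+// here are run in reverse order of registration when Kokkos::finalize() is
+// called.
+//
+//   Kokkos::initialize(argc, argv);
+//   Kokkos::push_finalize_hook([]{ std::cout << "runs second\n"; });
+//   Kokkos::push_finalize_hook([]{ std::cout << "runs first\n"; });
+//   Kokkos::finalize();   // invokes the hooks in LIFO order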
+
+void finalize()
+{
+  Impl::finalize_internal();
+}
+
+void finalize_all()
+{
+  enum { all_spaces = true };
+  Impl::finalize_internal( all_spaces );
+}
+
+void fence()
+{
+  Impl::fence_internal();
+}
+
+void print_configuration( std::ostream & out , const bool detail )
+{
+  std::ostringstream msg;
+
+  msg << "Compiler:" << std::endl;
+#ifdef KOKKOS_COMPILER_APPLECC
+  msg << "  KOKKOS_COMPILER_APPLECC: " << KOKKOS_COMPILER_APPLECC << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_CLANG
+  msg << "  KOKKOS_COMPILER_CLANG: " << KOKKOS_COMPILER_CLANG << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_CRAYC
+  msg << "  KOKKOS_COMPILER_CRAYC: " << KOKKOS_COMPILER_CRAYC << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_GNU
+  msg << "  KOKKOS_COMPILER_GNU: " << KOKKOS_COMPILER_GNU << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_IBM
+  msg << "  KOKKOS_COMPILER_IBM: " << KOKKOS_COMPILER_IBM << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_INTEL
+  msg << "  KOKKOS_COMPILER_INTEL: " << KOKKOS_COMPILER_INTEL << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_NVCC
+  msg << "  KOKKOS_COMPILER_NVCC: " << KOKKOS_COMPILER_NVCC << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_PGI
+  msg << "  KOKKOS_COMPILER_PGI: " << KOKKOS_COMPILER_PGI << std::endl;
+#endif
+
+
+  msg << "Architecture:" << std::endl;
+#ifdef KOKKOS_ENABLE_ISA_KNC
+  msg << "  KOKKOS_ENABLE_ISA_KNC: yes" << std::endl;
+#else
+  msg << "  KOKKOS_ENABLE_ISA_KNC: no" << std::endl;
+#endif
+#ifdef KOKKOS_ENABLE_ISA_POWERPCLE
+  msg << "  KOKKOS_ENABLE_ISA_POWERPCLE: yes" << std::endl;
+#else
+  msg << "  KOKKOS_ENABLE_ISA_POWERPCLE: no" << std::endl;
+#endif
+#ifdef KOKKOS_ENABLE_ISA_X86_64
+  msg << "  KOKKOS_ENABLE_ISA_X86_64: yes" << std::endl;
+#else
+  msg << "  KOKKOS_ENABLE_ISA_X86_64: no" << std::endl;
+#endif
+
+
+  msg << "Devices:" << std::endl;
+  msg << "  KOKKOS_ENABLE_CUDA: ";
+#ifdef KOKKOS_ENABLE_CUDA
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_OPENMP: ";
+#ifdef KOKKOS_ENABLE_OPENMP
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_THREADS: ";
+#ifdef KOKKOS_ENABLE_THREADS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_QTHREADS: ";
+#ifdef KOKKOS_ENABLE_QTHREADS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_SERIAL: ";
+#ifdef KOKKOS_ENABLE_SERIAL
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+
+  msg << "Default Device:" << std::endl;
+  msg << "  KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA: ";
+#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP: ";
+#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS: ";
+#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS: ";
+#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL: ";
+#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+
+  msg << "Atomics:" << std::endl;
+  msg << "  KOKKOS_ENABLE_CUDA_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_CUDA_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_GNU_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_GNU_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_INTEL_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_INTEL_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_OPENMP_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_OPENMP_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_WINDOWS_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_WINDOWS_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_SERIAL_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_SERIAL_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+  msg << "Vectorization:" << std::endl;
+  msg << "  KOKKOS_ENABLE_PRAGMA_IVDEP: ";
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PRAGMA_LOOPCOUNT: ";
+#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PRAGMA_SIMD: ";
+#ifdef KOKKOS_ENABLE_PRAGMA_SIMD
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PRAGMA_UNROLL: ";
+#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PRAGMA_VECTOR: ";
+#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+  msg << "Memory:" << std::endl;
+  msg << "  KOKKOS_ENABLE_HBWSPACE: ";
+#ifdef KOKKOS_ENABLE_HBWSPACE
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_INTEL_MM_ALLOC: ";
+#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_POSIX_MEMALIGN: ";
+#ifdef KOKKOS_ENABLE_POSIX_MEMALIGN
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+
+  msg << "Options:" << std::endl;
+  msg << "  KOKKOS_ENABLE_ASM: ";
+#ifdef KOKKOS_ENABLE_ASM
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CXX1Z: ";
+#ifdef KOKKOS_ENABLE_CXX1Z
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK: ";
+#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_HWLOC: ";
+#ifdef KOKKOS_ENABLE_HWLOC
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_LIBRT: ";
+#ifdef KOKKOS_ENABLE_LIBRT
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_MPI: ";
+#ifdef KOKKOS_ENABLE_MPI
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PROFILING: ";
+#ifdef KOKKOS_ENABLE_PROFILING
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+  msg << "Cuda Options:" << std::endl;
+  msg << "  KOKKOS_ENABLE_CUDA_LAMBDA: ";
+#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: ";
+#ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: ";
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CUDA_UVM: ";
+#ifdef KOKKOS_ENABLE_CUDA_UVM
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CUSPARSE: ";
+#ifdef KOKKOS_ENABLE_CUSPARSE
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: ";
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+#endif
+
+  msg << "\nRuntime Configuration:" << std::endl;
+#ifdef KOKKOS_ENABLE_CUDA
+  Cuda::print_configuration(msg, detail);
+#endif
+#ifdef KOKKOS_ENABLE_OPENMP
+  OpenMP::print_configuration(msg, detail);
+#endif
+#if defined( KOKKOS_ENABLE_THREADS )
+  Threads::print_configuration(msg, detail);
+#endif
+#ifdef KOKKOS_ENABLE_QTHREADS
+  Qthreads::print_configuration(msg, detail);
+#endif
+#ifdef KOKKOS_ENABLE_SERIAL
+  Serial::print_configuration(msg, detail);
+#endif
+
+  out << msg.str() << std::endl;
+}
+
+bool is_initialized() noexcept { return g_is_initialized; }
+
+bool show_warnings() noexcept { return g_show_warnings; }
+
+} // namespace Kokkos
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.cpp b/packages/kokkos/core/src/impl/Kokkos_Error.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..33b0ba918d3a4d63fb04edf559dc05ce9ef24424
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Error.cpp
@@ -0,0 +1,194 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+
+#include <ostream>
+#include <sstream>
+#include <iomanip>
+#include <stdexcept>
+#include <impl/Kokkos_Error.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void host_abort( const char * const message )
+{
+  fwrite(message,1,strlen(message),stderr);
+  fflush(stderr);
+  ::abort();
+}
+
+void throw_runtime_exception( const std::string & msg )
+{
+  std::ostringstream o ;
+  o << msg ;
+  traceback_callstack( o );
+  throw std::runtime_error( o.str() );
+}
+
+
+std::string human_memory_size(size_t arg_bytes)
+{
+  double bytes = arg_bytes;
+  const double K = 1024;
+  const double M = K*1024;
+  const double G = M*1024;
+
+  std::ostringstream out;
+  if (bytes < K) {
+    out << std::setprecision(4) << bytes << " B";
+  } else if (bytes < M) {
+    bytes /= K;
+    out << std::setprecision(4) << bytes << " K";
+  } else if (bytes < G) {
+    bytes /= M;
+    out << std::setprecision(4) << bytes << " M";
+  } else {
+    bytes /= G;
+    out << std::setprecision(4) << bytes << " G";
+  }
+  return out.str();
+}
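+
+// A few sample conversions produced by the function above (values are
+// illustrative):
+//   human_memory_size( 512 )        -> "512 B"
+//   human_memory_size( 1536 )       -> "1.5 K"
+//   human_memory_size( 3ul << 20 )  -> "3 M"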
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( __GNUC__ ) && defined( ENABLE_TRACEBACK )
+
+/*  This is only known to work with GNU C++
+ *  Must be compiled with '-rdynamic'
+ *  Must be linked with   '-ldl'
+ */
+
+/* Print call stack into an error stream,
+ * so one knows in which function the error occurred.
+ *
+ * Code copied from:
+ *   http://stupefydeveloper.blogspot.com/2008/10/cc-call-stack.html
+ *
+ * License on this site:
+ *   This blog is licensed under a
+ *   Creative Commons Attribution-Share Alike 3.0 Unported License.
+ *
+ *   http://creativecommons.org/licenses/by-sa/3.0/
+ *
+ * Modified to output to std::ostream.
+ */
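+
+/* For example, with a GNU toolchain the traceback support below would
+ * typically be enabled by compiling and linking with flags along the lines
+ * of (illustrative, build systems differ):
+ *
+ *   g++ -DENABLE_TRACEBACK -rdynamic app.cpp -ldl
+ */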
+#include <signal.h>
+#include <execinfo.h>
+#include <cxxabi.h>
+#include <dlfcn.h>
+
+#include <cstdlib>
+
+namespace Kokkos {
+namespace Impl {
+
+void traceback_callstack( std::ostream & msg )
+{
+  using namespace abi;
+
+  enum { MAX_DEPTH = 32 };
+
+  void *trace[MAX_DEPTH];
+  Dl_info dlinfo;
+
+  int status;
+
+  int trace_size = backtrace(trace, MAX_DEPTH);
+
+  msg << std::endl << "Call stack {" << std::endl ;
+
+  for (int i=1; i<trace_size; ++i)
+  {
+    if(!dladdr(trace[i], &dlinfo))
+        continue;
+
+    const char * symname = dlinfo.dli_sname;
+
+    char * demangled = __cxa_demangle(symname, NULL, 0, &status);
+
+    if ( status == 0 && demangled ) {
+      symname = demangled;
+    }
+
+    if ( symname && *symname != 0 ) {
+      msg << "  object: " << dlinfo.dli_fname
+          << " function: " << symname
+          << std::endl ;
+    }
+
+    if ( demangled ) {
+        free(demangled);
+    }
+  }
+  msg << "}" ;
+}
+
+}
+}
+
+#else
+
+namespace Kokkos {
+namespace Impl {
+
+void traceback_callstack( std::ostream & msg )
+{
+  msg << std::endl << "Traceback functionality not available" << std::endl ;
+}
+
+}
+}
+
+#endif
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.hpp b/packages/kokkos/core/src/impl/Kokkos_Error.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f15c20fa70332f9485a1bc515c643f7cc0cbdffd
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Error.hpp
@@ -0,0 +1,90 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_ERROR_HPP
+#define KOKKOS_IMPL_ERROR_HPP
+
+#include <string>
+#include <iosfwd>
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+#include <Cuda/Kokkos_Cuda_abort.hpp>
+#endif
+
+namespace Kokkos {
+namespace Impl {
+
+void host_abort( const char * const );
+
+void throw_runtime_exception( const std::string & );
+
+void traceback_callstack( std::ostream & );
+
+std::string human_memory_size(size_t arg_bytes);
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+
+namespace Kokkos {
+KOKKOS_INLINE_FUNCTION
+void abort( const char * const message ) {
+#ifdef __CUDA_ARCH__
+  Kokkos::Impl::cuda_abort(message);
+#else
+  #if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(__HCC_ACCELERATOR__)
+    Kokkos::Impl::host_abort(message);
+  #endif
+#endif
+}
+
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_IMPL_ERROR_HPP */
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp b/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..72ba5156136e6ddc460d36d65af8a27e3317561d
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp
@@ -0,0 +1,63 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+namespace Kokkos {
+namespace Impl {
+    PerTeamValue::PerTeamValue(int arg):value(arg) {}
+
+    PerThreadValue::PerThreadValue(int arg):value(arg) {}
+}
+
+Impl::PerTeamValue PerTeam(const int& arg)
+{
+  return Impl::PerTeamValue(arg);
+}
+
+Impl::PerThreadValue PerThread(const int& arg)
+{
+  return Impl::PerThreadValue(arg);
+}
+
+}
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp b/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7d4ffb85c124fe6b9249e7521f251e1a30692105
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
@@ -0,0 +1,1767 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_FUNCTORADAPTER_HPP
+#define KOKKOS_FUNCTORADAPTER_HPP
+
+#include <cstddef>
+#include <Kokkos_Core_fwd.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType, class Enable = void>
+struct ReduceFunctorHasInit {
+  enum {value = false};
+};
+
+template< class FunctorType>
+struct ReduceFunctorHasInit<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type > {
+  enum {value = true};
+};
+
+template< class FunctorType, class Enable = void>
+struct ReduceFunctorHasJoin {
+  enum {value = false};
+};
+
+template< class FunctorType>
+struct ReduceFunctorHasJoin<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type > {
+  enum {value = true};
+};
+
+template< class FunctorType, class Enable = void>
+struct ReduceFunctorHasFinal {
+  enum {value = false};
+};
+
+template< class FunctorType>
+struct ReduceFunctorHasFinal<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::final ) >::type > {
+  enum {value = true};
+};
+
+template< class FunctorType, class Enable = void>
+  struct ReduceFunctorHasShmemSize {
+  enum {value = false};
+};
+
+template< class FunctorType>
+struct ReduceFunctorHasShmemSize<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::team_shmem_size ) >::type > {
+  enum {value = true};
+};
+
+template< class FunctorType , class ArgTag , class Enable = void >
+struct FunctorDeclaresValueType : public Impl::false_type {};
+
+template< class FunctorType , class ArgTag >
+struct FunctorDeclaresValueType< FunctorType , ArgTag
+                               , typename Impl::enable_if_type< typename FunctorType::value_type >::type >
+  : public Impl::true_type {};
+
+template< class FunctorType, bool Enable =
+      ( FunctorDeclaresValueType<FunctorType,void>::value) ||
+      ( ReduceFunctorHasInit<FunctorType>::value  ) ||
+      ( ReduceFunctorHasJoin<FunctorType>::value  ) ||
+      ( ReduceFunctorHasFinal<FunctorType>::value ) ||
+      ( ReduceFunctorHasShmemSize<FunctorType>::value )
+      >
+struct IsNonTrivialReduceFunctor {
+  enum {value = false};
+};
+
+template< class FunctorType>
+struct IsNonTrivialReduceFunctor<FunctorType, true> {
+  enum {value = true};
+};
+
+/** \brief  Query Functor and execution policy argument tag for value type.
+ *
+ *  If C++11 enabled and 'value_type' is not explicitly declared then attempt
+ *  to deduce the type from FunctorType::operator().
+ */
+template< class FunctorType , class ArgTag , bool Dec = FunctorDeclaresValueType<FunctorType,ArgTag>::value >
+struct FunctorValueTraits
+{
+  typedef void value_type ;
+  typedef void pointer_type ;
+  typedef void reference_type ;
+  typedef void functor_type ;
+
+  enum { StaticValueSize = 0 };
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_count( const FunctorType & ) { return 0 ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_size( const FunctorType & ) { return 0 ; }
+};
+
+template<class ArgTag>
+struct FunctorValueTraits<void, ArgTag,false>
+{
+  typedef void value_type ;
+  typedef void pointer_type ;
+  typedef void reference_type ;
+  typedef void functor_type ;
+};
+
+/** \brief  FunctorType::value_type is explicitly declared so use it.
+ *
+ * Two options for declaration
+ *
+ *   1) A plain-old-data (POD) type
+ *        typedef {pod_type} value_type ;
+ *
+ *   2) An array of POD of a runtime specified count.
+ *        typedef {pod_type} value_type[] ;
+ *        const unsigned     value_count ;
+ */
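+
+/* For illustration (a hypothetical functor, not part of Kokkos): an array
+ * reduction functor would declare the two members sketched above as
+ *
+ *   struct ArraySum {
+ *     typedef double value_type[];   // option (2): array of POD
+ *     const unsigned value_count;    // runtime length of the reduction array
+ *
+ *     ArraySum( unsigned n ) : value_count( n ) {}
+ *
+ *     KOKKOS_INLINE_FUNCTION
+ *     void operator()( const int i , double dst[] ) const
+ *       { dst[ i % value_count ] += 1 ; }
+ *   };
+ */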
+template< class FunctorType , class ArgTag >
+struct FunctorValueTraits< FunctorType , ArgTag , true /* == exists FunctorType::value_type */ >
+{
+  typedef typename Impl::remove_extent< typename FunctorType::value_type >::type  value_type ;
+  typedef FunctorType functor_type;
+
+  static_assert( 0 == ( sizeof(value_type) % sizeof(int) ) ,
+    "Reduction functor's declared value_type requires: 0 == sizeof(value_type) % sizeof(int)" );
+
+  /* this cast to bool is needed for correctness by NVCC */
+  enum : bool { IsArray = static_cast<bool>(Impl::is_array< typename FunctorType::value_type >::value) };
+
+  // If not an array then what is the sizeof(value_type)
+  enum { StaticValueSize = IsArray ? 0 : sizeof(value_type) };
+
+  typedef value_type                 * pointer_type ;
+
+  // The reference_type for an array is 'value_type *'
+  // The reference_type for a single value is 'value_type &'
+
+  typedef typename Impl::if_c< IsArray , value_type *
+                                       , value_type & >::type  reference_type ;
+
+  // Number of values if single value
+  template< class F >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  typename Impl::enable_if< std::is_same<F,FunctorType>::value && ! IsArray , unsigned >::type
+    value_count( const F & ) { return 1 ; }
+
+  // Number of values if an array, protect via templating because 'f.value_count'
+  // will only exist when the functor declares the value_type to be an array.
+  template< class F >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  typename Impl::enable_if< std::is_same<F,FunctorType>::value && IsArray , unsigned >::type
+    value_count( const F & f ) { return f.value_count ; }
+
+  // Total size of the value
+  KOKKOS_INLINE_FUNCTION static
+  unsigned value_size( const FunctorType & f ) { return value_count( f ) * sizeof(value_type) ; }
+};
+
+
+template< class FunctorType , class ArgTag >
+struct FunctorValueTraits< FunctorType
+                         , ArgTag
+                         , false  /* == exists FunctorType::value_type */
+                         >
+{
+private:
+
+  struct VOIDTAG {};   // Allow declaration of non-matching operator() with void argument tag.
+  struct REJECTTAG {}; // Reject tagged operator() when using non-tagged execution policy.
+
+  typedef typename
+    Impl::if_c< std::is_same< ArgTag , void >::value , VOIDTAG , ArgTag >::type tag_type ;
+
+  //----------------------------------------
+  // parallel_for operator without a tag:
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+
+  //----------------------------------------
+  // parallel_for operator with a tag:
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember ) const ) {}
+
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & ) const ) {}
+
+
+  //----------------------------------------
+  // parallel_reduce operator without a tag:
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  //----------------------------------------
+  // parallel_reduce operator with a tag:
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , ArgMember , T & ) const ) {}
+
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , const ArgMember & , T & ) const ) {}
+
+  //----------------------------------------
+  // parallel_scan operator without a tag:
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , T & , bool ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , T & , bool ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , T & , bool ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , T & , bool ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , T & , const bool& ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , T & , const bool& ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , T & , const bool& ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , T & , const bool& ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , T & , const bool& ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , T & , const bool& ) const ) {}
+  //----------------------------------------
+  // parallel_scan operator with a tag:
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember& , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember& , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , T & , const bool& ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , T & , const bool& ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember& , T & , const bool& ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember& , T & , const bool& ) const ) {}
+  //----------------------------------------
+
+  typedef decltype( deduce_reduce_type( tag_type() , & FunctorType::operator() ) ) ValueType ;
+
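+  // IS_VOID   : deduction returned VOIDTAG   (operator() has no reduction argument).
+  // IS_REJECT : deduction returned REJECTTAG (tagged operator() with an untagged policy).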
+  enum { IS_VOID   = std::is_same<VOIDTAG  ,ValueType>::value };
+  enum { IS_REJECT = std::is_same<REJECTTAG,ValueType>::value };
+
+public:
+
+  typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType   >::type  value_type ;
+  typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType * >::type  pointer_type ;
+  typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType & >::type  reference_type ;
+  typedef FunctorType functor_type;
+
+  static_assert( IS_VOID || IS_REJECT || 0 == ( sizeof(ValueType) % sizeof(int) ) ,
+    "Reduction functor's value_type deduced from functor::operator() requires: 0 == sizeof(value_type) % sizeof(int)" );
+
+  enum { StaticValueSize = IS_VOID || IS_REJECT ? 0 : sizeof(ValueType) };
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_size( const FunctorType & ) { return StaticValueSize ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_count( const FunctorType & ) { return IS_VOID || IS_REJECT ? 0 : 1 ; }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** Function signatures for FunctorType::init function with a tag.
+ *  reference_type is 'value_type &' for scalar and 'value_type *' for array.
+ */
+template< class FunctorType , class ArgTag >
+struct FunctorValueInitFunction {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::reference_type
+    reference_type ;
+
+  KOKKOS_INLINE_FUNCTION static void
+    enable_if( void (FunctorType::*)( ArgTag         , reference_type ) const );
+  KOKKOS_INLINE_FUNCTION static void
+    enable_if( void (FunctorType::*)( ArgTag const & , reference_type ) const );
+  KOKKOS_INLINE_FUNCTION static void
+    enable_if( void (             *)( ArgTag         , reference_type ) );
+  KOKKOS_INLINE_FUNCTION static void
+    enable_if( void (             *)( ArgTag const & , reference_type ) );
+
+};
+
+/** Function signatures for FunctorType::init function without a tag.
+ *  reference_type is 'value_type &' for scalar and 'value_type *' for array.
+ */
+template< class FunctorType >
+struct FunctorValueInitFunction< FunctorType , void > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::reference_type
+    reference_type ;
+
+  KOKKOS_INLINE_FUNCTION static void
+    enable_if( void (FunctorType::*)( reference_type ) const );
+  KOKKOS_INLINE_FUNCTION static void
+    enable_if( void (             *)( reference_type ) );
+};
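+
+// Illustrative sketch (not from this header): a functor whose init() matches
+// the untagged scalar signature above, so the FunctorValueInit specialization
+// further below calls f.init(...) instead of default-constructing the value:
+//
+//   struct MaxFunctor {
+//     typedef double value_type ;
+//     KOKKOS_INLINE_FUNCTION
+//     void init( double & update ) const { update = -1.0e300 ; }
+//   };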
+
+// Adapter for the value initialization function.
+// If a proper FunctorType::init is declared then use it,
+// otherwise use the value_type's default constructor.
+template< class FunctorType , class ArgTag
+        , class T = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type // FIXME Fix FunctorValueTraits for multi-dim operator
+        , class Enable = void >
+struct FunctorValueInit ;
+
+/* No 'init' function provided for single value */
+template< class FunctorType , class ArgTag , class T , class Enable >
+struct FunctorValueInit< FunctorType , ArgTag , T & , Enable >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T & init( const FunctorType & , void * p )
+    { return *( new(p) T() ); }
+};
+
+/* No 'init' function provided for array value */
+template< class FunctorType , class ArgTag , class T , class Enable >
+struct FunctorValueInit< FunctorType , ArgTag , T * , Enable >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * init( const FunctorType & f , void * p )
+    {
+      const int n = FunctorValueTraits< FunctorType , ArgTag >::value_count(f);
+      for ( int i = 0 ; i < n ; ++i ) { new( ((T*)p) + i ) T(); }
+      return (T*)p ;
+    }
+};
+
+/* 'init' function provided, no tag, single value */
+template< class FunctorType , class T >
+struct FunctorValueInit
+  < FunctorType
+  , void
+  , T &
+    // First  substitution failure when FunctorType::init does not exist.
+    // Second substitution failure when FunctorType::init is not compatible.
+  , decltype( FunctorValueInitFunction< FunctorType , void >::enable_if( & FunctorType::init ) )
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T & init( const FunctorType & f , void * p )
+    { f.init( *((T*)p) ); return *((T*)p) ; }
+};
+
+/* 'init' function provided, no tag, array value */
+template< class FunctorType , class T >
+struct FunctorValueInit
+  < FunctorType
+  , void
+  , T *
+    // First  substitution failure when FunctorType::init does not exist.
+    // Second substitution failure when FunctorType::init is not compatible
+  , decltype( FunctorValueInitFunction< FunctorType , void >::enable_if( & FunctorType::init ) )
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * init( const FunctorType & f , void * p )
+    { f.init( (T*)p ); return (T*)p ; }
+};
+
+/* 'init' function provided, with tag, single value */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueInit
+  < FunctorType
+  , ArgTag
+  , T &
+    // First  substitution failure when FunctorType::init does not exist.
+    // Second substitution failure when FunctorType::init is not compatible.
+  , decltype( FunctorValueInitFunction< FunctorType , ArgTag >::enable_if( & FunctorType::init ) )
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T & init( const FunctorType & f , void * p )
+    { f.init( ArgTag() , *((T*)p) ); return *((T*)p) ; }
+};
+
+/* 'init' function provided, with tag, array value */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueInit
+  < FunctorType
+  , ArgTag
+  , T *
+    // First  substitution failure when FunctorType::init does not exist.
+    // Second substitution failure when FunctorType::init is not compatible
+  , decltype( FunctorValueInitFunction< FunctorType , ArgTag >::enable_if( & FunctorType::init ) )
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * init( const FunctorType & f , void * p )
+    { f.init( ArgTag() , (T*)p ); return (T*)p ; }
+};
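+
+// Whichever specialization is selected, FunctorValueInit<F,Tag>::init(f,p)
+// constructs the reduction value in the raw scratch storage 'p' and returns a
+// reference (scalar) or pointer (array) to it; the parallel dispatch uses this
+// to initialize each thread's partial reduction value.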
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Signatures for compatible FunctorType::join with tag and not an array
+template< class FunctorType , class ArgTag , bool IsArray = 0 == FunctorValueTraits<FunctorType,ArgTag>::StaticValueSize >
+struct FunctorValueJoinFunction {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
+
+  typedef       volatile value_type & vref_type ;
+  typedef const volatile value_type & cvref_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , vref_type , cvref_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , vref_type , cvref_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , vref_type , cvref_type ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , vref_type , cvref_type ) );
+};
+
+// Signatures for compatible FunctorType::join with tag and is an array
+template< class FunctorType , class ArgTag >
+struct FunctorValueJoinFunction< FunctorType , ArgTag , true > {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
+
+  typedef       volatile value_type * vptr_type ;
+  typedef const volatile value_type * cvptr_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , vptr_type , cvptr_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , vptr_type , cvptr_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , vptr_type , cvptr_type ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , vptr_type , cvptr_type ) );
+};
+
+// Signatures for compatible FunctorType::join without tag and not an array
+template< class FunctorType >
+struct FunctorValueJoinFunction< FunctorType , void , false > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
+
+  typedef       volatile value_type & vref_type ;
+  typedef const volatile value_type & cvref_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( vref_type , cvref_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( vref_type , cvref_type ) );
+};
+
+// Signatures for compatible FunctorType::join without tag and is an array
+template< class FunctorType >
+struct FunctorValueJoinFunction< FunctorType , void , true > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
+
+  typedef       volatile value_type * vptr_type ;
+  typedef const volatile value_type * cvptr_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( vptr_type , cvptr_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( vptr_type , cvptr_type ) );
+};
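+
+// The volatile qualifiers in the signatures above reflect the requirement that
+// user-provided join() implementations accept
+// ( volatile value_type & , const volatile value_type & ), since reduction
+// contributions may reside in memory concurrently updated by other threads.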
+
+
+template< class FunctorType , class ArgTag
+        , class T = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type
+        , class Enable = void >
+struct FunctorValueJoin ;
+
+/* No 'join' function provided, single value */
+template< class FunctorType , class ArgTag , class T , class Enable >
+struct FunctorValueJoin< FunctorType , ArgTag , T & , Enable >
+{
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& ){}
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
+    {
+      *((volatile T*)lhs) += *((const volatile T*)rhs);
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( volatile T& lhs , const volatile T& rhs ) const
+    {
+      lhs += rhs;
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T& lhs , const T& rhs ) const
+    {
+      lhs += rhs;
+    }
+};
+
+/* No 'join' function provided, array of values */
+template< class FunctorType , class ArgTag , class T , class Enable >
+struct FunctorValueJoin< FunctorType , ArgTag , T * , Enable >
+{
+  const FunctorType& f;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& f_):f(f_){}
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs )
+    {
+      const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f_);
+
+      for ( int i = 0 ; i < n ; ++i ) { ((volatile T*)lhs)[i] += ((const volatile T*)rhs)[i]; }
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( volatile T* const lhs , const volatile T* const rhs ) const
+    {
+      const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f);
+
+      for ( int i = 0 ; i < n ; ++i ) { lhs[i] += rhs[i]; }
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T* lhs , const T* rhs ) const
+    {
+      const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f);
+
+      for ( int i = 0 ; i < n ; ++i ) { lhs[i] += rhs[i]; }
+    }
+};
+
+/* 'join' function provided, with tag, single value */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueJoin
+  < FunctorType
+  , ArgTag
+  , T &
+    // First  substitution failure when FunctorType::join does not exist.
+    // Second substitution failure when enable_if( & Functor::join ) does not exist
+  , decltype( FunctorValueJoinFunction< FunctorType , ArgTag >::enable_if( & FunctorType::join ) )
+  >
+{
+  const FunctorType& f;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& f_):f(f_){}
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs )
+    {
+      f_.join( ArgTag() , *((volatile T *)lhs) , *((const volatile T *)rhs) );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( volatile T& lhs , const volatile T& rhs ) const
+    {
+      f.join( ArgTag() , lhs , rhs );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T& lhs , const T& rhs ) const
+    {
+      f.join( ArgTag(), lhs , rhs );
+    }
+};
+
+/* 'join' function provided, no tag, single value */
+template< class FunctorType , class T >
+struct FunctorValueJoin
+  < FunctorType
+  , void
+  , T &
+    // First  substitution failure when FunctorType::join does not exist.
+    // Second substitution failure when enable_if( & Functor::join ) does not exist
+  , decltype( FunctorValueJoinFunction< FunctorType , void >::enable_if( & FunctorType::join ) )
+  >
+{
+  const FunctorType& f;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& f_):f(f_){}
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs )
+    {
+      f_.join( *((volatile T *)lhs) , *((const volatile T *)rhs) );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( volatile T& lhs , const volatile T& rhs ) const
+    {
+      f.join( lhs , rhs );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T& lhs , const T& rhs ) const
+    {
+      f.join( lhs , rhs );
+    }
+};
+
+/* 'join' function provided, with tag, array value */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueJoin
+  < FunctorType
+  , ArgTag
+  , T *
+    // First  substitution failure when FunctorType::join does not exist.
+    // Second substitution failure when enable_if( & Functor::join ) does not exist
+  , decltype( FunctorValueJoinFunction< FunctorType , ArgTag >::enable_if( & FunctorType::join ) )
+  >
+{
+  const FunctorType& f;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& f_):f(f_){}
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs )
+    {
+      f_.join( ArgTag() , (volatile T *)lhs , (const volatile T *)rhs );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( volatile T* const lhs , const volatile T* const rhs ) const
+    {
+      f.join( ArgTag() , lhs , rhs );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T* lhs , const T* rhs ) const
+    {
+      f.join( ArgTag(), lhs , rhs );
+    }
+};
+
+/* 'join' function provided, no tag, array value */
+template< class FunctorType , class T >
+struct FunctorValueJoin
+  < FunctorType
+  , void
+  , T *
+    // First  substitution failure when FunctorType::join does not exist.
+    // Second substitution failure when enable_if( & Functor::join ) does not exist
+  , decltype( FunctorValueJoinFunction< FunctorType , void >::enable_if( & FunctorType::join ) )
+  >
+{
+  const FunctorType& f;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& f_):f(f_){}
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs )
+    {
+      f_.join( (volatile T *)lhs , (const volatile T *)rhs );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( volatile T* const lhs , const volatile T* const rhs ) const
+    {
+      f.join( lhs , rhs );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T* lhs , const T* rhs ) const
+    {
+      f.join( lhs , rhs );
+    }
+};
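+
+// Illustrative sketch (not from this header): a functor providing its own
+// join(), which is detected by the enable_if machinery above and used in place
+// of the default operator+= reduction:
+//
+//   struct MinReduce {
+//     typedef double value_type ;
+//     KOKKOS_INLINE_FUNCTION
+//     void join( volatile double & dst , const volatile double & src ) const
+//       { if ( src < dst ) dst = src ; }
+//   };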
+
+} // namespace Impl
+} // namespace Kokkos
+
+namespace Kokkos {
+
+namespace Impl {
+
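+  // JoinLambdaAdapter presents a uniform join interface:
+  // the primary template forwards to a plain binary lambda or functor through
+  // operator(), while the specialization below detects a member join()
+  // (via FunctorValueJoinFunction) and forwards to that instead.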
+  template<typename ValueType, class JoinOp, class Enable = void>
+  struct JoinLambdaAdapter {
+    typedef ValueType value_type;
+    const JoinOp& lambda;
+    KOKKOS_INLINE_FUNCTION
+    JoinLambdaAdapter(const JoinOp& lambda_):lambda(lambda_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void join(volatile value_type& dst, const volatile value_type& src) const {
+      lambda(dst,src);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void join(value_type& dst, const value_type& src) const {
+      lambda(dst,src);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (volatile value_type& dst, const volatile value_type& src) const {
+      lambda(dst,src);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (value_type& dst, const value_type& src) const {
+      lambda(dst,src);
+    }
+  };
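+  // Illustrative use of the generic adapter (hypothetical lambda 'l' and
+  // values 'x', 'y', not defined here):
+  //   double x = 1.0 , y = 2.0 ;
+  //   auto l = []( double & dst , const double & src ) { dst += src ; };
+  //   JoinLambdaAdapter< double , decltype(l) > a( l );
+  //   a.join( x , y ); // forwards to l( x , y )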
+
+  template<typename ValueType, class JoinOp>
+  struct JoinLambdaAdapter<ValueType, JoinOp, decltype( FunctorValueJoinFunction< JoinOp , void >::enable_if( & JoinOp::join ) )> {
+    typedef ValueType value_type;
+    typedef StaticAssertSame<ValueType,typename JoinOp::value_type> assert_value_types_match;
+    const JoinOp& lambda;
+    KOKKOS_INLINE_FUNCTION
+    JoinLambdaAdapter(const JoinOp& lambda_):lambda(lambda_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void join(volatile value_type& dst, const volatile value_type& src) const {
+      lambda.join(dst,src);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void join(value_type& dst, const value_type& src) const {
+      lambda.join(dst,src);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (volatile value_type& dst, const volatile value_type& src) const {
+      lambda.join(dst,src);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (value_type& dst, const value_type& src) const {
+      lambda.join(dst,src);
+    }
+  };
+
+  template<typename ValueType>
+  struct JoinAdd {
+    typedef ValueType value_type;
+
+    KOKKOS_INLINE_FUNCTION
+    JoinAdd() {}
+
+    KOKKOS_INLINE_FUNCTION
+    void join(volatile value_type& dst, const volatile value_type& src) const {
+      dst+=src;
+    }
+    KOKKOS_INLINE_FUNCTION
+    void operator() (value_type& dst, const value_type& src) const {
+      dst+=src;
+    }
+    KOKKOS_INLINE_FUNCTION
+    void operator() (volatile value_type& dst, const volatile value_type& src) const {
+      dst+=src;
+    }
+  };
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ArgTag
+        , class T = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type >
+struct FunctorValueOps ;
+
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueOps< FunctorType , ArgTag , T & >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * pointer( T & r ) { return & r ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T & reference( void * p ) { return *((T*)p); }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void copy( const FunctorType & , void * const lhs , const void * const rhs )
+    { *((T*)lhs) = *((const T*)rhs); }
+};
+
+/* Value operations for an array of values */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueOps< FunctorType , ArgTag , T * >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * pointer( T * p ) { return p ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * reference( void * p ) { return ((T*)p); }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void copy( const FunctorType & f , void * const lhs , const void * const rhs )
+    {
+      const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f);
+      for ( int i = 0 ; i < n ; ++i ) { ((T*)lhs)[i] = ((const T*)rhs)[i]; }
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Compatible functions for 'final' function and value_type not an array
+template< class FunctorType , class ArgTag , bool IsArray = 0 == FunctorValueTraits<FunctorType,ArgTag>::StaticValueSize >
+struct FunctorFinalFunction {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type & ) );
+
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile & ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type volatile & ) );
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const & ) );
+
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile & ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile & ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const volatile & ) );
+};
+
+// Compatible functions for 'final' function and value_type is an array
+template< class FunctorType , class ArgTag >
+struct FunctorFinalFunction< FunctorType , ArgTag , true > {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type * ) );
+
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile * ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type volatile * ) );
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const * ) );
+
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile * ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile * ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const volatile * ) );
+};
+
+template< class FunctorType >
+struct FunctorFinalFunction< FunctorType , void , false > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type & ) );
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( const value_type & ) );
+};
+
+template< class FunctorType >
+struct FunctorFinalFunction< FunctorType , void , true > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type * ) );
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( const value_type * ) );
+};
+
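+/* Illustrative sketch (hypothetical functor, not defined here): a reduction
+   functor may post-process the reduced value once via 'final', e.g.
+
+     struct Mean {
+       int n ;
+       KOKKOS_INLINE_FUNCTION void operator()( int i , double & upd ) const { upd += i ; }
+       KOKKOS_INLINE_FUNCTION void final( double & upd ) const { upd /= n ; }
+     };
+
+   FunctorFinal below detects 'final' through the enable_if overloads above
+   and invokes it on the reduction result; otherwise it is a no-op. */
+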
+/* No 'final' function provided */
+template< class FunctorType , class ArgTag
+        , class ResultType = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type
+        , class Enable = void >
+struct FunctorFinal
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( const FunctorType & , void * ) {}
+};
+
+/* 'final' function provided */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorFinal
+  < FunctorType
+  , ArgTag
+  , T &
+    // First  substitution failure when FunctorType::final does not exist.
+    // Second substitution failure when enable_if( & Functor::final ) does not exist
+  , decltype( FunctorFinalFunction< FunctorType , ArgTag >::enable_if( & FunctorType::final ) )
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( const FunctorType & f , void * p ) { f.final( *((T*)p) ); }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( FunctorType & f , void * p ) { f.final( *((T*)p) ); }
+};
+
+/* 'final' function provided for array value */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorFinal
+  < FunctorType
+  , ArgTag
+  , T *
+    // First  substitution failure when FunctorType::final does not exist.
+    // Second substitution failure when enable_if( & Functor::final ) does not exist
+  , decltype( FunctorFinalFunction< FunctorType , ArgTag >::enable_if( & FunctorType::final ) )
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( const FunctorType & f , void * p ) { f.final( (T*)p ); }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( FunctorType & f , void * p ) { f.final( (T*)p ); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ArgTag
+        , class ReferenceType = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type >
+struct FunctorApplyFunction {
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , ReferenceType ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , ReferenceType ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , ReferenceType ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , ReferenceType ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , ReferenceType ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , ReferenceType ) );
+};
+
+template< class FunctorType , class ReferenceType >
+struct FunctorApplyFunction< FunctorType , void , ReferenceType > {
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ReferenceType ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ReferenceType ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ReferenceType ) );
+};
+
+template< class FunctorType >
+struct FunctorApplyFunction< FunctorType , void , void > {
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)() const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)() );
+};
+
+template< class FunctorType , class ArgTag , class ReferenceType
+        , class Enable = void >
+struct FunctorApply
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( const FunctorType & , void * ) {}
+};
+
+/* 'apply' function provided for void value */
+template< class FunctorType , class ArgTag >
+struct FunctorApply
+  < FunctorType
+  , ArgTag
+  , void
+    // First  substitution failure when FunctorType::apply does not exist.
+    // Second substitution failure when enable_if( & Functor::apply ) does not exist
+  , decltype( FunctorApplyFunction< FunctorType , ArgTag , void >::enable_if( & FunctorType::apply ) )
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( FunctorType & f ) { f.apply(); }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( const FunctorType & f ) { f.apply(); }
+};
+
+/* 'apply' function provided for single value */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorApply
+  < FunctorType
+  , ArgTag
+  , T &
+    // First  substitution failure when FunctorType::apply does not exist.
+    // Second substitution failure when enable_if( & Functor::apply ) does not exist
+  , decltype( FunctorApplyFunction< FunctorType , ArgTag >::enable_if( & FunctorType::apply ) )
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( const FunctorType & f , void * p ) { f.apply( *((T*)p) ); }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( FunctorType & f , void * p ) { f.apply( *((T*)p) ); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_FUNCTORADAPTER_HPP */
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cc4b2af1a20a7b73650093cacbb00300f9ff4a86
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
@@ -0,0 +1,836 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_FUNCTORANALYSIS_HPP
+#define KOKKOS_FUNCTORANALYSIS_HPP
+
+#include <cstddef>
+#include <Kokkos_Core_fwd.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+struct FunctorPatternInterface {
+  struct FOR {};
+  struct REDUCE {};
+  struct SCAN {};
+};
+
+/** \brief  Query Functor and execution policy argument tag for value type.
+ *
+ *  If 'value_type' is not explicitly declared in the functor
+ *  then attempt to deduce the type from FunctorType::operator()
+ *  interface used by the pattern and policy.
+ *
+ *  For the REDUCE pattern generate a Reducer and finalization function
+ *  derived from what is available within the functor.
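+ *
+ *  Illustrative sketch (hypothetical functor, not part of this header):
+ *
+ *    struct Sum {
+ *      KOKKOS_INLINE_FUNCTION
+ *      void operator()( int i , double & update ) const { update += i ; }
+ *    };
+ *
+ *  With the REDUCE pattern and an untagged policy the deduction overloads
+ *  below infer value_type = double from the trailing 'double &' argument
+ *  of operator().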
+ */
+template< typename PatternInterface , class Policy , class Functor >
+struct FunctorAnalysis {
+private:
+
+  using FOR    = FunctorPatternInterface::FOR ;
+  using REDUCE = FunctorPatternInterface::REDUCE ;
+  using SCAN   = FunctorPatternInterface::SCAN ;
+
+  //----------------------------------------
+
+  struct VOID {};
+
+  template< typename P = Policy , typename = std::false_type >
+  struct has_work_tag
+    {
+      using type = void ;
+      using wtag = VOID ;
+    };
+
+  template< typename P >
+  struct has_work_tag
+    < P , typename std::is_same< typename P::work_tag , void >::type >
+    {
+      using type = typename P::work_tag ;
+      using wtag = typename P::work_tag ;
+    };
+
+  using Tag  = typename has_work_tag<>::type ;
+  using WTag = typename has_work_tag<>::wtag ;
+
+  //----------------------------------------
+  // Check for T::execution_space
+
+  template< typename T , typename = std::false_type >
+  struct has_execution_space { using type = void ; enum { value = false }; };
+
+  template< typename T >
+  struct has_execution_space
+    < T , typename std::is_same< typename T::execution_space , void >::type >
+  {
+    using type = typename T::execution_space ;
+    enum { value = true };
+  };
+
+  using policy_has_space  = has_execution_space< Policy > ;
+  using functor_has_space = has_execution_space< Functor > ;
+
+  static_assert( ! policy_has_space::value ||
+                 ! functor_has_space::value ||
+                 std::is_same< typename policy_has_space::type
+                             , typename functor_has_space::type >::value
+               , "Execution Policy and Functor execution space must match" );
+
+  //----------------------------------------
+  // Check for Functor::value_type, which is either a simple type T or T[]
+
+  template< typename F , typename = std::false_type >
+  struct has_value_type { using type = void ; };
+
+  template< typename F >
+  struct has_value_type
+    < F , typename std::is_same< typename F::value_type , void >::type >
+  {
+    using type = typename F::value_type ;
+
+    static_assert( ! std::is_reference< type >::value &&
+                   std::rank< type >::value <= 1 &&
+                   std::extent< type >::value == 0
+                 , "Kokkos Functor::value_type is T or T[]" );
+  };
+
+  //----------------------------------------
+  // If Functor::value_type does not exist then evaluate operator(),
+  // depending upon the pattern and whether the policy has a work tag,
+  // to determine the reduction or scan value_type.
+
+  template< typename F
+          , typename P = PatternInterface
+          , typename V = typename has_value_type<F>::type
+          , bool     T = std::is_same< Tag , void >::value
+          >
+  struct deduce_value_type { using type = V ; };
+
+  template< typename F >
+  struct deduce_value_type< F , REDUCE , void , true > {
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( M , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( M , M , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( M , M , M , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( M , M , M , M , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( M , M , M , M , M , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( M , M , M , M , M , M , M , M , A & ) const );
+
+    using type = decltype( deduce( & F::operator() ) );
+  };
+
+  template< typename F >
+  struct deduce_value_type< F , REDUCE , void , false > {
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag , M , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag , M , M , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag , M , M , M , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag , M , M , M , M , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag , M , M , M , M , M , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag , M , M , M , M , M , M , M , M , A & ) const );
+
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag const & , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag const & , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag const & , M , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag const & , M , M , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag const & , M , M , M , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag const & , M , M , M , M , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag const & , M , M , M , M , M , M , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag const & , M , M , M , M , M , M , M , M , A & ) const );
+
+    using type = decltype( deduce( & F::operator() ) );
+  };
+
+  template< typename F >
+  struct deduce_value_type< F , SCAN , void , true > {
+
+    template< typename M , typename A , typename I >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( M , A & , I ) const );
+
+    using type = decltype( deduce( & F::operator() ) );
+  };
+
+  template< typename F >
+  struct deduce_value_type< F , SCAN , void , false > {
+
+    template< typename M , typename A , typename I >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag , M , A & , I ) const );
+
+    template< typename M , typename A , typename I >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag const & , M , A & , I ) const );
+
+    using type = decltype( deduce( & F::operator() ) );
+  };
+
+  //----------------------------------------
+
+  using candidate_type = typename deduce_value_type< Functor >::type ;
+
+  enum { candidate_is_void  = std::is_same< candidate_type , void >::value
+       , candidate_is_array = std::rank< candidate_type >::value == 1 };
+
+  //----------------------------------------
+
+public:
+
+  using execution_space = typename std::conditional
+    < functor_has_space::value
+    , typename functor_has_space::type
+    , typename std::conditional
+      < policy_has_space::value
+      , typename policy_has_space::type
+      , Kokkos::DefaultExecutionSpace
+      >::type
+    >::type ;
+
+  using value_type = typename std::remove_extent< candidate_type >::type ;
+
+  static_assert( ! std::is_const< value_type >::value
+               , "Kokkos functor operator reduce argument cannot be const" );
+
+private:
+
+  // Stub to avoid defining a type 'void &'
+  using ValueType = typename
+    std::conditional< candidate_is_void , VOID , value_type >::type ;
+
+public:
+
+  using pointer_type = typename
+    std::conditional< candidate_is_void , void , ValueType * >::type ;
+
+  using reference_type = typename
+    std::conditional< candidate_is_array  , ValueType * , typename
+    std::conditional< ! candidate_is_void , ValueType & , void >
+    ::type >::type ;
+
+private:
+
+  template< bool IsArray , class FF >
+  KOKKOS_INLINE_FUNCTION static constexpr
+  typename std::enable_if< IsArray , unsigned >::type
+  get_length( FF const & f ) { return f.value_count ; }
+
+  template< bool IsArray , class FF >
+  KOKKOS_INLINE_FUNCTION static constexpr
+  typename std::enable_if< ! IsArray , unsigned >::type
+  get_length( FF const & ) { return candidate_is_void ? 0 : 1 ; }
+
+public:
+
+  enum { StaticValueSize = ! candidate_is_void &&
+                           ! candidate_is_array
+                         ? sizeof(ValueType) : 0 };
+
+  KOKKOS_FORCEINLINE_FUNCTION static constexpr
+  unsigned value_count( const Functor & f )
+    { return FunctorAnalysis::template get_length< candidate_is_array >(f); }
+
+  KOKKOS_FORCEINLINE_FUNCTION static constexpr
+  unsigned value_size( const Functor & f )
+    { return FunctorAnalysis::template get_length< candidate_is_array >(f) * sizeof(ValueType); }
+
+  //----------------------------------------
+
+  template< class Unknown >
+  KOKKOS_FORCEINLINE_FUNCTION static constexpr
+  unsigned value_count( const Unknown & )
+    { return candidate_is_void ? 0 : 1  ; }
+
+  template< class Unknown >
+  KOKKOS_FORCEINLINE_FUNCTION static constexpr
+  unsigned value_size( const Unknown & )
+    { return candidate_is_void ? 0 : sizeof(ValueType); }
+
+private:
+
+  enum INTERFACE : int
+    { DISABLE           = 0
+    , NO_TAG_NOT_ARRAY  = 1
+    , NO_TAG_IS_ARRAY   = 2
+    , HAS_TAG_NOT_ARRAY = 3
+    , HAS_TAG_IS_ARRAY  = 4
+    , DEDUCED =
+       ! std::is_same< PatternInterface , REDUCE >::value ? DISABLE : (
+       std::is_same<Tag,void>::value
+         ? (candidate_is_array ? NO_TAG_IS_ARRAY  : NO_TAG_NOT_ARRAY)
+         : (candidate_is_array ? HAS_TAG_IS_ARRAY : HAS_TAG_NOT_ARRAY) )
+    };
+
+  //----------------------------------------
+  // parallel_reduce join operator
+
+  template< class F , INTERFACE >
+  struct has_join_function ;
+
+  template< class F >
+  struct has_join_function< F , NO_TAG_NOT_ARRAY >
+    {
+      typedef volatile       ValueType & vref_type ;
+      typedef volatile const ValueType & cvref_type ;
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( vref_type , cvref_type ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const * const f
+               , ValueType volatile * dst
+               , ValueType volatile const * src )
+        { f->join( *dst , *src ); }
+    };
+
+  template< class F >
+  struct has_join_function< F , NO_TAG_IS_ARRAY >
+    {
+      typedef volatile       ValueType * vref_type ;
+      typedef volatile const ValueType * cvref_type ;
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( vref_type , cvref_type ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const * const f
+               , ValueType volatile * dst
+               , ValueType volatile const * src )
+        { f->join( dst , src ); }
+    };
+
+  template< class F >
+  struct has_join_function< F , HAS_TAG_NOT_ARRAY >
+    {
+      typedef volatile       ValueType & vref_type ;
+      typedef volatile const ValueType & cvref_type ;
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , vref_type , cvref_type ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const * const f
+               , ValueType volatile * dst
+               , ValueType volatile const * src )
+        { f->join( WTag() , *dst , *src ); }
+    };
+
+  template< class F >
+  struct has_join_function< F , HAS_TAG_IS_ARRAY >
+    {
+      typedef volatile       ValueType * vref_type ;
+      typedef volatile const ValueType * cvref_type ;
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , vref_type , cvref_type ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const * const f
+               , ValueType volatile * dst
+               , ValueType volatile const * src )
+        { f->join( WTag() , dst , src ); }
+    };
+
+
+  template< class F   = Functor
+          , INTERFACE = DEDUCED
+          , typename  = void >
+  struct DeduceJoin
+    {
+      enum { value = false };
+
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const * const f
+               , ValueType volatile * dst
+               , ValueType volatile const * src )
+       {
+         const int n = FunctorAnalysis::value_count( *f );
+         for ( int i = 0 ; i < n ; ++i ) dst[i] += src[i];
+       }
+    };
+
+  template< class F >
+  struct DeduceJoin< F , DISABLE , void >
+    {
+      enum { value = false };
+
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const * const
+               , ValueType volatile *
+               , ValueType volatile const * ) {}
+    };
+
+  template< class F , INTERFACE I >
+  struct DeduceJoin< F , I ,
+    decltype( has_join_function<F,I>::enable_if( & F::join ) ) >
+    : public has_join_function<F,I>
+    { enum { value = true }; };
+
+  //----------------------------------------
+
+  template< class , INTERFACE >
+  struct has_init_function ;
+
+  template< class F >
+  struct has_init_function< F , NO_TAG_NOT_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( ValueType & ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const * const f , ValueType * dst )
+        { f->init( *dst ); }
+    };
+
+  template< class F >
+  struct has_init_function< F , NO_TAG_IS_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( ValueType * ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const * const f , ValueType * dst )
+        { f->init( dst ); }
+    };
+
+  template< class F >
+  struct has_init_function< F , HAS_TAG_NOT_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , ValueType & ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , ValueType & ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const * const f , ValueType * dst )
+        { f->init( WTag(), *dst ); }
+    };
+
+  template< class F >
+  struct has_init_function< F , HAS_TAG_IS_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , ValueType * ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , ValueType * ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const * const f , ValueType * dst )
+        { f->init( WTag(), dst ); }
+    };
+
+  template< class F   = Functor
+          , INTERFACE = DEDUCED
+          , typename  = void >
+  struct DeduceInit
+    {
+      enum { value = false };
+
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const * const , ValueType * dst ) { new(dst) ValueType(); }
+    };
+
+  template< class F >
+  struct DeduceInit< F , DISABLE , void >
+    {
+      enum { value = false };
+
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const * const , ValueType * ) {}
+    };
+
+  template< class F , INTERFACE I >
+  struct DeduceInit< F , I ,
+    decltype( has_init_function<F,I>::enable_if( & F::init ) ) >
+    : public has_init_function<F,I>
+    { enum { value = true }; };
+
+  //----------------------------------------
+
+  template< class , INTERFACE >
+  struct has_final_function ;
+
+  // No tag, not array
+  template< class F >
+  struct has_final_function< F , NO_TAG_NOT_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( ValueType & ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void final( F const * const f , ValueType * dst )
+        { f->final( *dst ); }
+    };
+
+  // No tag, is array
+  template< class F >
+  struct has_final_function< F , NO_TAG_IS_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( ValueType * ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void final( F const * const f , ValueType * dst )
+        { f->final( dst ); }
+    };
+
+  // Has tag, not array
+  template< class F >
+  struct has_final_function< F , HAS_TAG_NOT_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , ValueType & ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , ValueType & ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void final( F const * const f , ValueType * dst )
+        { f->final( WTag(), *dst ); }
+    };
+
+  // Has tag, is array
+  template< class F >
+  struct has_final_function< F , HAS_TAG_IS_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , ValueType * ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , ValueType * ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void final( F const * const f , ValueType * dst )
+        { f->final( WTag(), dst ); }
+    };
+
+  template< class F   = Functor
+          , INTERFACE = DEDUCED
+          , typename  = void >
+  struct DeduceFinal
+    {
+      enum { value = false };
+
+      KOKKOS_INLINE_FUNCTION
+      static void final( F const * const , ValueType * ) {}
+    };
+
+  template< class F , INTERFACE I >
+  struct DeduceFinal< F , I ,
+    decltype( has_final_function<F,I>::enable_if( & F::final ) ) >
+    : public has_final_function<F,I>
+    { enum { value = true }; };
+
+  //----------------------------------------
+
+  template< class F = Functor , typename = void >
+  struct DeduceTeamShmem
+    {
+      enum { value = false };
+
+      static size_t team_shmem_size( F const & , int ) { return 0 ; }
+    };
+
+  template< class F >
+  struct DeduceTeamShmem< F , typename std::enable_if< 0 < sizeof( & F::team_shmem_size ) >::type >
+    {
+      enum { value = true };
+
+      static size_t team_shmem_size( F const * const f , int team_size )
+        { return f->team_shmem_size( team_size ); }
+    };
+
+  template< class F >
+  struct DeduceTeamShmem< F , typename std::enable_if< 0 < sizeof( & F::shmem_size ) >::type >
+    {
+      enum { value = true };
+
+      static size_t team_shmem_size( F const * const f , int team_size )
+        { return f->shmem_size( team_size ); }
+    };
+
+  //----------------------------------------
+
+public:
+
+  inline static
+  size_t team_shmem_size( Functor const & f )
+    { return DeduceTeamShmem<>::team_shmem_size( f ); }
+
+  //----------------------------------------
+
+  enum { has_join_member_function  = DeduceJoin<>::value };
+  enum { has_init_member_function  = DeduceInit<>::value };
+  enum { has_final_member_function = DeduceFinal<>::value };
+
+
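+  // Illustrative sketch (hypothetical objects 'f', 'result', 'partial'):
+  // the Reducer defined below adapts the analyzed functor to the reducer
+  // interface, e.g.
+  //   Reducer<> reducer( & f , & result );
+  //   reducer.init( & result );             // functor's init() or default construction
+  //   reducer.join( & result , & partial ); // functor's join() or element-wise +=
+  //   reducer.final( & result );            // functor's final() if provided, else a no-op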
+  template< class MemorySpace = typename execution_space::memory_space >
+  struct Reducer
+  {
+  private:
+
+    Functor const * const m_functor ;
+    ValueType     * const m_result ;
+
+    template< bool IsArray >
+    KOKKOS_INLINE_FUNCTION constexpr
+    typename std::enable_if< IsArray , FunctorAnalysis::ValueType * >::type
+    ref() const noexcept { return m_result ; }
+
+    template< bool IsArray >
+    KOKKOS_INLINE_FUNCTION constexpr
+    typename std::enable_if< ! IsArray , FunctorAnalysis::ValueType & >::type
+    ref() const noexcept { return *m_result ; }
+
+    template< bool IsArray >
+    KOKKOS_INLINE_FUNCTION constexpr
+    typename std::enable_if< IsArray , int >::type
+    len() const noexcept { return m_functor->value_count ; }
+
+    template< bool IsArray >
+    KOKKOS_INLINE_FUNCTION constexpr
+    typename std::enable_if< ! IsArray , int >::type
+    len() const noexcept { return candidate_is_void ? 0 : 1 ; }
+
+  public:
+
+    using reducer        = Reducer ;
+    using value_type     = FunctorAnalysis::value_type ;
+    using memory_space   = MemorySpace ;
+    using reference_type = FunctorAnalysis::reference_type ;
+    using functor_type   = Functor ; // Adapts a functor
+
+    KOKKOS_INLINE_FUNCTION constexpr
+    value_type * data() const noexcept { return m_result ; }
+
+    KOKKOS_INLINE_FUNCTION constexpr
+    reference_type reference() const noexcept
+      { return Reducer::template ref< candidate_is_array >(); }
+
+    KOKKOS_INLINE_FUNCTION constexpr
+    int length() const noexcept
+      { return Reducer::template len< candidate_is_array >(); }
+
+    KOKKOS_INLINE_FUNCTION
+    void copy( ValueType * const dst
+             , ValueType const * const src ) const noexcept
+      { for ( int i = 0 ; i < Reducer::template len< candidate_is_array >() ; ++i ) dst[i] = src[i] ; }
+
+    KOKKOS_INLINE_FUNCTION
+    void join( ValueType volatile * dst
+             , ValueType volatile const * src ) const noexcept
+      { DeduceJoin<>::join( m_functor , dst , src ); }
+
+    KOKKOS_INLINE_FUNCTION 
+    void init( ValueType * dst ) const noexcept
+      { DeduceInit<>::init( m_functor , dst ); }
+
+    KOKKOS_INLINE_FUNCTION
+    void final( ValueType * dst ) const noexcept
+      { DeduceFinal<>::final( m_functor , dst ); }
+
+    Reducer( Reducer const & ) = default ;
+    Reducer( Reducer && ) = default ;
+    Reducer & operator = ( Reducer const & ) = delete ;
+    Reducer & operator = ( Reducer && ) = delete ;
+
+    template< class S >
+    using rebind = Reducer< S > ;
+
+    KOKKOS_INLINE_FUNCTION explicit constexpr
+    Reducer( Functor const * arg_functor = 0
+           , ValueType * arg_value = 0 ) noexcept
+      : m_functor(arg_functor), m_result(arg_value) {}
+  };
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_FUNCTORANALYSIS_HPP */
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..160ac5a851dcaebe8c0f53d5015ee613da0a3ba9
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
@@ -0,0 +1,339 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+#include <Kokkos_Macros.hpp>
+
+#include <cstddef>
+#include <cstdlib>
+#include <cstdint>
+#include <cstring>
+
+#include <iostream>
+#include <sstream>
+#include <cstring>
+#include <algorithm>
+
+#include <Kokkos_HBWSpace.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <Kokkos_Atomic.hpp>
+#ifdef KOKKOS_ENABLE_HBWSPACE
+#include <memkind.h>
+#endif
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+#ifdef KOKKOS_ENABLE_HBWSPACE
+#define MEMKIND_TYPE MEMKIND_HBW //hbw_get_kind(HBW_PAGESIZE_4KB)
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Experimental {
+
+/* Default allocation mechanism */
+HBWSpace::HBWSpace()
+  : m_alloc_mech(
+     HBWSpace::STD_MALLOC
+    )
+{
+printf("Init\n");
+setenv("MEMKIND_HBW_NODES", "1", 0);
+}
+
+/* Default allocation mechanism */
+HBWSpace::HBWSpace( const HBWSpace::AllocationMechanism & arg_alloc_mech )
+  : m_alloc_mech( HBWSpace::STD_MALLOC )
+{
+printf("Init2\n");
+setenv("MEMKIND_HBW_NODES", "1", 0);
+  if ( arg_alloc_mech == STD_MALLOC ) {
+    m_alloc_mech = HBWSpace::STD_MALLOC ;
+  }
+}
+
+void * HBWSpace::allocate( const size_t arg_alloc_size ) const
+{
+  static_assert( sizeof(void*) == sizeof(uintptr_t)
+               , "Error sizeof(void*) != sizeof(uintptr_t)" );
+
+  static_assert( Kokkos::Impl::power_of_two< Kokkos::Impl::MEMORY_ALIGNMENT >::value
+               , "Memory alignment must be power of two" );
+
+  constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT ;
+  constexpr uintptr_t alignment_mask = alignment - 1 ;
+
+  void * ptr = 0 ;
+
+  if ( arg_alloc_size ) {
+
+    if ( m_alloc_mech == STD_MALLOC ) {
+      // Over-allocate and round up to guarantee proper alignment.
+      size_t size_padded = arg_alloc_size + sizeof(void*) + alignment ;
+
+      void * alloc_ptr = memkind_malloc(MEMKIND_TYPE, size_padded );
+
+      if (alloc_ptr) {
+        uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
+
+        // offset enough to record the alloc_ptr
+        address += sizeof(void *);
+        uintptr_t rem = address % alignment;
+        uintptr_t offset = rem ? (alignment - rem) : 0u;
+        address += offset;
+        ptr = reinterpret_cast<void *>(address);
+        // record the alloc'd pointer
+        address -= sizeof(void *);
+        *reinterpret_cast<void **>(address) = alloc_ptr;
+      }
+    }
+  }
+
+  if ( ( ptr == 0 ) || ( reinterpret_cast<uintptr_t>(ptr) == ~uintptr_t(0) )
+       || ( reinterpret_cast<uintptr_t>(ptr) & alignment_mask ) ) {
+    std::ostringstream msg ;
+    msg << "Kokkos::Experimental::HBWSpace::allocate[ " ;
+    switch( m_alloc_mech ) {
+    case STD_MALLOC: msg << "STD_MALLOC" ; break ;
+    }
+    msg << " ]( " << arg_alloc_size << " ) FAILED" ;
+    if ( ptr == NULL ) { msg << " NULL" ; }
+    else { msg << " NOT ALIGNED " << ptr ; }
+
+    std::cerr << msg.str() << std::endl ;
+    std::cerr.flush();
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  return ptr;
+}
+
+
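+// allocate() stored the original memkind pointer in the sizeof(void*) bytes
+// immediately preceding the aligned address it returned; deallocate() reads
+// that pointer back and hands it to memkind_free().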
+void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
+{
+  if ( arg_alloc_ptr ) {
+
+    if ( m_alloc_mech == STD_MALLOC ) {
+      void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1);
+      memkind_free(MEMKIND_TYPE, alloc_ptr );
+    }
+
+  }
+}
+
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::s_root_record ;
+
+void
+SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  delete static_cast<SharedAllocationRecord*>(arg_rec);
+}
+
+SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
+~SharedAllocationRecord()
+{
+  #if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::deallocateData(
+      Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::HBWSpace::name()),RecordBase::m_alloc_ptr->m_label,
+      data(),size());
+  }
+  #endif
+
+  m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
+                    , SharedAllocationRecord< void , void >::m_alloc_size
+                    );
+}
+
+SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
+SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
+                      , const std::string       & arg_label
+                      , const size_t              arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_space( arg_space )
+{
+  #if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
+  }
+  #endif
+
+  // Fill in the Header information
+  RecordBase::m_alloc_ptr->m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
+
+  strncpy( RecordBase::m_alloc_ptr->m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+}
+
+//----------------------------------------------------------------------------
+
+void * SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
+allocate_tracked( const Kokkos::Experimental::HBWSpace & arg_space
+                , const std::string & arg_alloc_label
+                , const size_t arg_alloc_size )
+{
+  if ( ! arg_alloc_size ) return (void *) 0 ;
+
+  SharedAllocationRecord * const r =
+    allocate( arg_space , arg_alloc_label , arg_alloc_size );
+
+  RecordBase::increment( r );
+
+  return r->data();
+}
+
+void SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
+deallocate_tracked( void * const arg_alloc_ptr )
+{
+  if ( arg_alloc_ptr != 0 ) {
+    SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
+
+    RecordBase::decrement( r );
+  }
+}
+
+void * SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
+reallocate_tracked( void * const arg_alloc_ptr
+                  , const size_t arg_alloc_size )
+{
+  SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
+  SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
+
+  Kokkos::Impl::DeepCopy<Kokkos::Experimental::HBWSpace,Kokkos::Experimental::HBWSpace>( r_new->data() , r_old->data()
+                                             , std::min( r_old->size() , r_new->size() ) );
+
+  RecordBase::increment( r_new );
+  RecordBase::decrement( r_old );
+
+  return r_new->data();
+}
+
+SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > *
+SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::get_record( void * alloc_ptr )
+{
+  typedef SharedAllocationHeader  Header ;
+  typedef SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >  RecordHost ;
+
+  SharedAllocationHeader const * const head   = alloc_ptr ? Header::get_header( alloc_ptr ) : (SharedAllocationHeader *)0 ;
+  RecordHost                   * const record = head ? static_cast< RecordHost * >( head->m_record ) : (RecordHost *) 0 ;
+
+  if ( ! alloc_ptr || record->m_alloc_ptr != head ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::get_record ERROR" ) );
+  }
+
+  return record ;
+}
+
+// Iterate records to print orphaned memory ...
+void SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
+print_records( std::ostream & s , const Kokkos::Experimental::HBWSpace & space , bool detail )
+{
+  SharedAllocationRecord< void , void >::print_host_accessible_records( s , "HBWSpace" , & s_root_record , detail );
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Experimental {
+namespace {
+  const unsigned HBW_SPACE_ATOMIC_MASK = 0xFFFF;
+  const unsigned HBW_SPACE_ATOMIC_XOR_MASK = 0x5A39;
+  static int HBW_SPACE_ATOMIC_LOCKS[HBW_SPACE_ATOMIC_MASK+1];
+}
+
+namespace Impl {
+void init_lock_array_hbw_space() {
+  static int is_initialized = 0;
+  if(! is_initialized) {
+    for(int i = 0; i < static_cast<int> (HBW_SPACE_ATOMIC_MASK+1); i++)
+      HBW_SPACE_ATOMIC_LOCKS[i] = 0;
+    is_initialized = 1; // mark the lock array as initialized so it is zeroed only once
+  }
+}
+
+bool lock_address_hbw_space(void* ptr) {
+  return 0 == atomic_compare_exchange( &HBW_SPACE_ATOMIC_LOCKS[
+      (( size_t(ptr) >> 2 ) & HBW_SPACE_ATOMIC_MASK) ^ HBW_SPACE_ATOMIC_XOR_MASK] ,
+                                  0 , 1);
+}
+
+void unlock_address_hbw_space(void* ptr) {
+   atomic_exchange( &HBW_SPACE_ATOMIC_LOCKS[
+      (( size_t(ptr) >> 2 ) & HBW_SPACE_ATOMIC_MASK) ^ HBW_SPACE_ATOMIC_XOR_MASK] ,
+                    0);
+}
+
+}
+}
+}
+#endif
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostBarrier.cpp b/packages/kokkos/core/src/impl/Kokkos_HostBarrier.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..987da1c6b38e0c799edd1526dee5ac192f4435ed
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_HostBarrier.cpp
@@ -0,0 +1,204 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Atomic.hpp>
+
+#include <impl/Kokkos_HostBarrier.hpp>
+#include <impl/Kokkos_Spinwait.hpp>
+
+#include <chrono>
+
+namespace Kokkos { namespace Impl {
+
+namespace {
+
+inline constexpr int length64( const int nthreads ) noexcept
+{
+  return (nthreads-1 + sizeof(uint64_t)-1) / sizeof(uint64_t);
+}
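+// For example: with nthreads == 9 there are 8 flag bytes (one per non-root
+// thread), so length64(9) == (8 + 7) / 8 == 1 word; with nthreads == 10 there
+// are 9 flag bytes, so length64(10) == (9 + 7) / 8 == 2 words.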
+
+} // namespace
+
+void rendezvous_initialize( volatile void * buffer
+                          , const int size
+                          , const int rank
+                          ) noexcept
+{
+  Kokkos::store_fence();
+
+  // ensure that the buffer has been zeroed out
+  constexpr uint8_t  zero8  = static_cast<uint8_t>(0);
+  constexpr uint64_t zero64 = static_cast<uint64_t>(0);
+
+  volatile uint64_t * header = reinterpret_cast<volatile uint64_t *>(buffer);
+
+  if (rank > 0) {
+    volatile uint8_t * bytes = reinterpret_cast<volatile uint8_t *>(buffer) + RENDEZVOUS_HEADER;
+
+    bytes[rank-1] = zero8;
+
+    // last thread is responsible for zeroing out the final bytes of the last uint64_t
+    if (rank == size-1) {
+      const int tmp  = (size-1) % sizeof(uint64_t);
+      const int rem = tmp ? sizeof(uint64_t) - tmp : 0;
+      for (int i=0; i<rem; ++i) {
+        bytes[rank+i] = zero8;
+      }
+    }
+
+    spinwait_until_equal( *header, zero64 );
+  }
+  else {
+
+    const int n = length64(size);
+    volatile uint64_t * buff = reinterpret_cast<volatile uint64_t *>(buffer) + RENDEZVOUS_HEADER/sizeof(uint64_t);
+
+    // wait for other threads to finish initializing
+    for (int i=0; i<n; ++i) {
+      root_spinwait_until_equal( buff[i], zero64 );
+    }
+
+    // release the waiting threads
+    *header = zero64;
+    Kokkos::store_fence();
+  }
+  Kokkos::load_fence();
+}
+
+bool rendezvous( volatile void * buffer
+               , uint64_t &      step
+               , const int       size
+               , const int       rank
+               , bool            active_wait
+               ) noexcept
+{
+  // Force all outstanding stores from this thread to retire before continuing
+  Kokkos::store_fence();
+
+  // guarantees that we will never spinwait on a spin_value of 0
+  step = static_cast<uint8_t>(step + 1u)
+         ? step + 1u
+         : step + 2u
+         ;
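+  // For example: if step was 254 the low byte of step + 1 is 255 (non-zero),
+  // so step becomes 255; if step was 255 the low byte of step + 1 would wrap
+  // to 0, so step skips ahead to 257 and the derived byte_value is never 0.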
+
+  // if size == 1, it is incorrect for rank 0 to check the tail value of the buffer
+  // this optimization prevents a potential read of uninitialized memory
+  if ( size == 1 ) { return true; }
+
+  const uint8_t byte_value  = static_cast<uint8_t>(step);
+
+  // byte that is set in the spin_value rotates every time
+  // this prevents threads from overtaking the master thread
+  const uint64_t spin_value = static_cast<uint64_t>(byte_value) << (byte_value&7);
+
+  if ( rank > 0 ) {
+    volatile uint64_t * header = reinterpret_cast<volatile uint64_t *>(buffer);
+    volatile uint8_t *  bytes  = reinterpret_cast<volatile uint8_t *>(buffer) + RENDEZVOUS_HEADER;
+
+    bytes[ rank-1 ] = byte_value;
+
+    if ( active_wait ) {
+      spinwait_until_equal( *header, spin_value );
+    }
+    else {
+      yield_until_equal( *header, spin_value );
+    }
+  }
+  else { // rank 0
+    volatile uint64_t * buff = reinterpret_cast<volatile uint64_t *>(buffer) + RENDEZVOUS_HEADER/sizeof(uint64_t);
+    const int n = length64(size);
+
+    uint64_t comp = byte_value;
+    comp = comp | (comp << 8);
+    comp = comp | (comp << 16);
+    comp = comp | (comp << 32);
+
+    const int rem  = (size-1) % sizeof(uint64_t);
+
+    union {
+      volatile uint64_t value;
+      volatile uint8_t  array[sizeof(uint64_t)];
+    } tmp{};
+
+    for (int i=0; i<rem; ++i) {
+      tmp.array[i] = byte_value;
+    }
+
+    const uint64_t tail = rem ? tmp.value : comp;
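+
+    // For example, on a little-endian host with byte_value == 0x03 and
+    // size == 12: there are 11 flag bytes, so n == 2, rem == 3,
+    // comp == 0x0303030303030303 and tail == 0x0000000000030303; the first
+    // word must match comp and the final, partially filled word must match tail.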
+
+    for (int i=0; i<n-1; ++i) {
+      root_spinwait_until_equal( buff[i], comp );
+    }
+    root_spinwait_until_equal( buff[n-1], tail );
+
+  }
+
+  // Force all outstanding stores from other threads to retire before allowing
+  // this thread to continue.  This forces correctness on systems with out-of-order
+  // memory (Power and ARM)
+  Kokkos::load_fence();
+
+  return rank == 0;
+}
+
+void rendezvous_release( volatile void * buffer
+                       , const uint64_t  step
+                       ) noexcept
+{
+  const uint8_t       byte_value = static_cast<uint8_t>(step);
+  const uint64_t      spin_value = static_cast<uint64_t>(byte_value) << (byte_value&7);
+  volatile uint64_t * header     = reinterpret_cast<volatile uint64_t *>(buffer);
+
+  // Force all outstanding stores from this thread to retire before releasing
+  // the other threads.  This forces correctness on systems with out-of-order
+  // memory (Power and ARM)
+  Kokkos::store_fence();
+
+  *header = spin_value;
+
+  Kokkos::memory_fence();
+}
+
+}} // namespace Kokkos::Impl
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostBarrier.hpp b/packages/kokkos/core/src/impl/Kokkos_HostBarrier.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ad527e5aee42227b496ae79980b3dfc30d0bacf6
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_HostBarrier.hpp
@@ -0,0 +1,146 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HOST_BARRIER_HPP
+#define KOKKOS_HOST_BARRIER_HPP
+
+#include <cstddef>
+#include <cstdint>
+
+namespace Kokkos { namespace Impl {
+
+//------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+
+enum : int { RENDEZVOUS_ALIGNMENT = 128
+           , RENDEZVOUS_HEADER    = RENDEZVOUS_ALIGNMENT
+           };
+
+inline constexpr int rendezvous_buffer_size( const int nthreads ) noexcept
+{
+  return RENDEZVOUS_HEADER + ((nthreads-1 + RENDEZVOUS_ALIGNMENT-1) / RENDEZVOUS_ALIGNMENT) * RENDEZVOUS_ALIGNMENT;
+}
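+// For example: with RENDEZVOUS_ALIGNMENT == 128, rendezvous_buffer_size(5)
+// == 128 + 128 == 256 bytes (the header plus one aligned block holding the
+// 4 flag bytes), while rendezvous_buffer_size(130) == 128 + 256 == 384 bytes.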
+
+void rendezvous_initialize( volatile void * buffer
+                          , const int       size
+                          , const int       rank
+                          ) noexcept;
+
+
+bool rendezvous( volatile void * buffer
+               , uint64_t &      step
+               , const int       size
+               , const int       rank
+               , bool            active_wait = true
+               ) noexcept;
+
+void rendezvous_release( volatile void * buffer
+                       , const uint64_t  step
+                       ) noexcept;
+
+
+//------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+
+
+class HostBarrier
+{
+public:
+
+  enum : int { ALIGNMENT = RENDEZVOUS_ALIGNMENT };
+  enum : int { HEADER    = ALIGNMENT};
+
+  enum Policy : int { ACTIVE, PASSIVE };
+
+  inline static constexpr int buffer_size( const int nthreads ) noexcept
+  {
+    return rendezvous_buffer_size(nthreads);
+  }
+
+  HostBarrier( volatile void * arg_buffer
+             , int             arg_size
+             , int             arg_rank
+             , Policy          arg_policy
+             ) noexcept
+    : m_buffer{arg_buffer}
+    , m_size{arg_size}
+    , m_rank{arg_rank}
+    , m_policy{arg_policy}
+    , m_step{0}
+  {
+    rendezvous_initialize( m_buffer, m_size, m_rank );
+  }
+
+  bool rendezvous() const noexcept
+  {
+    return Kokkos::Impl::rendezvous( m_buffer
+                                   , m_step
+                                   , m_size
+                                   , m_rank
+                                   , m_policy == ACTIVE
+                                   );
+  }
+
+  void rendezvous_release() const noexcept
+  {
+    Kokkos::Impl::rendezvous_release( m_buffer, m_step );
+  }
+
+private:
+  volatile void *   m_buffer ;
+  const int         m_size   ;
+  const int         m_rank   ;
+  const Policy      m_policy ;
+  mutable uint64_t  m_step   ;
+
+private:
+  HostBarrier( const HostBarrier &  )             = delete;
+  HostBarrier(       HostBarrier && )             = delete;
+  HostBarrier & operator=( const HostBarrier &  ) = delete;
+  HostBarrier & operator=(       HostBarrier && ) = delete;
+};
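+
+// Illustrative usage (a sketch only; 'buffer', 'N' and 'my_rank' below are
+// placeholder names): each of the N participating threads constructs a
+// HostBarrier over the same shared buffer of HostBarrier::buffer_size(N)
+// bytes and then rendezvous as follows:
+//
+//   HostBarrier bar( buffer , N , my_rank , HostBarrier::ACTIVE );
+//   if ( bar.rendezvous() ) {        // true only on the root (rank 0) thread
+//     /* ... root-only work ... */
+//     bar.rendezvous_release();      // release the waiting threads
+//   }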
+
+}} // namespace Kokkos::Impl
+
+#endif // KOKKOS_HOST_BARRIER_HPP
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..96bced0bb780bc9f861ca9117f1b37a418a7a13b
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp
@@ -0,0 +1,490 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <algorithm>
+#include <Kokkos_Macros.hpp>
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+#if defined( __INTEL_COMPILER ) && ! defined ( KOKKOS_ENABLE_CUDA )
+
+// Intel specialized allocator does not interoperate with CUDA memory allocation
+
+#define KOKKOS_ENABLE_INTEL_MM_ALLOC
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+#if defined(KOKKOS_ENABLE_POSIX_MEMALIGN)
+
+#include <unistd.h>
+#include <sys/mman.h>
+
+/* mmap flags for private anonymous memory allocation */
+
+#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE )
+  #define KOKKOS_IMPL_POSIX_MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
+#elif defined( MAP_ANON ) && defined( MAP_PRIVATE )
+  #define KOKKOS_IMPL_POSIX_MMAP_FLAGS (MAP_PRIVATE | MAP_ANON)
+#endif
+
+// mmap flags for huge page tables
+// the CUDA driver does not interoperate with MAP_HUGETLB
+#if defined( KOKKOS_IMPL_POSIX_MMAP_FLAGS )
+  #if defined( MAP_HUGETLB ) && ! defined( KOKKOS_ENABLE_CUDA )
+    #define KOKKOS_IMPL_POSIX_MMAP_FLAGS_HUGE (KOKKOS_IMPL_POSIX_MMAP_FLAGS | MAP_HUGETLB )
+  #else
+    #define KOKKOS_IMPL_POSIX_MMAP_FLAGS_HUGE KOKKOS_IMPL_POSIX_MMAP_FLAGS
+  #endif
+#endif
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+#include <cstddef>
+#include <cstdlib>
+#include <cstdint>
+#include <cstring>
+
+#include <iostream>
+#include <sstream>
+#include <cstring>
+
+#include <Kokkos_HostSpace.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <Kokkos_Atomic.hpp>
+
+#if ( defined( KOKKOS_ENABLE_ASM ) || defined ( KOKKOS_ENABLE_TM ) ) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
+#include <immintrin.h>
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/* Default allocation mechanism */
+HostSpace::HostSpace()
+  : m_alloc_mech(
+#if defined( KOKKOS_ENABLE_INTEL_MM_ALLOC )
+      HostSpace::INTEL_MM_ALLOC
+#elif defined( KOKKOS_IMPL_POSIX_MMAP_FLAGS )
+      HostSpace::POSIX_MMAP
+#elif defined( KOKKOS_ENABLE_POSIX_MEMALIGN )
+      HostSpace::POSIX_MEMALIGN
+#else
+      HostSpace::STD_MALLOC
+#endif
+    )
+{}
+
+/* Default allocation mechanism */
+HostSpace::HostSpace( const HostSpace::AllocationMechanism & arg_alloc_mech )
+  : m_alloc_mech( HostSpace::STD_MALLOC )
+{
+  if ( arg_alloc_mech == STD_MALLOC ) {
+    m_alloc_mech = HostSpace::STD_MALLOC ;
+  }
+#if defined( KOKKOS_ENABLE_INTEL_MM_ALLOC )
+  else if ( arg_alloc_mech == HostSpace::INTEL_MM_ALLOC ) {
+    m_alloc_mech = HostSpace::INTEL_MM_ALLOC ;
+  }
+#elif defined( KOKKOS_ENABLE_POSIX_MEMALIGN )
+  else if ( arg_alloc_mech == HostSpace::POSIX_MEMALIGN ) {
+    m_alloc_mech = HostSpace::POSIX_MEMALIGN ;
+  }
+#elif defined( KOKKOS_IMPL_POSIX_MMAP_FLAGS )
+  else if ( arg_alloc_mech == HostSpace::POSIX_MMAP ) {
+    m_alloc_mech = HostSpace::POSIX_MMAP ;
+  }
+#endif
+  else {
+    const char * const mech =
+      ( arg_alloc_mech == HostSpace::INTEL_MM_ALLOC ) ? "INTEL_MM_ALLOC" : (
+      ( arg_alloc_mech == HostSpace::POSIX_MEMALIGN ) ? "POSIX_MEMALIGN" : (
+      ( arg_alloc_mech == HostSpace::POSIX_MMAP     ) ? "POSIX_MMAP" : "" ));
+
+    std::string msg ;
+    msg.append("Kokkos::HostSpace ");
+    msg.append(mech);
+    msg.append(" is not available" );
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+}
+
+void * HostSpace::allocate( const size_t arg_alloc_size ) const
+{
+  static_assert( sizeof(void*) == sizeof(uintptr_t)
+               , "Error sizeof(void*) != sizeof(uintptr_t)" );
+
+  static_assert( Kokkos::Impl::is_integral_power_of_two( Kokkos::Impl::MEMORY_ALIGNMENT )
+               , "Memory alignment must be power of two" );
+
+  constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT ;
+  constexpr uintptr_t alignment_mask = alignment - 1 ;
+
+  void * ptr = 0 ;
+
+  if ( arg_alloc_size ) {
+
+    if ( m_alloc_mech == STD_MALLOC ) {
+      // Over-allocate and round up to guarantee proper alignment.
+      size_t size_padded = arg_alloc_size + sizeof(void*) + alignment ;
+
+      void * alloc_ptr = malloc( size_padded );
+
+      if (alloc_ptr) {
+        uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
+
+        // offset enough to record the alloc_ptr
+        address += sizeof(void *);
+        uintptr_t rem = address % alignment;
+        uintptr_t offset = rem ? (alignment - rem) : 0u;
+        address += offset;
+        ptr = reinterpret_cast<void *>(address);
+        // record the alloc'd pointer
+        address -= sizeof(void *);
+        *reinterpret_cast<void **>(address) = alloc_ptr;
+      }
+    }
+
+#if defined( KOKKOS_ENABLE_INTEL_MM_ALLOC )
+    else if ( m_alloc_mech == INTEL_MM_ALLOC ) {
+      ptr = _mm_malloc( arg_alloc_size , alignment );
+    }
+#endif
+
+#if defined( KOKKOS_ENABLE_POSIX_MEMALIGN )
+    else if ( m_alloc_mech == POSIX_MEMALIGN ) {
+      posix_memalign( & ptr, alignment , arg_alloc_size );
+    }
+#endif
+
+#if defined( KOKKOS_IMPL_POSIX_MMAP_FLAGS )
+    else if ( m_alloc_mech == POSIX_MMAP ) {
+      constexpr size_t use_huge_pages = (1u << 27);
+      constexpr int    prot  = PROT_READ | PROT_WRITE ;
+      const int flags = arg_alloc_size < use_huge_pages
+                      ? KOKKOS_IMPL_POSIX_MMAP_FLAGS
+                      : KOKKOS_IMPL_POSIX_MMAP_FLAGS_HUGE ;
+
+      // read write access to private memory
+
+      ptr = mmap( NULL /* address hint, if NULL OS kernel chooses address */
+                , arg_alloc_size /* size in bytes */
+                , prot           /* memory protection */
+                , flags          /* visibility of updates */
+                , -1             /* file descriptor */
+                ,  0             /* offset */
+                );
+
+/* Associated reallocation:
+       ptr = mremap( old_ptr , old_size , new_size , MREMAP_MAYMOVE );
+*/
+    }
+#endif
+  }
+
+  if ( ( ptr == 0 ) || ( reinterpret_cast<uintptr_t>(ptr) == ~uintptr_t(0) )
+       || ( reinterpret_cast<uintptr_t>(ptr) & alignment_mask ) ) {
+    std::ostringstream msg ;
+    msg << "Kokkos::HostSpace::allocate[ " ;
+    switch( m_alloc_mech ) {
+    case STD_MALLOC: msg << "STD_MALLOC" ; break ;
+    case POSIX_MEMALIGN: msg << "POSIX_MEMALIGN" ; break ;
+    case POSIX_MMAP: msg << "POSIX_MMAP" ; break ;
+    case INTEL_MM_ALLOC: msg << "INTEL_MM_ALLOC" ; break ;
+    }
+    msg << " ]( " << arg_alloc_size << " ) FAILED" ;
+    if ( ptr == NULL ) { msg << " NULL" ; }
+    else { msg << " NOT ALIGNED " << ptr ; }
+
+    std::cerr << msg.str() << std::endl ;
+    std::cerr.flush();
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  return ptr;
+}
+
+
+void HostSpace::deallocate( void * const arg_alloc_ptr
+    , const size_t
+#if defined( KOKKOS_IMPL_POSIX_MMAP_FLAGS )
+    arg_alloc_size
+#endif
+    ) const
+{
+  if ( arg_alloc_ptr ) {
+
+    if ( m_alloc_mech == STD_MALLOC ) {
+      void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1);
+      free( alloc_ptr );
+    }
+
+#if defined( KOKKOS_ENABLE_INTEL_MM_ALLOC )
+    else if ( m_alloc_mech == INTEL_MM_ALLOC ) {
+      _mm_free( arg_alloc_ptr );
+    }
+#endif
+
+#if defined( KOKKOS_ENABLE_POSIX_MEMALIGN )
+    else if ( m_alloc_mech == POSIX_MEMALIGN ) {
+      free( arg_alloc_ptr );
+    }
+#endif
+
+#if defined( KOKKOS_IMPL_POSIX_MMAP_FLAGS )
+    else if ( m_alloc_mech == POSIX_MMAP ) {
+      munmap( arg_alloc_ptr , arg_alloc_size );
+    }
+#endif
+
+  }
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::HostSpace , void >::s_root_record ;
+
+void
+SharedAllocationRecord< Kokkos::HostSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  delete static_cast<SharedAllocationRecord*>(arg_rec);
+}
+
+SharedAllocationRecord< Kokkos::HostSpace , void >::
+~SharedAllocationRecord()
+{
+  #if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::deallocateData(
+      Kokkos::Profiling::SpaceHandle(Kokkos::HostSpace::name()),RecordBase::m_alloc_ptr->m_label,
+      data(),size());
+  }
+  #endif
+
+  m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
+                    , SharedAllocationRecord< void , void >::m_alloc_size
+                    );
+}
+
+SharedAllocationRecord< Kokkos::HostSpace , void >::
+SharedAllocationRecord( const Kokkos::HostSpace & arg_space
+                      , const std::string       & arg_label
+                      , const size_t              arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::HostSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_space( arg_space )
+{
+#if defined(KOKKOS_ENABLE_PROFILING)
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
+   }
+#endif
+  // Fill in the Header information
+  RecordBase::m_alloc_ptr->m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
+
+  strncpy( RecordBase::m_alloc_ptr->m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+}
+
+//----------------------------------------------------------------------------
+
+void * SharedAllocationRecord< Kokkos::HostSpace , void >::
+allocate_tracked( const Kokkos::HostSpace & arg_space
+                , const std::string & arg_alloc_label
+                , const size_t arg_alloc_size )
+{
+  if ( ! arg_alloc_size ) return (void *) 0 ;
+
+  SharedAllocationRecord * const r =
+    allocate( arg_space , arg_alloc_label , arg_alloc_size );
+
+  RecordBase::increment( r );
+
+  return r->data();
+}
+
+void SharedAllocationRecord< Kokkos::HostSpace , void >::
+deallocate_tracked( void * const arg_alloc_ptr )
+{
+  if ( arg_alloc_ptr != 0 ) {
+    SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
+
+    RecordBase::decrement( r );
+  }
+}
+
+void * SharedAllocationRecord< Kokkos::HostSpace , void >::
+reallocate_tracked( void * const arg_alloc_ptr
+                  , const size_t arg_alloc_size )
+{
+  SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
+  SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
+
+  Kokkos::Impl::DeepCopy<HostSpace,HostSpace>( r_new->data() , r_old->data()
+                                             , std::min( r_old->size() , r_new->size() ) );
+
+  RecordBase::increment( r_new );
+  RecordBase::decrement( r_old );
+
+  return r_new->data();
+}
+
+SharedAllocationRecord< Kokkos::HostSpace , void > *
+SharedAllocationRecord< Kokkos::HostSpace , void >::get_record( void * alloc_ptr )
+{
+  typedef SharedAllocationHeader  Header ;
+  typedef SharedAllocationRecord< Kokkos::HostSpace , void >  RecordHost ;
+
+  SharedAllocationHeader const * const head   = alloc_ptr ? Header::get_header( alloc_ptr ) : (SharedAllocationHeader *)0 ;
+  RecordHost                   * const record = head ? static_cast< RecordHost * >( head->m_record ) : (RecordHost *) 0 ;
+
+  if ( ! alloc_ptr || record->m_alloc_ptr != head ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace , void >::get_record ERROR" ) );
+  }
+
+  return record ;
+}
+
+// Iterate records to print orphaned memory ...
+void SharedAllocationRecord< Kokkos::HostSpace , void >::
+print_records( std::ostream & s , const Kokkos::HostSpace & , bool detail )
+{
+  SharedAllocationRecord< void , void >::print_host_accessible_records( s , "HostSpace" , & s_root_record , detail );
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace {
+  const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF;
+  const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39;
+  static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK+1];
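+  // For example: a pointer whose value is 0x1000 hashes to lock index
+  // ((0x1000 >> 2) & 0xFFFF) ^ 0x5A39 == 0x0400 ^ 0x5A39 == 0x5E39
+  // within this 65536-entry lock array.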
+}
+
+namespace Impl {
+void init_lock_array_host_space() {
+  static int is_initialized = 0;
+  if(! is_initialized)
+    for(int i = 0; i < static_cast<int> (HOST_SPACE_ATOMIC_MASK+1); i++)
+      HOST_SPACE_ATOMIC_LOCKS[i] = 0;
+}
+
+bool lock_address_host_space(void* ptr) {
+#if defined( KOKKOS_ENABLE_ISA_X86_64 ) && defined ( KOKKOS_ENABLE_TM )
+  const unsigned status = _xbegin();
+
+  if( _XBEGIN_STARTED == status ) {
+	const int val = HOST_SPACE_ATOMIC_LOCKS[(( size_t(ptr) >> 2 ) &
+		HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK];
+
+	if( 0 == val ) {
+		HOST_SPACE_ATOMIC_LOCKS[(( size_t(ptr) >> 2 ) &
+                   HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] = 1;
+	} else {
+		_xabort( 1 );
+	}
+
+	_xend();
+
+	return 1;
+  } else {
+#endif
+  return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[
+      (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
+                                  0 , 1);
+#if defined( KOKKOS_ENABLE_ISA_X86_64 ) && defined ( KOKKOS_ENABLE_TM )
+  }
+#endif
+}
+
+void unlock_address_host_space(void* ptr) {
+#if defined( KOKKOS_ENABLE_ISA_X86_64 ) && defined ( KOKKOS_ENABLE_TM )
+  const unsigned status = _xbegin();
+
+  if( _XBEGIN_STARTED == status ) {
+	HOST_SPACE_ATOMIC_LOCKS[(( size_t(ptr) >> 2 ) &
+        	HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] = 0;
+  } else {
+#endif
+   atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[
+      (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
+                    0);
+#if defined( KOKKOS_ENABLE_ISA_X86_64 ) && defined ( KOKKOS_ENABLE_TM )
+  }
+#endif
+}
+
+}
+}
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8f28f396323a5be72e22d378db372fc55f3614f3
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
@@ -0,0 +1,322 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <limits>
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_HostThreadTeam.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_Spinwait.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void HostThreadTeamData::organize_pool
+  ( HostThreadTeamData * members[] , const int size )
+{
+  bool ok = true ;
+
+  memory_fence();
+
+  // Verify not already a member of a pool:
+  for ( int rank = 0 ; rank < size && ok ; ++rank ) {
+    ok = ( nullptr != members[rank] ) && ( 0 == members[rank]->m_pool_scratch );
+  }
+
+  if ( ok ) {
+
+    int64_t * const root_scratch = members[0]->m_scratch ;
+
+    for ( int i = m_pool_rendezvous ; i < m_pool_reduce ; ++i ) {
+      root_scratch[i] = 0 ;
+    }
+
+    {
+      HostThreadTeamData ** const pool =
+        (HostThreadTeamData **) (root_scratch + m_pool_members);
+
+      // team size == 1, league size == pool_size
+
+      for ( int rank = 0 ; rank < size ; ++rank ) {
+        HostThreadTeamData * const mem = members[ rank ] ;
+        mem->m_pool_scratch = root_scratch ;
+        mem->m_team_scratch = mem->m_scratch ;
+        mem->m_pool_rank    = rank ;
+        mem->m_pool_size    = size ;
+        mem->m_team_base    = rank ;
+        mem->m_team_rank    = 0 ;
+        mem->m_team_size    = 1 ;
+        mem->m_team_alloc   = 1 ;
+        mem->m_league_rank  = rank ;
+        mem->m_league_size  = size ;
+        mem->m_team_rendezvous_step = 0 ;
+        pool[ rank ] = mem ;
+      }
+    }
+
+    Kokkos::memory_fence();
+  }
+  else {
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_pool ERROR pool already exists");
+  }
+}
+
+void HostThreadTeamData::disband_pool()
+{
+   m_work_range.first  = -1 ;
+   m_work_range.second = -1 ;
+   m_pool_scratch = 0 ;
+   m_team_scratch = 0 ;
+   m_pool_rank    = 0 ;
+   m_pool_size    = 1 ;
+   m_team_base    = 0 ;
+   m_team_rank    = 0 ;
+   m_team_size    = 1 ;
+   m_team_alloc   = 1 ;
+   m_league_rank  = 0 ;
+   m_league_size  = 1 ;
+   m_team_rendezvous_step = 0 ;
+}
+
+int HostThreadTeamData::organize_team( const int team_size )
+{
+  // Pool is initialized
+  const bool ok_pool = 0 != m_pool_scratch ;
+
+  // Team is not set
+  const bool ok_team =
+    m_team_scratch == m_scratch &&
+    m_team_base    == m_pool_rank &&
+    m_team_rank    == 0 &&
+    m_team_size    == 1 &&
+    m_team_alloc   == 1 &&
+    m_league_rank  == m_pool_rank &&
+    m_league_size  == m_pool_size ;
+
+  if ( ok_pool && ok_team ) {
+
+    if ( team_size <= 0 ) return 0 ; // No teams to organize
+
+    if ( team_size == 1 ) return 1 ; // Already organized in teams of one
+
+    HostThreadTeamData * const * const pool =
+      (HostThreadTeamData **) (m_pool_scratch + m_pool_members);
+
+    // "league_size" in this context is the number of concurrent teams
+    // that the pool can accommodate.  Excess threads are idle.
+    const int league_size     = m_pool_size / team_size ;
+    const int team_alloc_size = m_pool_size / league_size ;
+    const int team_alloc_rank = m_pool_rank % team_alloc_size ;
+    const int league_rank     = m_pool_rank / team_alloc_size ;
+    const int team_base_rank  = league_rank * team_alloc_size ;
+
+    m_team_scratch = pool[ team_base_rank ]->m_scratch ;
+    m_team_base    = team_base_rank ;
+    // This needs to check for overflow: if m_pool_size % team_alloc_size != 0
+    // there are two corner cases:
+    // (i)  if team_alloc_size == team_size there might be a non-full
+    //      zombie team around (for example m_pool_size = 5 and team_size = 2)
+    // (ii) if team_alloc_size > team_size then the last team might have fewer
+    //      threads than the others
+    m_team_rank    = ( team_base_rank + team_size <= m_pool_size ) &&
+                     ( team_alloc_rank < team_size ) ?
+                     team_alloc_rank : -1;
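+    // Worked example: with m_pool_size == 5 and team_size == 2,
+    // league_size == 2 and team_alloc_size == 2, so pool ranks 0-3 form two
+    // full teams (team_base_rank 0 and 2) while pool rank 4 has
+    // team_base_rank == 4; since 4 + 2 > 5 its m_team_rank is set to -1 and
+    // it idles as the non-full "zombie" team described above.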
+    m_team_size    = team_size ;
+    m_team_alloc   = team_alloc_size ;
+    m_league_rank  = league_rank ;
+    m_league_size  = league_size ;
+    m_team_rendezvous_step = 0 ;
+
+    if ( team_base_rank == m_pool_rank ) {
+      // Initialize team's rendezvous memory
+      for ( int i = m_team_rendezvous ; i < m_pool_reduce ; ++i ) {
+        m_scratch[i] = 0 ;
+      }
+      // Make sure the team's rendezvous memory initialization
+      // is written out before proceeding.
+      Kokkos::memory_fence();
+    }
+
+    // Organizing threads into a team performs a barrier across the
+    // entire pool to ensure proper initialization of the team
+    // rendezvous mechanism before a team rendezvous can be performed.
+
+    if ( pool_rendezvous() ) {
+      pool_rendezvous_release();
+    }
+  }
+  else {
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_team ERROR");
+  }
+
+  return 0 <= m_team_rank ;
+}
+
+void HostThreadTeamData::disband_team()
+{
+  m_team_scratch = m_scratch ;
+  m_team_base    = m_pool_rank ;
+  m_team_rank    = 0 ;
+  m_team_size    = 1 ;
+  m_team_alloc   = 1 ;
+  m_league_rank  = m_pool_rank ;
+  m_league_size  = m_pool_size ;
+  m_team_rendezvous_step = 0 ;
+}
+
+//----------------------------------------------------------------------------
+
+int HostThreadTeamData::get_work_stealing() noexcept
+{
+  pair_int_t w( -1 , -1 );
+
+  if ( 1 == m_team_size || team_rendezvous() ) {
+
+    // Attempt first from beginning of my work range
+    for ( int attempt = m_work_range.first < m_work_range.second ; attempt ; ) {
+
+      // Query and attempt to update m_work_range
+      //   from: [ w.first     , w.second )
+      //   to:   [ w.first + 1 , w.second ) = w_new
+      //
+      // If w is invalid then this is just a query.
+
+      const pair_int_t w_new( w.first + 1 , w.second );
+
+      w = Kokkos::atomic_compare_exchange( & m_work_range, w, w_new );
+
+      if ( w.first < w.second ) {
+        // m_work_range is viable
+
+        // If steal is successful then don't repeat attempt to steal
+        attempt = ! ( w_new.first  == w.first + 1 &&
+                      w_new.second == w.second );
+      }
+      else {
+        // m_work_range is not viable
+        w.first  = -1 ;
+        w.second = -1 ;
+
+        attempt = 0 ;
+      }
+    }
+
+    if ( w.first == -1 && m_steal_rank != m_pool_rank ) {
+
+      HostThreadTeamData * const * const pool =
+        (HostThreadTeamData**)( m_pool_scratch + m_pool_members );
+
+      // Attempt from the beginning failed, try to steal from the end of a neighbor
+
+      pair_int_t volatile * steal_range =
+        & ( pool[ m_steal_rank ]->m_work_range );
+
+      for ( int attempt = true ; attempt ; ) {
+
+        // Query and attempt to update steal_work_range
+        //   from: [ w.first , w.second )
+        //   to:   [ w.first , w.second - 1 ) = w_new
+        //
+        // If w is invalid then this is just a query.
+
+        const pair_int_t w_new( w.first , w.second - 1 );
+
+        w = Kokkos::atomic_compare_exchange( steal_range, w, w_new );
+
+        if ( w.first < w.second ) {
+          // steal_work_range is viable
+
+          // If steal is successful then don't repeat attempt to steal
+          attempt = ! ( w_new.first  == w.first &&
+                        w_new.second == w.second - 1 );
+        }
+        else {
+          // steal_work_range is not viable, move to next member
+          w.first  = -1 ;
+          w.second = -1 ;
+
+          // We need to figure out whether the next team is active.
+          // m_steal_rank + m_team_alloc could be the next base_rank to steal from,
+          // but only if there are another m_team_size threads available so that
+          // base rank has a full team.
+          m_steal_rank = m_steal_rank + m_team_alloc + m_team_size <= m_pool_size ?
+                         m_steal_rank + m_team_alloc : 0;
+
+          steal_range = & ( pool[ m_steal_rank ]->m_work_range );
+
+          // If tried all other members then don't repeat attempt to steal
+          attempt = m_steal_rank != m_pool_rank ;
+        }
+      }
+
+      if ( w.first != -1 ) w.first = w.second - 1 ;
+    }
+
+    if ( 1 < m_team_size ) {
+      // Must share the work index
+      *((int volatile *) team_reduce()) = w.first ;
+
+      team_rendezvous_release();
+    }
+  }
+  else if ( 1 < m_team_size ) {
+    w.first = *((int volatile *) team_reduce());
+  }
+
+  // May exit because successfully stole work and w is good.
+  // May exit because no work left to steal and w = (-1,-1).
+
+#if 0
+fprintf(stdout,"HostThreadTeamData::get_work_stealing() pool(%d of %d) %d\n"
+       , m_pool_rank , m_pool_size , w.first );
+fflush(stdout);
+#endif
+
+  return w.first ;
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..78770f2d5ea383e28ec3e6274a797d353d84f834
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
@@ -0,0 +1,1082 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_HOSTTHREADTEAM_HPP
+#define KOKKOS_IMPL_HOSTTHREADTEAM_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Pair.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <Kokkos_ExecPolicy.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+#include <impl/Kokkos_FunctorAnalysis.hpp>
+#include <impl/Kokkos_HostBarrier.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class HostExecSpace >
+class HostThreadTeamMember ;
+
+class HostThreadTeamData {
+public:
+
+  template< class > friend class HostThreadTeamMember ;
+
+  // Assume upper bounds on number of threads:
+  //   pool size       <= 1024 threads
+  //   team size       <= 64 threads
+
+  enum : int { max_pool_members  = 1024 };
+  enum : int { max_team_members  = 64 };
+  enum : int { max_pool_rendezvous = rendezvous_buffer_size( max_pool_members ) };
+  enum : int { max_team_rendezvous = rendezvous_buffer_size( max_team_members ) };
+
+private:
+
+  // per-thread scratch memory buffer chunks:
+  //
+  //   [ pool_members ]     = [ m_pool_members    .. m_pool_rendezvous )
+  //   [ pool_rendezvous ]  = [ m_pool_rendezvous .. m_team_rendezvous )
+  //   [ team_rendezvous ]  = [ m_team_rendezvous .. m_pool_reduce )
+  //   [ pool_reduce ]      = [ m_pool_reduce     .. m_team_reduce )
+  //   [ team_reduce ]      = [ m_team_reduce     .. m_team_shared )
+  //   [ team_shared ]      = [ m_team_shared     .. m_thread_local )
+  //   [ thread_local ]     = [ m_thread_local    .. m_scratch_size )
+
+  enum : int { m_pool_members    = 0 };
+  enum : int { m_pool_rendezvous = m_pool_members    + max_pool_members };
+  enum : int { m_team_rendezvous = m_pool_rendezvous + max_pool_rendezvous };
+  enum : int { m_pool_reduce     = m_team_rendezvous + max_team_rendezvous };
+
+  using pair_int_t = Kokkos::pair<int,int> ;
+
+  pair_int_t  m_work_range ;
+  int64_t     m_work_end ;
+  int64_t   * m_scratch ;       // per-thread buffer
+  int64_t   * m_pool_scratch ;  // == pool[0]->m_scratch
+  int64_t   * m_team_scratch ;  // == pool[ 0 + m_team_base ]->m_scratch
+  int         m_pool_rank ;
+  int         m_pool_size ;
+  int         m_team_reduce ;
+  int         m_team_shared ;
+  int         m_thread_local ;
+  int         m_scratch_size ;
+  int         m_team_base ;
+  int         m_team_rank ;
+  int         m_team_size ;
+  int         m_team_alloc ;
+  int         m_league_rank ;
+  int         m_league_size ;
+  int         m_work_chunk ;
+  int         m_steal_rank ; // work stealing rank
+  uint64_t mutable m_pool_rendezvous_step ;
+  uint64_t mutable m_team_rendezvous_step ;
+
+  HostThreadTeamData * team_member( int r ) const noexcept
+    { return ((HostThreadTeamData**)(m_pool_scratch+m_pool_members))[m_team_base+r]; }
+
+public:
+
+  inline
+  int team_rendezvous( int const root ) const noexcept
+    {
+      return 1 == m_team_size ? 1 :
+             rendezvous( m_team_scratch + m_team_rendezvous
+                       , m_team_rendezvous_step
+                       , m_team_size
+                       , ( m_team_rank + m_team_size - root ) % m_team_size
+                       );
+    }
+
+  inline
+  int team_rendezvous() const noexcept
+    {
+      return 1 == m_team_size ? 1 :
+             rendezvous( m_team_scratch + m_team_rendezvous
+                       , m_team_rendezvous_step
+                       , m_team_size
+                       , m_team_rank );
+    }
+
+  inline
+  void team_rendezvous_release() const noexcept
+    {
+      if ( 1 < m_team_size ) {
+        rendezvous_release( m_team_scratch + m_team_rendezvous
+                          , m_team_rendezvous_step );
+      }
+    }
+
+  inline
+  int pool_rendezvous() const noexcept
+    {
+      static constexpr bool active_wait =
+        #if defined( KOKKOS_COMPILER_IBM )
+            // If running on IBM POWER architecture the global
+            // level rendezvous should immediately yield when
+            // waiting for other threads in the pool to arrive.
+          false
+        #else
+          true
+        #endif
+          ;
+      return 1 == m_pool_size ? 1 :
+             rendezvous( m_pool_scratch + m_pool_rendezvous
+                       , m_pool_rendezvous_step
+                       , m_pool_size
+                       , m_pool_rank
+                       , active_wait
+                       );
+    }
+
+  inline
+  void pool_rendezvous_release() const noexcept
+    {
+      if ( 1 < m_pool_size ) {
+        rendezvous_release( m_pool_scratch + m_pool_rendezvous, m_pool_rendezvous_step );
+      }
+    }
+
+  //----------------------------------------
+
+  constexpr HostThreadTeamData() noexcept
+    : m_work_range(-1,-1)
+    , m_work_end(0)
+    , m_scratch(0)
+    , m_pool_scratch(0)
+    , m_team_scratch(0)
+    , m_pool_rank(0)
+    , m_pool_size(1)
+    , m_team_reduce(0)
+    , m_team_shared(0)
+    , m_thread_local(0)
+    , m_scratch_size(0)
+    , m_team_base(0)
+    , m_team_rank(0)
+    , m_team_size(1)
+    , m_team_alloc(1)
+    , m_league_rank(0)
+    , m_league_size(1)
+    , m_work_chunk(0)
+    , m_steal_rank(0)
+    , m_pool_rendezvous_step(0)
+    , m_team_rendezvous_step(0)
+    {}
+
+  //----------------------------------------
+  // Organize array of members into a pool.
+  // The 0th member is the root of the pool.
+  // Requires: members are not already in a pool.
+  // Requires: called by one thread.
+  // Pool members are ordered as "close" - sorted by NUMA and then CORE
+  // Each thread is its own team with team_size == 1.
+  static void organize_pool( HostThreadTeamData * members[]
+                           , const int size );
+
+  // Called by each thread within the pool
+  void disband_pool();
+
+  //----------------------------------------
+  // Each thread within a pool organizes itself into a team.
+  // Must be called by all threads of the pool.
+  // Organizing threads into a team performs a barrier across the
+  // entire pool to ensure proper initialization of the team
+  // rendezvous mechanism before a team rendezvous can be performed.
+  //
+  // Return true  if a valid member of a team.
+  // Return false if not a member and thread should be idled.
+  int organize_team( const int team_size );
+
+  // Each thread within a pool disbands itself from current team.
+  // Each thread becomes its own team with team_size == 1.
+  // Must be called by all threads of the pool.
+  void disband_team();
+
+  //----------------------------------------
+
+  constexpr int pool_rank() const { return m_pool_rank ; }
+  constexpr int pool_size() const { return m_pool_size ; }
+
+  HostThreadTeamData * pool_member( int r ) const noexcept
+    { return ((HostThreadTeamData**)(m_pool_scratch+m_pool_members))[r]; }
+
+  //----------------------------------------
+
+private:
+
+  enum : int { mask_to_16 = 0x0f }; // align to 16 bytes
+  enum : int { shift_to_8 = 3 };    // size to 8 bytes
+
+public:
+
+  static constexpr int align_to_int64( int n )
+    { return ( ( n + mask_to_16 ) & ~mask_to_16 ) >> shift_to_8 ; }
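+  // For example: align_to_int64(20) == ((20 + 15) & ~15) >> 3 == 32 >> 3 == 4,
+  // i.e. 20 bytes round up to 32 bytes, expressed as 4 int64_t words.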
+
+  constexpr int pool_reduce_bytes() const
+    { return m_scratch_size ? sizeof(int64_t) * ( m_team_reduce - m_pool_reduce ) : 0 ; }
+
+  constexpr int team_reduce_bytes() const
+    { return sizeof(int64_t) * ( m_team_shared - m_team_reduce ); }
+
+  constexpr int team_shared_bytes() const
+    { return sizeof(int64_t) * ( m_thread_local - m_team_shared ); }
+
+  constexpr int thread_local_bytes() const
+    { return sizeof(int64_t) * ( m_scratch_size - m_thread_local ); }
+
+  constexpr int scratch_bytes() const
+    { return sizeof(int64_t) * m_scratch_size ; }
+
+  // Memory chunks:
+
+  int64_t * scratch_buffer() const noexcept
+    { return m_scratch ; }
+
+  int64_t * pool_reduce() const noexcept
+    { return m_pool_scratch + m_pool_reduce ; }
+
+  int64_t * pool_reduce_local() const noexcept
+    { return m_scratch + m_pool_reduce ; }
+
+  int64_t * team_reduce() const noexcept
+    { return m_team_scratch + m_team_reduce ; }
+
+  int64_t * team_reduce_local() const noexcept
+    { return m_scratch + m_team_reduce ; }
+
+  int64_t * team_shared() const noexcept
+    { return m_team_scratch + m_team_shared ; }
+
+  int64_t * local_scratch() const noexcept
+    { return m_scratch + m_thread_local ; }
+
+  // Given:
+  //   pool_reduce_size  = number bytes for pool reduce
+  //   team_reduce_size  = number bytes for team reduce
+  //   team_shared_size  = number bytes for team shared memory
+  //   thread_local_size = number bytes for thread local memory
+  // Return:
+  //   total number of bytes that must be allocated
+  static
+  size_t scratch_size( int pool_reduce_size
+                     , int team_reduce_size
+                     , int team_shared_size
+                     , int thread_local_size )
+    {
+      pool_reduce_size  = align_to_int64( pool_reduce_size );
+      team_reduce_size  = align_to_int64( team_reduce_size );
+      team_shared_size  = align_to_int64( team_shared_size );
+      thread_local_size = align_to_int64( thread_local_size );
+
+      const size_t total_bytes = (
+        m_pool_reduce +
+        pool_reduce_size +
+        team_reduce_size +
+        team_shared_size +
+        thread_local_size ) * sizeof(int64_t);
+
+      return total_bytes ;
+    }
+
+  // Given:
+  //   alloc_ptr         = pointer to allocated memory
+  //   alloc_size        = number bytes of allocated memory
+  //   pool_reduce_size  = number bytes for pool reduce/scan operations
+  //   team_reduce_size  = number bytes for team reduce/scan operations
+  //   team_shared_size  = number bytes for team-shared memory
+  //   thread_local_size = number bytes for thread-local memory
+  // Effect:
+  //   partition the allocation into the pool/team/thread scratch chunks
+  void scratch_assign( void * const alloc_ptr
+                     , size_t const alloc_size
+                     , int pool_reduce_size
+                     , int team_reduce_size
+                     , int team_shared_size
+                     , int /* thread_local_size */ )
+    {
+      pool_reduce_size  = align_to_int64( pool_reduce_size );
+      team_reduce_size  = align_to_int64( team_reduce_size );
+      team_shared_size  = align_to_int64( team_shared_size );
+      // thread_local_size = align_to_int64( thread_local_size );
+
+      m_scratch      = (int64_t *) alloc_ptr ;
+      m_team_reduce  = m_pool_reduce + pool_reduce_size ;
+      m_team_shared  = m_team_reduce + team_reduce_size ;
+      m_thread_local = m_team_shared + team_shared_size ;
+      m_scratch_size = align_to_int64( alloc_size );
+
+#if 0
+fprintf(stdout,"HostThreadTeamData::scratch_assign { %d %d %d %d %d %d %d }\n"
+       , int(m_pool_members)
+       , int(m_pool_rendezvous)
+       , int(m_pool_reduce)
+       , int(m_team_reduce)
+       , int(m_team_shared)
+       , int(m_thread_local)
+       , int(m_scratch_size)
+       );
+fflush(stdout);
+#endif
+
+    }
+
+  //----------------------------------------
+  // Get a work index within the range.
+  // First try to steal from the beginning of the team's own partition.
+  // If that fails then try to steal from the end of another team's partition.
+  int get_work_stealing() noexcept ;
+
+  //----------------------------------------
+  // Set the initial work partitioning of [ 0 .. length ) among the teams
+  // with granularity of chunk
+
+  void set_work_partition( int64_t const length
+                         , int     const chunk ) noexcept
+    {
+      // Minimum chunk size to ensure that
+      //   m_work_end < std::numeric_limits<int>::max() * m_work_chunk
+
+      int const chunk_min = ( length + std::numeric_limits<int>::max() )
+                            / std::numeric_limits<int>::max();
+
+      m_work_end   = length ;
+      m_work_chunk = std::max( chunk , chunk_min );
+
+      // Number of work chunks and partitioning of that number:
+      int const num  = ( m_work_end + m_work_chunk - 1 ) / m_work_chunk ;
+      int const part = ( num + m_league_size - 1 ) / m_league_size ;
+
+      m_work_range.first  = part * m_league_rank ;
+      m_work_range.second = m_work_range.first + part ;
+
+      // Steal from next team, round robin
+      // The next team is offset by m_team_alloc if it fits in the pool.
+
+      m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size ?
+                     m_team_base + m_team_alloc : 0 ;
+    }
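+
+  // For example: with length == 100, chunk == 10 and m_league_size == 3,
+  // m_work_chunk == 10, there are num == 10 chunks and part == 4 chunks per
+  // team, so league ranks 0, 1 and 2 receive chunk ranges [0,4), [4,8) and
+  // [8,12); get_work_partition() for league rank 2 then yields index range
+  // [80,100) after clamping against m_work_end.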
+
+  std::pair<int64_t,int64_t> get_work_partition() noexcept
+    {
+      int64_t first = m_work_range.first;
+      int64_t second = m_work_range.second;
+      first *= m_work_chunk;
+      second *= m_work_chunk;
+      return std::pair<int64_t,int64_t>
+        ( first
+        , second < m_work_end ? second : m_work_end );
+    }
+
+  std::pair<int64_t,int64_t> get_work_stealing_chunk() noexcept
+    {
+      std::pair<int64_t,int64_t> x(-1,-1);
+
+      const int i = get_work_stealing();
+
+      if ( 0 <= i ) {
+        x.first  = m_work_chunk * i ;
+        x.second = x.first + m_work_chunk < m_work_end
+                 ? x.first + m_work_chunk : m_work_end ;
+      }
+
+      return x ;
+    }
+};
+
+//----------------------------------------------------------------------------
+
+template< class HostExecSpace >
+class HostThreadTeamMember {
+public:
+
+  using scratch_memory_space = typename HostExecSpace::scratch_memory_space ;
+
+private:
+
+  scratch_memory_space m_scratch ;
+  HostThreadTeamData & m_data ;
+  int const            m_league_rank ;
+  int const            m_league_size ;
+
+public:
+
+  constexpr HostThreadTeamMember( HostThreadTeamData & arg_data ) noexcept
+    : m_scratch( arg_data.team_shared() , arg_data.team_shared_bytes() )
+    , m_data( arg_data )
+    , m_league_rank(0)
+    , m_league_size(1)
+    {}
+
+  constexpr HostThreadTeamMember( HostThreadTeamData & arg_data
+                                , int const            arg_league_rank
+                                , int const            arg_league_size
+                                ) noexcept
+    : m_scratch( arg_data.team_shared()
+               , arg_data.team_shared_bytes()
+               , arg_data.team_shared()
+               , arg_data.team_shared_bytes() )
+    , m_data( arg_data )
+    , m_league_rank( arg_league_rank )
+    , m_league_size( arg_league_size )
+    {}
+
+  ~HostThreadTeamMember() = default ;
+  HostThreadTeamMember() = delete ;
+  HostThreadTeamMember( HostThreadTeamMember && ) = default ;
+  HostThreadTeamMember( HostThreadTeamMember const & ) = default ;
+  HostThreadTeamMember & operator = ( HostThreadTeamMember && ) = default ;
+  HostThreadTeamMember & operator = ( HostThreadTeamMember const & ) = default ;
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  int team_rank() const noexcept { return m_data.m_team_rank ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int team_size() const noexcept { return m_data.m_team_size ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int league_rank() const noexcept { return m_league_rank ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int league_size() const noexcept { return m_league_size ; }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & team_shmem() const
+    { return m_scratch.set_team_thread_mode(0,1,0); }
+
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & team_scratch(int) const
+    { return m_scratch.set_team_thread_mode(0,1,0); }
+
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & thread_scratch(int) const
+    { return m_scratch.set_team_thread_mode(0,m_data.m_team_size,m_data.m_team_rank); }
+
+  //--------------------------------------------------------------------------
+  // Team collectives
+  //--------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const noexcept
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      if ( m_data.team_rendezvous() ) m_data.team_rendezvous_release();
+    }
+#else
+    {}
+#endif
+
+  //--------------------------------------------------------------------------
+
+  template< typename T >
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast( T & value , const int source_team_rank ) const noexcept
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      if ( 1 < m_data.m_team_size ) {
+        T volatile * const shared_value = (T*) m_data.team_reduce();
+
+        // Don't overwrite shared memory until all threads arrive
+
+        if ( m_data.team_rendezvous( source_team_rank ) ) {
+          // All threads have entered 'team_rendezvous'
+          // only this thread returned from 'team_rendezvous'
+          // with a return value of 'true'
+
+          *shared_value = value ;
+
+          m_data.team_rendezvous_release();
+          // This thread released all other threads from 'team_rendezvous'
+          // with a return value of 'false'
+        }
+        else {
+          value = *shared_value ;
+        }
+      }
+    }
+#else
+    { Kokkos::abort("HostThreadTeamMember team_broadcast\n"); }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  template< class Closure , typename T >
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast( Closure const & f , T & value , const int source_team_rank) const noexcept
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      T volatile * const shared_value = (T*) m_data.team_reduce();
+
+      // Don't overwrite shared memory until all threads arrive
+
+      if ( m_data.team_rendezvous(source_team_rank) ) {
+
+        // All threads have entered 'team_rendezvous'
+        // only this thread returned from 'team_rendezvous'
+        // with a return value of 'true'
+
+        f( value );
+
+        if ( 1 < m_data.m_team_size ) { *shared_value = value ; }
+
+        m_data.team_rendezvous_release();
+        // This thread released all other threads from 'team_rendezvous'
+        // with a return value of 'false'
+      }
+      else {
+        value = *shared_value ;
+      }
+    }
+#else
+    { Kokkos::abort("HostThreadTeamMember team_broadcast\n"); }
+#endif
+
+  //--------------------------------------------------------------------------
+  // team_reduce( Sum(result) );
+  // team_reduce( Min(result) );
+  // team_reduce( Max(result) );
+
+  template< typename ReducerType >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< is_reducer< ReducerType >::value >::type
+  team_reduce( ReducerType const & reducer ) const noexcept
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      if ( 1 < m_data.m_team_size ) {
+
+        using value_type = typename ReducerType::value_type ;
+
+        if ( 0 != m_data.m_team_rank ) {
+          // Non-root copies to their local buffer:
+          /*reducer.copy( (value_type*) m_data.team_reduce_local()
+                      , reducer.data() );*/
+          *((value_type*) m_data.team_reduce_local()) = reducer.reference();
+        }
+
+        // Root does not overwrite shared memory until all threads arrive
+        // and copy to their local buffer.
+
+        if ( m_data.team_rendezvous() ) {
+          // All threads have entered 'team_rendezvous'
+          // only this thread returned from 'team_rendezvous'
+          // with a return value of 'true'
+          //
+          // This thread sums contributed values
+          for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
+            value_type * const src =
+              (value_type*) m_data.team_member(i)->team_reduce_local();
+
+            reducer.join( reducer.reference(), *src);
+          }
+
+          // Copy result to root member's buffer:
+          // reducer.copy( (value_type*) m_data.team_reduce() , reducer.data() );
+          *((value_type*) m_data.team_reduce()) = reducer.reference();
+          m_data.team_rendezvous_release();
+          // This thread released all other threads from 'team_rendezvous'
+          // with a return value of 'false'
+        }
+        else {
+          // Copy from root member's buffer:
+          reducer.reference() = *((value_type*) m_data.team_reduce());
+        }
+      }
+    }
+#else
+    { Kokkos::abort("HostThreadTeamMember team_reduce\n"); }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  /*template< typename ValueType , class JoinOp >
+  KOKKOS_INLINE_FUNCTION
+  ValueType
+  team_reduce( ValueType const & value
+             , JoinOp    const & join ) const noexcept
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      if ( 0 != m_data.m_team_rank ) {
+        // Non-root copies to their local buffer:
+        *((ValueType*) m_data.team_reduce_local()) = value ;
+      }
+
+      // Root does not overwrite shared memory until all threads arrive
+      // and copy to their local buffer.
+
+      if ( m_data.team_rendezvous() ) {
+        const Impl::Reducer< ValueType , JoinOp > reducer( join );
+
+        // All threads have entered 'team_rendezvous'
+        // only this thread returned from 'team_rendezvous'
+        // with a return value of 'true'
+        //
+        // This thread sums contributed values
+
+        ValueType * const dst = (ValueType*) m_data.team_reduce_local();
+
+        *dst = value ;
+
+        for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
+          ValueType * const src =
+            (ValueType*) m_data.team_member(i)->team_reduce_local();
+
+          reducer.join( dst , src );
+        }
+
+        m_data.team_rendezvous_release();
+        // This thread released all other threads from 'team_rendezvous'
+        // with a return value of 'false'
+      }
+
+      return *((ValueType*) m_data.team_reduce());
+    }
+#else
+    { Kokkos::abort("HostThreadTeamMember team_reduce\n"); return ValueType(); }
+#endif*/
+
+
+  template< typename T >
+  KOKKOS_INLINE_FUNCTION
+  T team_scan( T const & value , T * const global = 0 ) const noexcept
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      if ( 0 != m_data.m_team_rank ) {
+        // Non-root copies to their local buffer:
+        ((T*) m_data.team_reduce_local())[1] = value ;
+      }
+
+      // Root does not overwrite shared memory until all threads arrive
+      // and copy to their local buffer.
+
+      if ( m_data.team_rendezvous() ) {
+        // All threads have entered 'team_rendezvous'
+        // only this thread returned from 'team_rendezvous'
+        // with a return value of 'true'
+        //
+        // This thread scans contributed values
+
+        {
+          T * prev = (T*) m_data.team_reduce_local();
+
+          prev[0] = 0 ;
+          prev[1] = value ;
+
+          for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
+            T * const ptr = (T*) m_data.team_member(i)->team_reduce_local();
+
+            ptr[0] = prev[0] + prev[1] ;
+
+            prev = ptr ;
+          }
+        }
+
+        // If adding to a global value then atomic_fetch_add to that value
+        // and add the previous global value to every entry of the scan.
+        if ( global ) {
+          T * prev = (T*) m_data.team_reduce_local();
+
+          {
+            T * ptr  = (T*) m_data.team_member( m_data.m_team_size - 1 )->team_reduce_local();
+            prev[0] = Kokkos::atomic_fetch_add( global , ptr[0] + ptr[1] );
+          }
+
+          for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
+            T * ptr = (T*) m_data.team_member(i)->team_reduce_local();
+            ptr[0] += prev[0] ;
+          }
+        }
+
+        m_data.team_rendezvous_release();
+      }
+
+      return ((T*) m_data.team_reduce_local())[0];
+    }
+#else
+    { Kokkos::abort("HostThreadTeamMember team_scan\n"); return T(); }
+#endif
+
+};
+
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template<class Space,typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+TeamThreadRange( Impl::HostThreadTeamMember<Space> const & member
+               , iType const & count )
+{
+  return
+    Impl::TeamThreadRangeBoundariesStruct
+      <iType,Impl::HostThreadTeamMember<Space> >(member,0,count);
+}
+
+template<class Space, typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct
+  < typename std::common_type< iType1, iType2 >::type
+  , Impl::HostThreadTeamMember<Space> >
+TeamThreadRange( Impl::HostThreadTeamMember<Space> const & member
+               , iType1 const & begin , iType2 const & end )
+{
+  return
+    Impl::TeamThreadRangeBoundariesStruct
+      < typename std::common_type< iType1, iType2 >::type
+      , Impl::HostThreadTeamMember<Space> >( member , begin , end );
+}
+
+template<class Space, typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+ThreadVectorRange
+  ( Impl::HostThreadTeamMember<Space> const & member
+  , const iType & count )
+{
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >(member,count);
+}
+
+//----------------------------------------------------------------------------
+/** \brief  Inter-thread parallel_for.
+ *
+ * Executes lambda(iType i) for each i=[0..N)
+ *
+ * The range [0..N) is mapped to all threads of the calling thread team.
+*/
+template<typename iType, class Space, class Closure>
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> > const & loop_boundaries
+  , Closure const & closure
+  )
+{
+  for( iType i = loop_boundaries.start
+     ; i <  loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure (i);
+  }
+}
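+
+// Editor's note: a minimal usage sketch for the TeamThreadRange parallel_for
+// above.  'member' and 'n' are assumed to come from an enclosing team-policy
+// kernel and are not defined here:
+//
+//   Kokkos::parallel_for( Kokkos::TeamThreadRange( member , n )
+//                       , [&]( const int i ) { /* per-index work */ } );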
+
+template<typename iType, class Space, class Closure>
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> > const & loop_boundaries
+  , Closure const & closure
+  )
+{
+  #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+  #pragma ivdep
+  #endif
+  for( iType i = loop_boundaries.start
+     ; i <  loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure (i);
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename iType, class Space, class Closure, class Reducer >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< Kokkos::is_reducer< Reducer >::value >::type
+parallel_reduce
+  ( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+             const & loop_boundaries
+  , Closure  const & closure
+  , Reducer  const & reducer
+  )
+{
+  reducer.init( reducer.reference() );
+
+  for( iType i = loop_boundaries.start
+     ; i <  loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure( i , reducer.reference() );
+  }
+
+  loop_boundaries.thread.team_reduce( reducer );
+}
+
+template< typename iType, class Space, typename Closure, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< ! Kokkos::is_reducer<ValueType>::value >::type
+parallel_reduce
+  ( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+             const & loop_boundaries
+  , Closure  const & closure
+  , ValueType      & result
+  )
+{
+  Kokkos::Experimental::Sum<ValueType> reducer( result );
+
+  reducer.init( result );
+
+  for( iType i = loop_boundaries.start
+     ; i <  loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure( i , reducer.reference() );
+  }
+
+  loop_boundaries.thread.team_reduce( reducer );
+}
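+
+// Editor's note: a minimal usage sketch of the scalar-result overload above,
+// assuming 'member' is a team handle and 'x' is a View visible in the kernel:
+//
+//   double sum = 0 ;
+//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n )
+//                          , [&]( const int i , double & lsum ) { lsum += x(i); }
+//                          , sum );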
+
+/*template< typename iType, class Space
+         , class Closure, class Joiner , typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  ( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+             const & loop_boundaries
+  , Closure  const & closure
+  , Joiner   const & joiner
+  , ValueType      & result
+  )
+{
+  Impl::Reducer< ValueType , Joiner > reducer( joiner , & result );
+
+  reducer.init( reducer.data() );
+
+  for( iType i = loop_boundaries.start
+     ; i <  loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure( i , reducer.reference() );
+  }
+
+  loop_boundaries.thread.team_reduce( reducer );
+}*/
+
+//----------------------------------------------------------------------------
+/** \brief  Inter-thread vector parallel_reduce.
+ *
+ *  Executes lambda(iType i, ValueType & val) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all threads of the
+ *  calling thread team and a summation of val is
+ *  performed and placed into result.
+ */
+template< typename iType, class Space , class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< ! Kokkos::is_reducer<ValueType>::value >::type
+parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& result)
+{
+  result = ValueType();
+  for( iType i =  loop_boundaries.start ;
+             i <  loop_boundaries.end ;
+             i += loop_boundaries.increment) {
+    lambda(i,result);
+  }
+}
+
+template< typename iType, class Space , class Lambda, typename ReducerType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
+parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >& loop_boundaries,
+   const Lambda & lambda,
+   const ReducerType& reducer)
+{
+  reducer.init(reducer.reference());
+  for( iType i =  loop_boundaries.start ;
+             i <  loop_boundaries.end ;
+             i += loop_boundaries.increment) {
+    lambda(i,reducer.reference());
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce.
+ *
+ *  Executes lambda(iType i, ValueType & val) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all vector lanes of the
+ *  calling thread and a reduction of val is performed using
+ *  JoinType(ValueType& val, const ValueType& update)
+ *  and put into result.
+ *  The input value of result is used as the initializer for
+ *  temporary variables of ValueType. Therefore the input
+ *  value should be the neutral element with respect to the
+ *  join operation (e.g. '0' for '+' or '1' for '*').
+ */
+template< typename iType, class Space
+        , class Lambda, class JoinType , typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& result)
+{
+  for( iType i =  loop_boundaries.start ;
+             i <  loop_boundaries.end ;
+             i += loop_boundaries.increment ) {
+    lambda(i,result);
+  }
+}
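+
+// Editor's note: an illustrative call of the JoinType overload above.  The
+// initial value of 'result' must be the neutral element of the join operation,
+// e.g. 1.0 for a product reduction ('member', 'n', and 'x' are assumptions):
+//
+//   double prod = 1.0 ;
+//   Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( member , n )
+//                          , [&]( const int i , double & val ) { val *= x(i); }
+//                          , []( double & val , const double & upd ) { val *= upd; }
+//                          , prod );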
+
+//----------------------------------------------------------------------------
+
+template< typename iType, class Space, class Closure >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  ( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> > const & loop_boundaries
+  , Closure const & closure
+  )
+{
+  // Extract ValueType from the closure
+
+  using value_type =
+    typename Kokkos::Impl::FunctorAnalysis
+      < Kokkos::Impl::FunctorPatternInterface::SCAN
+      , void
+      , Closure >::value_type ;
+
+  value_type accum = 0 ;
+
+  // Intra-member scan
+  for ( iType i = loop_boundaries.start
+      ; i <  loop_boundaries.end
+      ; i += loop_boundaries.increment ) {
+    closure(i,accum,false);
+  }
+
+  // 'accum' output is the exclusive prefix sum
+  accum = loop_boundaries.thread.team_scan(accum);
+
+  for ( iType i = loop_boundaries.start
+      ; i <  loop_boundaries.end
+      ; i += loop_boundaries.increment ) {
+    closure(i,accum,true);
+  }
+}
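+
+// Editor's note: the scan closure above is invoked twice per index with the
+// signature closure( i , accum , final ).  A minimal sketch of a conforming
+// closure for an exclusive prefix sum ('x' and 'y' are illustrative Views):
+//
+//   [&]( const int i , long & accum , const bool final ) {
+//     if ( final ) { y(i) = accum ; }  // second pass: write exclusive prefix
+//     accum += x(i);                   // both passes: accumulate
+//   }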
+
+
+template< typename iType, class Space, class ClosureType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> > const & loop_boundaries
+  , ClosureType const & closure
+  )
+{
+  using value_type = typename
+    Kokkos::Impl::FunctorAnalysis
+      < Impl::FunctorPatternInterface::SCAN
+      , void
+      , ClosureType >::value_type ;
+
+  value_type scan_val = value_type();
+
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for ( iType i = loop_boundaries.start
+      ; i <  loop_boundaries.end
+      ; i += loop_boundaries.increment ) {
+    closure(i,scan_val,true);
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< class Space >
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::HostThreadTeamMember<Space> >
+PerTeam(const Impl::HostThreadTeamMember<Space> & member )
+{
+  return Impl::ThreadSingleStruct<Impl::HostThreadTeamMember<Space> >(member);
+}
+
+template< class Space >
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::HostThreadTeamMember<Space> >
+PerThread(const Impl::HostThreadTeamMember<Space> & member)
+{
+  return Impl::VectorSingleStruct<Impl::HostThreadTeamMember<Space> >(member);
+}
+
+template< class Space , class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void single( const Impl::ThreadSingleStruct< Impl::HostThreadTeamMember<Space> > & single , const FunctorType & functor )
+{
+  // 'single' does not perform a barrier.
+  if ( single.team_member.team_rank() == 0 ) functor();
+}
+
+template< class Space , class FunctorType , typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void single( const Impl::ThreadSingleStruct< Impl::HostThreadTeamMember<Space> > & single , const FunctorType & functor , ValueType & val )
+{
+  single.team_member.team_broadcast( functor , val , 0 );
+}
+
+template< class Space , class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void single( const Impl::VectorSingleStruct< Impl::HostThreadTeamMember<Space> > & , const FunctorType & functor )
+{
+  functor();
+}
+
+template< class Space , class FunctorType , typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void single( const Impl::VectorSingleStruct< Impl::HostThreadTeamMember<Space> > & , const FunctorType & functor , ValueType & val )
+{
+  functor(val);
+}
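+
+// Editor's note: a minimal sketch of the single() patterns above ('member',
+// 'count', and 'val' are assumptions from an enclosing kernel).  PerTeam runs
+// the functor on team rank 0 only (no barrier); the value overload broadcasts
+// 'val' to the whole team via team_broadcast:
+//
+//   Kokkos::single( Kokkos::PerTeam( member ) , [&]() { ++count ; } );
+//   Kokkos::single( Kokkos::PerTeam( member ) , [&]( int & v ) { v = count ; } , val );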
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_IMPL_HOSTTHREADTEAM_HPP */
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_MemoryPool.cpp b/packages/kokkos/core/src/impl/Kokkos_MemoryPool.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0394ccb5ad2c4dbbc28361002f0076fa91ca65f1
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_MemoryPool.cpp
@@ -0,0 +1,125 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <ostream>
+#include <sstream>
+#include <impl/Kokkos_Error.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/* Verify size constraints:
+ *   min_block_alloc_size <= max_block_alloc_size
+ *   max_block_alloc_size <= min_superblock_size 
+ *   min_superblock_size  <= max_superblock_size
+ *   min_superblock_size  <= min_total_alloc_size
+ *   min_superblock_size  <= min_block_alloc_size * 
+ *                           max_block_per_superblock
+ */
+void memory_pool_bounds_verification
+  ( size_t min_block_alloc_size
+  , size_t max_block_alloc_size
+  , size_t min_superblock_size
+  , size_t max_superblock_size
+  , size_t max_block_per_superblock
+  , size_t min_total_alloc_size
+  )
+{
+  const size_t max_superblock =
+    min_block_alloc_size * max_block_per_superblock ;
+
+  if ( ( size_t(max_superblock_size) < min_superblock_size ) ||
+       ( min_total_alloc_size < min_superblock_size ) ||
+       ( max_superblock       < min_superblock_size ) ||
+       ( min_superblock_size  < max_block_alloc_size ) ||
+       ( max_block_alloc_size < min_block_alloc_size ) ) {
+
+    std::ostringstream msg ;
+
+    msg << "Kokkos::MemoryPool size constraint violation" ;
+
+    if ( size_t(max_superblock_size) < min_superblock_size ) {
+      msg << " : max_superblock_size("
+          << max_superblock_size
+          << ") < min_superblock_size("
+          << min_superblock_size << ")" ;
+    }
+
+    if ( min_total_alloc_size < min_superblock_size ) {
+      msg << " : min_total_alloc_size("
+          << min_total_alloc_size
+          << ") < min_superblock_size("
+          << min_superblock_size << ")" ;
+    }
+
+    if ( max_superblock < min_superblock_size ) {
+      msg << " : max_superblock("
+          << max_superblock
+          << ") < min_superblock_size("
+          << min_superblock_size << ")" ;
+    }
+
+    if ( min_superblock_size < max_block_alloc_size ) {
+      msg << " : min_superblock_size("
+          << min_superblock_size
+          << ") < max_block_alloc_size("
+          << max_block_alloc_size << ")" ;
+    }
+
+    if ( max_block_alloc_size < min_block_alloc_size ) {
+      msg << " : max_block_alloc_size("
+          << max_block_alloc_size
+          << ") < min_block_alloc_size("
+          << min_block_alloc_size << ")" ;
+    }
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+}
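+
+// Editor's note: an illustrative set of arguments that satisfies the
+// constraints checked above (example numbers only):
+//   min_block_alloc_size     = 64
+//   max_block_alloc_size     = 4096      ( >= min_block_alloc_size )
+//   min_superblock_size      = 4096      ( >= max_block_alloc_size )
+//   max_superblock_size      = 1 << 20   ( >= min_superblock_size )
+//   max_block_per_superblock = 1024      ( 64 * 1024 >= min_superblock_size )
+//   min_total_alloc_size     = 1 << 20   ( >= min_superblock_size )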
+
+}
+}
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b74e8a7ac0d571a763db40144cbcb55e7506e39e
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
@@ -0,0 +1,105 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_MEMORY_FENCE_HPP )
+#define KOKKOS_MEMORY_FENCE_HPP
+
+#if !defined(_OPENMP)
+#include <atomic>
+#endif
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+KOKKOS_FORCEINLINE_FUNCTION
+void memory_fence()
+{
+#if   defined( __CUDA_ARCH__ )
+  __threadfence();
+#elif defined( _OPENMP )
+  #pragma omp flush
+#else
+  std::atomic_thread_fence( std::memory_order_seq_cst );
+#endif
+}
+
+//////////////////////////////////////////////////////
+// store_fence()
+//
+// If possible use a store fence on the architecture; otherwise run a full memory fence.
+
+KOKKOS_FORCEINLINE_FUNCTION
+void store_fence()
+{
+#if defined( __CUDA_ARCH__ )
+  __threadfence();
+#elif defined( _OPENMP )
+  #pragma omp flush
+#else
+  std::atomic_thread_fence( std::memory_order_seq_cst );
+#endif
+}
+
+//////////////////////////////////////////////////////
+// load_fence()
+//
+// If possible use a load fence on the architecture; otherwise run a full memory fence.
+
+KOKKOS_FORCEINLINE_FUNCTION
+void load_fence()
+{
+#if defined( __CUDA_ARCH__ )
+  __threadfence();
+#elif defined( _OPENMP )
+  #pragma omp flush
+#else
+  std::atomic_thread_fence( std::memory_order_seq_cst );
+#endif
+}
+
+} // namespace Kokkos
+
+#endif
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_OldMacros.hpp b/packages/kokkos/core/src/impl/Kokkos_OldMacros.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6144b5b70b0e195c001c9ebd54746ffd46b37234
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_OldMacros.hpp
@@ -0,0 +1,453 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_OLD_MACROS_HPP
+#define KOKKOS_IMPL_OLD_MACROS_HPP
+
+#ifdef KOKKOS_ATOMICS_USE_CUDA
+#ifndef KOKKOS_ENABLE_CUDA_ATOMICS
+#define KOKKOS_ENABLE_CUDA_ATOMICS KOKKOS_ATOMICS_USE_CUDA
+#endif
+#endif
+
+#ifdef KOKKOS_ATOMICS_USE_GCC
+#ifndef KOKKOS_ENABLE_GNU_ATOMICS
+#define KOKKOS_ENABLE_GNU_ATOMICS KOKKOS_ATOMICS_USE_GCC
+#endif
+#endif
+
+#ifdef KOKKOS_ATOMICS_USE_GNU
+#ifndef KOKKOS_ENABLE_GNU_ATOMICS
+#define KOKKOS_ENABLE_GNU_ATOMICS KOKKOS_ATOMICS_USE_GNU
+#endif
+#endif
+
+#ifdef KOKKOS_ATOMICS_USE_INTEL
+#ifndef KOKKOS_ENABLE_INTEL_ATOMICS
+#define KOKKOS_ENABLE_INTEL_ATOMICS KOKKOS_ATOMICS_USE_INTEL
+#endif
+#endif
+
+#ifdef KOKKOS_ATOMICS_USE_OMP31
+#ifndef KOKKOS_ENABLE_OPENMP_ATOMICS
+#define KOKKOS_ENABLE_OPENMP_ATOMICS KOKKOS_ATOMICS_USE_OMP31
+#endif
+#endif
+
+#ifdef KOKKOS_ATOMICS_USE_OPENMP31
+#ifndef KOKKOS_ENABLE_OPENMP_ATOMICS
+#define KOKKOS_ENABLE_OPENMP_ATOMICS KOKKOS_ATOMICS_USE_OPENMP31
+#endif
+#endif
+
+#ifdef KOKKOS_ATOMICS_USE_WINDOWS
+#ifndef KOKKOS_ENABLE_WINDOWS_ATOMICS
+#define KOKKOS_ENABLE_WINDOWS_ATOMICS KOKKOS_ATOMICS_USE_WINDOWS
+#endif
+#endif
+
+#ifdef KOKKOS_CUDA_CLANG_WORKAROUND
+#ifndef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
+#define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND KOKKOS_CUDA_CLANG_WORKAROUND
+#endif
+#endif
+
+#ifdef KOKKOS_CUDA_USE_LAMBDA
+#ifndef KOKKOS_ENABLE_CUDA_LAMBDA
+#define KOKKOS_ENABLE_CUDA_LAMBDA KOKKOS_CUDA_USE_LAMBDA
+#endif
+#endif
+
+#ifdef KOKKOS_CUDA_USE_LDG_INTRINSIC
+#ifndef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
+#define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC KOKKOS_CUDA_USE_LDG_INTRINSIC
+#endif
+#endif
+
+#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
+#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+#define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
+#endif
+#endif
+
+#ifdef KOKKOS_CUDA_USE_UVM
+#ifndef KOKKOS_ENABLE_CUDA_UVM
+#define KOKKOS_ENABLE_CUDA_UVM KOKKOS_CUDA_USE_UVM
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_CUDA
+#ifndef KOKKOS_ENABLE_CUDA
+#define KOKKOS_ENABLE_CUDA KOKKOS_HAVE_CUDA
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_CUDA_LAMBDA
+#ifndef KOKKOS_ENABLE_CUDA_LAMBDA
+#define KOKKOS_ENABLE_CUDA_LAMBDA KOKKOS_HAVE_CUDA_LAMBDA
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_CUDA_RDC
+#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+#define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE KOKKOS_HAVE_CUDA_RDC
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_CUSPARSE
+#ifndef KOKKOS_ENABLE_CUSPARSE
+#define KOKKOS_ENABLE_CUSPARSE KOKKOS_HAVE_CUSPARSE
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_CXX1Z
+#ifndef KOKKOS_ENABLE_CXX1Z
+#define KOKKOS_ENABLE_CXX1Z KOKKOS_HAVE_CXX1Z
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_DEBUG
+#ifndef KOKKOS_DEBUG
+#define KOKKOS_DEBUG KOKKOS_HAVE_DEBUG
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA
+#ifndef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
+#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP
+#ifndef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
+#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL
+#ifndef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
+#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS
+#ifndef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
+#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_HBWSPACE
+#ifndef KOKKOS_ENABLE_HBWSPACE
+#define KOKKOS_ENABLE_HBWSPACE KOKKOS_HAVE_HBWSPACE
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_HWLOC
+#ifndef KOKKOS_ENABLE_HWLOC
+#define KOKKOS_ENABLE_HWLOC KOKKOS_HAVE_HWLOC
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_MPI
+#ifndef KOKKOS_ENABLE_MPI
+#define KOKKOS_ENABLE_MPI KOKKOS_HAVE_MPI
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_OPENMP
+#ifndef KOKKOS_ENABLE_OPENMP
+#define KOKKOS_ENABLE_OPENMP KOKKOS_HAVE_OPENMP
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#ifndef KOKKOS_ENABLE_PRAGMA_IVDEP
+#define KOKKOS_ENABLE_PRAGMA_IVDEP KOKKOS_HAVE_PRAGMA_IVDEP
+#endif
+#endif
+
+#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+#ifndef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
+#define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
+#ifndef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
+#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT KOKKOS_HAVE_PRAGMA_LOOPCOUNT
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_PRAGMA_SIMD
+#ifndef KOKKOS_ENABLE_PRAGMA_SIMD
+#define KOKKOS_ENABLE_PRAGMA_SIMD KOKKOS_HAVE_PRAGMA_SIMD
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
+#ifndef KOKKOS_ENABLE_PRAGMA_UNROLL
+#define KOKKOS_ENABLE_PRAGMA_UNROLL KOKKOS_HAVE_PRAGMA_UNROLL
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_PRAGMA_VECTOR
+#ifndef KOKKOS_ENABLE_PRAGMA_VECTOR
+#define KOKKOS_ENABLE_PRAGMA_VECTOR KOKKOS_HAVE_PRAGMA_VECTOR
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_PTHREAD
+#ifndef KOKKOS_ENABLE_PTHREAD
+#define KOKKOS_ENABLE_PTHREAD KOKKOS_HAVE_PTHREAD
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_QTHREADS
+#ifndef KOKKOS_ENABLE_QTHREADS
+#define KOKKOS_ENABLE_QTHREADS KOKKOS_HAVE_QTHREADS
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_SERIAL
+#ifndef KOKKOS_ENABLE_SERIAL
+#define KOKKOS_ENABLE_SERIAL KOKKOS_HAVE_SERIAL
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_TYPE
+#ifndef KOKKOS_IMPL_HAS_TYPE
+#define KOKKOS_IMPL_HAS_TYPE KOKKOS_HAVE_TYPE
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_WINTHREAD
+#ifndef KOKKOS_ENABLE_WINTHREAD
+#define KOKKOS_ENABLE_WINTHREAD KOKKOS_HAVE_WINTHREAD
+#endif
+#endif
+
+#ifdef KOKKOS_HAVE_Winthread
+#ifndef KOKKOS_ENABLE_WINTHREAD
+#define KOKKOS_ENABLE_WINTHREAD KOKKOS_HAVE_Winthread
+#endif
+#endif
+
+#ifdef KOKKOS_INTEL_MM_ALLOC_AVAILABLE
+#ifndef KOKKOS_ENABLE_INTEL_MM_ALLOC
+#define KOKKOS_ENABLE_INTEL_MM_ALLOC KOKKOS_INTEL_MM_ALLOC_AVAILABLE
+#endif
+#endif
+
+#ifdef KOKKOS_MACRO_IMPL_TO_STRING
+#ifndef KOKKOS_IMPL_MACRO_TO_STRING
+#define KOKKOS_IMPL_MACRO_TO_STRING KOKKOS_MACRO_IMPL_TO_STRING
+#endif
+#endif
+
+#ifdef KOKKOS_MACRO_TO_STRING
+#ifndef KOKKOS_MACRO_TO_STRING
+#define KOKKOS_MACRO_TO_STRING KOKKOS_MACRO_TO_STRING
+#endif
+#endif
+
+#ifdef KOKKOS_MAY_ALIAS
+#ifndef KOKKOS_IMPL_MAY_ALIAS
+#define KOKKOS_IMPL_MAY_ALIAS KOKKOS_MAY_ALIAS
+#endif
+#endif
+
+#ifdef KOKKOS_MDRANGE_IVDEP
+#ifndef KOKKOS_IMPL_MDRANGE_IVDEP
+#define KOKKOS_IMPL_MDRANGE_IVDEP KOKKOS_MDRANGE_IVDEP
+#endif
+#endif
+
+
+#ifdef KOKKOS_MEMPOOL_PRINTERR
+#ifndef KOKKOS_ENABLE_MEMPOOL_PRINTERR
+#define KOKKOS_ENABLE_MEMPOOL_PRINTERR KOKKOS_MEMPOOL_PRINTERR
+#endif
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_ACTIVE_SUPERBLOCKS
+#ifndef KOKKOS_ENABLE_MEMPOOL_PRINT_ACTIVE_SUPERBLOCKS
+#define KOKKOS_ENABLE_MEMPOOL_PRINT_ACTIVE_SUPERBLOCKS KOKKOS_MEMPOOL_PRINT_ACTIVE_SUPERBLOCKS
+#endif
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO
+#ifndef KOKKOS_ENABLE_MEMPOOL_PRINT_BLOCKSIZE_INFO
+#define KOKKOS_ENABLE_MEMPOOL_PRINT_BLOCKSIZE_INFO KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO
+#endif
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_CONSTRUCTOR_INFO
+#ifndef KOKKOS_ENABLE_MEMPOOL_PRINT_CONSTRUCTOR_INFO
+#define KOKKOS_ENABLE_MEMPOOL_PRINT_CONSTRUCTOR_INFO KOKKOS_MEMPOOL_PRINT_CONSTRUCTOR_INFO
+#endif
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO
+#ifndef KOKKOS_ENABLE_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO
+#define KOKKOS_ENABLE_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO
+#endif
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_INFO
+#ifndef KOKKOS_ENABLE_MEMPOOL_PRINT_INFO
+#define KOKKOS_ENABLE_MEMPOOL_PRINT_INFO KOKKOS_MEMPOOL_PRINT_INFO
+#endif
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_PAGE_INFO
+#ifndef KOKKOS_ENABLE_MEMPOOL_PRINT_PAGE_INFO
+#define KOKKOS_ENABLE_MEMPOOL_PRINT_PAGE_INFO KOKKOS_MEMPOOL_PRINT_PAGE_INFO
+#endif
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
+#ifndef KOKKOS_ENABLE_MEMPOOL_PRINT_SUPERBLOCK_INFO
+#define KOKKOS_ENABLE_MEMPOOL_PRINT_SUPERBLOCK_INFO KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
+#endif
+#endif
+
+#ifdef KOKKOS_POSIX_MEMALIGN_AVAILABLE
+#ifndef KOKKOS_ENABLE_POSIX_MEMALIGN
+#define KOKKOS_ENABLE_POSIX_MEMALIGN KOKKOS_POSIX_MEMALIGN_AVAILABLE
+#endif
+#endif
+
+#ifdef KOKKOS_POSIX_MMAP_FLAGS
+#ifndef KOKKOS_IMPL_POSIX_MMAP_FLAGS
+#define KOKKOS_IMPL_POSIX_MMAP_FLAGS KOKKOS_POSIX_MMAP_FLAGS
+#endif
+#endif
+
+#ifdef KOKKOS_POSIX_MMAP_FLAGS_HUGE
+#ifndef KOKKOS_IMPL_POSIX_MMAP_FLAGS_HUGE
+#define KOKKOS_IMPL_POSIX_MMAP_FLAGS_HUGE KOKKOS_POSIX_MMAP_FLAGS_HUGE
+#endif
+#endif
+
+#ifdef KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT
+#ifndef KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT
+#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT
+#endif
+#endif
+
+#ifdef KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
+#ifndef KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_ENABLED
+#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_ENABLED KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
+#endif
+#endif
+
+#ifdef KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT
+#ifndef KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT
+#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT
+#endif
+#endif
+
+#ifdef KOKKOS_USE_CUDA_UVM
+#ifndef KOKKOS_ENABLE_CUDA_UVM
+#define KOKKOS_ENABLE_CUDA_UVM KOKKOS_USE_CUDA_UVM
+#endif
+#endif
+
+#ifdef KOKKOS_USE_ISA_KNC
+#ifndef KOKKOS_ENABLE_ISA_KNC
+#define KOKKOS_ENABLE_ISA_KNC KOKKOS_USE_ISA_KNC
+#endif
+#endif
+
+#ifdef KOKKOS_USE_ISA_POWERPCLE
+#ifndef KOKKOS_ENABLE_ISA_POWERPCLE
+#define KOKKOS_ENABLE_ISA_POWERPCLE KOKKOS_USE_ISA_POWERPCLE
+#endif
+#endif
+
+#ifdef KOKKOS_USE_ISA_X86_64
+#ifndef KOKKOS_ENABLE_ISA_X86_64
+#define KOKKOS_ENABLE_ISA_X86_64 KOKKOS_USE_ISA_X86_64
+#endif
+#endif
+
+#ifdef KOKKOS_USE_LIBRT
+#ifndef KOKKOS_ENABLE_LIBRT
+#define KOKKOS_ENABLE_LIBRT KOKKOS_USE_LIBRT
+#endif
+#endif
+
+#ifdef KOKKOS_VIEW_OPERATOR_VERIFY
+#ifndef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY
+#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY KOKKOS_VIEW_OPERATOR_VERIFY
+#endif
+#endif
+
+#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD )
+#ifndef KOKKOS_ENABLE_THREADS
+#define KOKKOS_ENABLE_THREADS
+#endif
+#endif
+
+//------------------------------------------------------------------------------
+// Deprecated macros
+//------------------------------------------------------------------------------
+#ifdef KOKKOS_HAVE_CXX11
+#undef KOKKOS_HAVE_CXX11
+#endif
+#ifdef KOKKOS_ENABLE_CXX11
+#undef KOKKOS_ENABLE_CXX11
+#endif
+#ifdef KOKKOS_USING_EXP_VIEW
+#undef KOKKOS_USING_EXP_VIEW
+#endif
+#ifdef KOKKOS_USING_EXPERIMENTAL_VIEW
+#undef KOKKOS_USING_EXPERIMENTAL_VIEW
+#endif
+
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_CXX11 1
+#define KOKKOS_USING_EXP_VIEW 1
+#define KOKKOS_USING_EXPERIMENTAL_VIEW 1
+
+#endif //KOKKOS_IMPL_OLD_MACROS_HPP
diff --git a/packages/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp b/packages/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..919f6769452a7378794b9c3b2050da27c52c8ed8
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp
@@ -0,0 +1,72 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_PHYSICAL_LAYOUT_HPP
+#define KOKKOS_PHYSICAL_LAYOUT_HPP
+
+#include <Kokkos_View.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+struct PhysicalLayout {
+  enum LayoutType {Left,Right,Scalar,Error};
+  LayoutType layout_type;
+  int rank;
+  long long int stride[9]; //distance between two neighboring elements in a given dimension
+
+  template< class T , class L , class D , class M >
+  PhysicalLayout( const View<T,L,D,M> & view )
+    : layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft  >::value ? Left : (
+                   is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error ))
+    , rank( view.Rank )
+    {
+      for(int i=0;i<9;i++) stride[i] = 0;
+      view.stride( stride );
+    }
+};
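+
+// Editor's note: a minimal usage sketch, assuming a View 'a' already exists:
+//
+//   Kokkos::Impl::PhysicalLayout pl( a );
+//   // pl.layout_type is Left or Right, pl.rank is the View's rank, and
+//   // pl.stride[d] is the element stride in dimension d.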
+
+}
+}
+#endif
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f2f2bc3943369d1689f03cbdeb1dc290dbc1f24
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp
@@ -0,0 +1,59 @@
+/*
+ //@HEADER
+ // ************************************************************************
+ //
+ //                        Kokkos v. 2.0
+ //              Copyright (2014) Sandia Corporation
+ //
+ // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+ // the U.S. Government retains certain rights in this software.
+ //
+ // Redistribution and use in source and binary forms, with or without
+ // modification, are permitted provided that the following conditions are
+ // met:
+ //
+ // 1. Redistributions of source code must retain the above copyright
+ // notice, this list of conditions and the following disclaimer.
+ //
+ // 2. Redistributions in binary form must reproduce the above copyright
+ // notice, this list of conditions and the following disclaimer in the
+ // documentation and/or other materials provided with the distribution.
+ //
+ // 3. Neither the name of the Corporation nor the names of the
+ // contributors may be used to endorse or promote products derived from
+ // this software without specific prior written permission.
+ //
+ // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+ // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+ // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ //
+ // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+ //
+ // ************************************************************************
+ //@HEADER
+*/
+
+#ifndef KOKKOSP_DEVICE_INFO_HPP
+#define KOKKOSP_DEVICE_INFO_HPP
+
+#include <cstdint>
+
+namespace Kokkos {
+namespace Profiling {
+
+struct KokkosPDeviceInfo {
+  uint32_t deviceID;
+};
+
+}
+}
+
+#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..51c94781505a09e42d81fb3d0a40a00d45598b25
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
@@ -0,0 +1,381 @@
+/*
+ //@HEADER
+ // ************************************************************************
+ //
+ //                        Kokkos v. 2.0
+ //              Copyright (2014) Sandia Corporation
+ //
+ // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+ // the U.S. Government retains certain rights in this software.
+ //
+ // Redistribution and use in source and binary forms, with or without
+ // modification, are permitted provided that the following conditions are
+ // met:
+ //
+ // 1. Redistributions of source code must retain the above copyright
+ // notice, this list of conditions and the following disclaimer.
+ //
+ // 2. Redistributions in binary form must reproduce the above copyright
+ // notice, this list of conditions and the following disclaimer in the
+ // documentation and/or other materials provided with the distribution.
+ //
+ // 3. Neither the name of the Corporation nor the names of the
+ // contributors may be used to endorse or promote products derived from
+ // this software without specific prior written permission.
+ //
+ // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+ // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+ // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ //
+ // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+ //
+ // ************************************************************************
+ //@HEADER
+ */
+
+#include <Kokkos_Macros.hpp>
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <cstring>
+
+namespace Kokkos {
+namespace Profiling {
+
+static initFunction initProfileLibrary = nullptr;
+static finalizeFunction finalizeProfileLibrary = nullptr;
+
+static beginFunction beginForCallee = nullptr;
+static beginFunction beginScanCallee = nullptr;
+static beginFunction beginReduceCallee = nullptr;
+static endFunction endForCallee = nullptr;
+static endFunction endScanCallee = nullptr;
+static endFunction endReduceCallee = nullptr;
+
+static pushFunction pushRegionCallee = nullptr;
+static popFunction popRegionCallee = nullptr;
+
+static allocateDataFunction allocateDataCallee = nullptr;
+static deallocateDataFunction deallocateDataCallee = nullptr;
+
+static beginDeepCopyFunction beginDeepCopyCallee = nullptr;
+static endDeepCopyFunction endDeepCopyCallee = nullptr;
+
+static createProfileSectionFunction createSectionCallee = nullptr;
+static startProfileSectionFunction startSectionCallee = nullptr;
+static stopProfileSectionFunction stopSectionCallee = nullptr;
+static destroyProfileSectionFunction destroySectionCallee = nullptr;
+
+static profileEventFunction profileEventCallee = nullptr;
+
+SpaceHandle::SpaceHandle(const char* space_name) {
+  strncpy(name,space_name,64);
+}
+
+bool profileLibraryLoaded() {
+  return (nullptr != initProfileLibrary);
+}
+
+void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
+  if(nullptr != beginForCallee) {
+    Kokkos::fence();
+    (*beginForCallee)(kernelPrefix.c_str(), devID, kernelID);
+  }
+}
+
+void endParallelFor(const uint64_t kernelID) {
+  if(nullptr != endForCallee) {
+    Kokkos::fence();
+    (*endForCallee)(kernelID);
+  }
+}
+
+void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
+  if(nullptr != beginScanCallee) {
+    Kokkos::fence();
+    (*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID);
+  }
+}
+
+void endParallelScan(const uint64_t kernelID) {
+  if(nullptr != endScanCallee) {
+    Kokkos::fence();
+    (*endScanCallee)(kernelID);
+  }
+}
+
+void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
+  if(nullptr != beginReduceCallee) {
+    Kokkos::fence();
+    (*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
+  }
+}
+
+void endParallelReduce(const uint64_t kernelID) {
+  if(nullptr != endReduceCallee) {
+    Kokkos::fence();
+    (*endReduceCallee)(kernelID);
+  }
+}
+
+
+void pushRegion(const std::string& kName) {
+  if( nullptr != pushRegionCallee ) {
+    Kokkos::fence();
+    (*pushRegionCallee)(kName.c_str());
+  }
+}
+
+void popRegion() {
+  if( nullptr != popRegionCallee ) {
+    Kokkos::fence();
+    (*popRegionCallee)();
+  }
+}
+
+void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
+  if(nullptr != allocateDataCallee) {
+    (*allocateDataCallee)(space,label.c_str(),ptr,size);
+  }
+}
+
+void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
+  if(nullptr != deallocateDataCallee) {
+    (*deallocateDataCallee)(space,label.c_str(),ptr,size);
+  }
+}
+
+void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, const void* dst_ptr,
+    const SpaceHandle src_space, const std::string src_label, const void* src_ptr,
+    const uint64_t size) {
+  if(nullptr != beginDeepCopyCallee) {
+    (*beginDeepCopyCallee)(dst_space, dst_label.c_str(), dst_ptr,
+                      src_space, src_label.c_str(), src_ptr,
+                      size);
+  }
+}
+
+void endDeepCopy() {
+  if(nullptr != endDeepCopyCallee) {
+    (*endDeepCopyCallee)();
+  }
+}
+
+void createProfileSection(const std::string& sectionName, uint32_t* secID) {
+
+	if(nullptr != createSectionCallee) {
+		(*createSectionCallee)(sectionName.c_str(), secID);
+	}
+}
+
+void startSection(const uint32_t secID) {
+	if(nullptr != startSectionCallee) {
+		(*startSectionCallee)(secID);
+	}
+}
+
+void stopSection(const uint32_t secID) {
+	if(nullptr != stopSectionCallee) {
+		(*stopSectionCallee)(secID);
+	}
+}
+
+void destroyProfileSection(const uint32_t secID) {
+	if(nullptr != destroySectionCallee) {
+		(*destroySectionCallee)(secID);
+	}
+}
+
+void markEvent(const std::string& eventName) {
+	if(nullptr != profileEventCallee) {
+		(*profileEventCallee)(eventName.c_str());
+	}
+}
+
+void initialize() {
+
+  // Make sure the initialize call happens only once
+  static int is_initialized = 0;
+  if(is_initialized) return;
+  is_initialized = 1;
+
+  void* firstProfileLibrary;
+
+  char* envProfileLibrary  = getenv("KOKKOS_PROFILE_LIBRARY");
+
+  // If we do not find a profiling library in the environment then exit
+  // early.
+  if( nullptr == envProfileLibrary ) {
+    return ;
+  }
+
+  char* envProfileCopy = (char*) malloc(sizeof(char) * (strlen(envProfileLibrary) + 1));
+  sprintf(envProfileCopy, "%s", envProfileLibrary);
+
+  char* profileLibraryName = strtok(envProfileCopy, ";");
+
+  if( (nullptr != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) {
+    firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL);
+
+    if(nullptr == firstProfileLibrary) {
+      std::cerr << "Error: Unable to load KokkosP library: " <<
+        profileLibraryName << std::endl;
+      std::cerr << "dlopen(" << profileLibraryName << ", RTLD_NOW | RTLD_GLOBAL) failed with "
+        << dlerror() << '\n';
+    } else {
+#ifdef KOKKOS_ENABLE_PROFILING_LOAD_PRINT
+      std::cout << "KokkosP: Library Loaded: " << profileLibraryName << std::endl;
+#endif
+
+      // dlsym returns a pointer to an object, while we want to assign to a pointer to function.
+      // A direct cast would give warnings, so we work around the issue by casting pointers to pointers.
+      auto p1 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_for");
+      beginForCallee = *((beginFunction*) &p1);
+      auto p2 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_scan");
+      beginScanCallee = *((beginFunction*) &p2);
+      auto p3 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_reduce");
+      beginReduceCallee = *((beginFunction*) &p3);
+
+      auto p4 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_scan");
+      endScanCallee = *((endFunction*) &p4);
+      auto p5 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_for");
+      endForCallee = *((endFunction*) &p5);
+      auto p6 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_reduce");
+      endReduceCallee = *((endFunction*) &p6);
+
+      auto p7 = dlsym(firstProfileLibrary, "kokkosp_init_library");
+      initProfileLibrary = *((initFunction*) &p7);
+      auto p8 = dlsym(firstProfileLibrary, "kokkosp_finalize_library");
+      finalizeProfileLibrary = *((finalizeFunction*) &p8);
+
+      auto p9 = dlsym(firstProfileLibrary, "kokkosp_push_profile_region");
+      pushRegionCallee = *((pushFunction*) &p9);
+      auto p10 = dlsym(firstProfileLibrary, "kokkosp_pop_profile_region");
+      popRegionCallee = *((popFunction*) &p10);
+
+      auto p11 = dlsym(firstProfileLibrary, "kokkosp_allocate_data");
+      allocateDataCallee = *((allocateDataFunction*) &p11);
+      auto p12 = dlsym(firstProfileLibrary, "kokkosp_deallocate_data");
+      deallocateDataCallee = *((deallocateDataFunction*) &p12);
+
+      auto p13 = dlsym(firstProfileLibrary, "kokkosp_begin_deep_copy");
+      beginDeepCopyCallee = *((beginDeepCopyFunction*) &p13);
+      auto p14 = dlsym(firstProfileLibrary, "kokkosp_end_deep_copy");
+      endDeepCopyCallee = *((endDeepCopyFunction*) &p14);
+      
+      auto p15 = dlsym(firstProfileLibrary, "kokkosp_create_profile_section");
+      createSectionCallee = *((createProfileSectionFunction*) &p15);
+      auto p16 = dlsym(firstProfileLibrary, "kokkosp_start_profile_section");
+      startSectionCallee = *((startProfileSectionFunction*) &p16);
+      auto p17 = dlsym(firstProfileLibrary, "kokkosp_stop_profile_section");
+      stopSectionCallee = *((stopProfileSectionFunction*) &p17);      
+      auto p18 = dlsym(firstProfileLibrary, "kokkosp_destroy_profile_section");
+      destroySectionCallee = *((destroyProfileSectionFunction*) &p18);
+      
+      auto p19 = dlsym(firstProfileLibrary, "kokkosp_profile_event");
+      profileEventCallee = *((profileEventFunction*) &p19);
+    }
+  }
+
+  if(nullptr != initProfileLibrary) {
+    (*initProfileLibrary)(0,
+        (uint64_t) KOKKOSP_INTERFACE_VERSION,
+        (uint32_t) 0,
+        nullptr);
+  }
+
+  free(envProfileCopy);
+}
+
+void finalize() {
+  // Make sure the finalize call happens only once
+  static int is_finalized = 0;
+  if(is_finalized) return;
+  is_finalized = 1;
+
+  if(nullptr != finalizeProfileLibrary) {
+    (*finalizeProfileLibrary)();
+
+    // Set all profile hooks to nullptr to prevent
+    // any additional calls. Once we are told to
+    // finalize, we mean it
+    initProfileLibrary = nullptr;
+    finalizeProfileLibrary = nullptr;
+
+    beginForCallee = nullptr;
+    beginScanCallee = nullptr;
+    beginReduceCallee = nullptr;
+    endScanCallee = nullptr;
+    endForCallee = nullptr;
+    endReduceCallee = nullptr;
+
+    pushRegionCallee = nullptr;
+    popRegionCallee = nullptr;
+
+    allocateDataCallee = nullptr;
+    deallocateDataCallee = nullptr;
+
+    beginDeepCopyCallee = nullptr;
+    endDeepCopyCallee = nullptr;
+    
+    createSectionCallee = nullptr;
+    startSectionCallee = nullptr;
+    stopSectionCallee = nullptr;
+    destroySectionCallee = nullptr;
+
+    profileEventCallee = nullptr;
+  }
+}
+}
+}
+
+#else
+
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <cstring>
+
+namespace Kokkos {
+namespace Profiling {
+
+bool profileLibraryLoaded() { return false; }
+
+
+void beginParallelFor(const std::string& , const uint32_t , uint64_t* ) {}
+void endParallelFor(const uint64_t ) {}
+void beginParallelScan(const std::string& , const uint32_t , uint64_t* ) {}
+void endParallelScan(const uint64_t ) {}
+void beginParallelReduce(const std::string& , const uint32_t , uint64_t* ) {}
+void endParallelReduce(const uint64_t ) {}
+
+void pushRegion(const std::string& ) {}
+void popRegion() {}
+void createProfileSection(const std::string& , uint32_t* ) {}
+void startSection(const uint32_t ) {}
+void stopSection(const uint32_t ) {}
+void destroyProfileSection(const uint32_t ) {}
+
+void markEvent(const std::string& ) {}
+
+void allocateData(const SpaceHandle , const std::string , const void* , const uint64_t ) {}
+void deallocateData(const SpaceHandle , const std::string , const void* , const uint64_t ) {}
+
+void beginDeepCopy(const SpaceHandle , const std::string , const void* , 
+    const SpaceHandle , const std::string , const void* ,
+    const uint64_t ) {}
+void endDeepCopy() {}
+
+void initialize() {}
+void finalize() {}
+
+}} // end namespace Kokkos::Profiling
+
+#endif
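
The initialize() routine above resolves each kokkosp_* hook with dlsym and, as its comment notes, casts the returned void* through a pointer-to-function-pointer to avoid pedantic cast warnings. A minimal sketch of that idiom, assuming a hypothetical tool path libmytool.so and only the push-region hook:

```cpp
#include <dlfcn.h>
#include <cstdio>

// Hypothetical callback signature matching the kokkosp_push_profile_region hook.
typedef void (*pushFunction)(const char*);

int main() {
  // Illustrative library name; in Kokkos the path comes from KOKKOS_PROFILE_LIBRARY.
  void* lib = dlopen("libmytool.so", RTLD_NOW | RTLD_GLOBAL);
  if (nullptr == lib) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }

  // dlsym returns a void* (pointer to object); assigning it directly to a
  // function pointer is not strictly conforming and draws warnings, so we
  // cast the address of the void* and dereference -- the same trick used above.
  void* sym = dlsym(lib, "kokkosp_push_profile_region");
  pushFunction push = *reinterpret_cast<pushFunction*>(&sym);

  if (nullptr != push) (*push)("example_region");
  dlclose(lib);
  return 0;
}
```

At run time the hooks are activated the same way initialize() expects: point KOKKOS_PROFILE_LIBRARY at the tool's shared object before launching the application.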
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b245db09a723ffe62a9d24d0455c64ba5d02506c
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
@@ -0,0 +1,177 @@
+/*
+ //@HEADER
+ // ************************************************************************
+ //
+ //                        Kokkos v. 2.0
+ //              Copyright (2014) Sandia Corporation
+ //
+ // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+ // the U.S. Government retains certain rights in this software.
+ //
+ // Redistribution and use in source and binary forms, with or without
+ // modification, are permitted provided that the following conditions are
+ // met:
+ //
+ // 1. Redistributions of source code must retain the above copyright
+ // notice, this list of conditions and the following disclaimer.
+ //
+ // 2. Redistributions in binary form must reproduce the above copyright
+ // notice, this list of conditions and the following disclaimer in the
+ // documentation and/or other materials provided with the distribution.
+ //
+ // 3. Neither the name of the Corporation nor the names of the
+ // contributors may be used to endorse or promote products derived from
+ // this software without specific prior written permission.
+ //
+ // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+ // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+ // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ //
+ // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+ //
+ // ************************************************************************
+ //@HEADER
+ */
+
+#ifndef KOKKOSP_INTERFACE_HPP
+#define KOKKOSP_INTERFACE_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <cinttypes>
+#include <cstddef>
+#include <Kokkos_Core_fwd.hpp>
+#include <string>
+
+#include <iostream>
+#include <cstdlib>
+
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <dlfcn.h>
+
+#include <impl/Kokkos_Profiling_DeviceInfo.hpp>
+
+#define KOKKOSP_INTERFACE_VERSION 20171029
+
+namespace Kokkos {
+namespace Profiling {
+
+struct SpaceHandle {
+  SpaceHandle(const char* space_name);
+  char name[64];
+};
+
+typedef void (*initFunction)(const int,
+                             const uint64_t,
+                             const uint32_t,
+                             KokkosPDeviceInfo*);
+typedef void (*finalizeFunction)();
+typedef void (*beginFunction)(const char*, const uint32_t, uint64_t*);
+typedef void (*endFunction)(uint64_t);
+
+typedef void (*pushFunction)(const char*);
+typedef void (*popFunction)();
+
+typedef void (*allocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
+typedef void (*deallocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
+
+typedef void (*createProfileSectionFunction)(const char*, uint32_t*);
+typedef void (*startProfileSectionFunction)(const uint32_t);
+typedef void (*stopProfileSectionFunction)(const uint32_t);
+typedef void (*destroyProfileSectionFunction)(const uint32_t);
+
+typedef void (*profileEventFunction)(const char*);
+
+typedef void (*beginDeepCopyFunction)(
+    SpaceHandle, const char*, const void*,
+    SpaceHandle, const char*, const void*,
+    uint64_t);
+typedef void (*endDeepCopyFunction)();
+
+bool profileLibraryLoaded();
+
+void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
+void endParallelFor(const uint64_t kernelID);
+void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
+void endParallelScan(const uint64_t kernelID);
+void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
+void endParallelReduce(const uint64_t kernelID);
+
+void pushRegion(const std::string& kName);
+void popRegion();
+
+void createProfileSection(const std::string& sectionName, uint32_t* secID);
+void startSection(const uint32_t secID);
+void stopSection(const uint32_t secID);
+void destroyProfileSection(const uint32_t secID);
+
+void markEvent(const std::string& evName);
+
+void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
+void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
+
+void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, const void* dst_ptr,
+    const SpaceHandle src_space, const std::string src_label, const void* src_ptr,
+    const uint64_t size);
+void endDeepCopy();
+
+void initialize();
+void finalize();
+
+}
+}
+
+#else
+namespace Kokkos {
+namespace Profiling {
+
+struct SpaceHandle {
+  SpaceHandle(const char* space_name);
+  char name[64];
+};
+
+
+bool profileLibraryLoaded();
+
+
+void beginParallelFor(const std::string& , const uint32_t , uint64_t* );
+void endParallelFor(const uint64_t );
+void beginParallelScan(const std::string& , const uint32_t , uint64_t* );
+void endParallelScan(const uint64_t );
+void beginParallelReduce(const std::string& , const uint32_t , uint64_t* );
+void endParallelReduce(const uint64_t );
+
+void pushRegion(const std::string& );
+void popRegion();
+void createProfileSection(const std::string& , uint32_t* );
+void startSection(const uint32_t );
+void stopSection(const uint32_t );
+void destroyProfileSection(const uint32_t );
+
+void markEvent(const std::string& );
+
+void allocateData(const SpaceHandle , const std::string , const void* , const uint64_t );
+void deallocateData(const SpaceHandle , const std::string , const void* , const uint64_t );
+
+void beginDeepCopy(const SpaceHandle , const std::string , const void* , 
+    const SpaceHandle , const std::string , const void* ,
+    const uint64_t );
+void endDeepCopy();
+
+void initialize();
+void finalize();
+
+}
+}
+
+#endif
+#endif
+
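
The typedefs above fix the C ABI a profiling tool must export. A minimal sketch of such a tool, under the assumption that it only cares about parallel_for, with C linkage so the dlsym lookups in initialize() can resolve the symbols (file name and output format are illustrative):

```cpp
// minimal_tool.cpp -- build as a shared object, e.g.:
//   g++ -shared -fPIC minimal_tool.cpp -o libminimaltool.so
#include <cstdint>
#include <cstdio>

extern "C" void kokkosp_begin_parallel_for(const char* name,
                                           const uint32_t devID,
                                           uint64_t* kernID) {
  static uint64_t next_id = 0;
  *kernID = next_id++;  // Kokkos hands this id back in the matching end hook
  std::printf("KokkosP: begin parallel_for %s (device %u, id %llu)\n",
              name, (unsigned)devID, (unsigned long long)*kernID);
}

extern "C" void kokkosp_end_parallel_for(const uint64_t kernID) {
  std::printf("KokkosP: end parallel_for id %llu\n", (unsigned long long)kernID);
}
```

Hooks that the tool does not export simply resolve to nullptr in initialize() and are skipped by the null checks in the dispatch functions.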
diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial.cpp b/packages/kokkos/core/src/impl/Kokkos_Serial.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c3fcf778a68f443674e53d8370a2547f7201e8dd
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Serial.cpp
@@ -0,0 +1,195 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_SERIAL )
+
+#include <cstdlib>
+#include <sstream>
+#include <Kokkos_Serial.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+#include <impl/Kokkos_SharedAlloc.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+HostThreadTeamData g_serial_thread_team_data ;
+
+bool g_serial_is_initialized = false;
+
+}
+
+// Resize thread team data scratch memory
+void serial_resize_thread_team_data( size_t pool_reduce_bytes
+                                   , size_t team_reduce_bytes
+                                   , size_t team_shared_bytes
+                                   , size_t thread_local_bytes )
+{
+  if ( pool_reduce_bytes < 512 ) pool_reduce_bytes = 512 ;
+  if ( team_reduce_bytes < 512 ) team_reduce_bytes = 512 ;
+
+  const size_t old_pool_reduce  = g_serial_thread_team_data.pool_reduce_bytes();
+  const size_t old_team_reduce  = g_serial_thread_team_data.team_reduce_bytes();
+  const size_t old_team_shared  = g_serial_thread_team_data.team_shared_bytes();
+  const size_t old_thread_local = g_serial_thread_team_data.thread_local_bytes();
+  const size_t old_alloc_bytes  = g_serial_thread_team_data.scratch_bytes();
+
+  // Allocate if any of the old allocations is too small:
+
+  const bool allocate = ( old_pool_reduce  < pool_reduce_bytes ) ||
+                        ( old_team_reduce  < team_reduce_bytes ) ||
+                        ( old_team_shared  < team_shared_bytes ) ||
+                        ( old_thread_local < thread_local_bytes );
+
+  if ( allocate ) {
+
+    Kokkos::HostSpace space ;
+
+    if ( old_alloc_bytes ) {
+      g_serial_thread_team_data.disband_team();
+      g_serial_thread_team_data.disband_pool();
+
+      space.deallocate( g_serial_thread_team_data.scratch_buffer()
+                      , g_serial_thread_team_data.scratch_bytes() );
+    }
+
+    if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
+    if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
+    if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
+    if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }
+
+    const size_t alloc_bytes =
+      HostThreadTeamData::scratch_size( pool_reduce_bytes
+                                      , team_reduce_bytes
+                                      , team_shared_bytes
+                                      , thread_local_bytes );
+
+    void * const ptr = space.allocate( alloc_bytes );
+
+    g_serial_thread_team_data.
+      scratch_assign( ((char *)ptr)
+                    , alloc_bytes
+                    , pool_reduce_bytes
+                    , team_reduce_bytes
+                    , team_shared_bytes
+                    , thread_local_bytes );
+
+    HostThreadTeamData * pool[1] = { & g_serial_thread_team_data };
+
+    g_serial_thread_team_data.organize_pool( pool , 1 );
+    g_serial_thread_team_data.organize_team(1);
+  }
+}
+
+HostThreadTeamData * serial_get_thread_team_data()
+{
+  return & g_serial_thread_team_data ;
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+bool Serial::is_initialized()
+{
+  return Impl::g_serial_is_initialized ;
+}
+
+void Serial::initialize( unsigned threads_count
+                       , unsigned use_numa_count
+                       , unsigned use_cores_per_numa
+                       , bool allow_asynchronous_threadpool )
+{
+  (void) threads_count;
+  (void) use_numa_count;
+  (void) use_cores_per_numa;
+  (void) allow_asynchronous_threadpool;
+
+  Impl::SharedAllocationRecord< void, void >::tracking_enable();
+
+  // Init the array of locks used for arbitrarily sized atomics
+  Impl::init_lock_array_host_space();
+  #if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+  #endif
+
+  Impl::g_serial_is_initialized = true;
+}
+
+void Serial::finalize()
+{
+  if ( Impl::g_serial_thread_team_data.scratch_buffer() ) {
+    Impl::g_serial_thread_team_data.disband_team();
+    Impl::g_serial_thread_team_data.disband_pool();
+
+    Kokkos::HostSpace space ;
+
+    space.deallocate( Impl::g_serial_thread_team_data.scratch_buffer()
+                    , Impl::g_serial_thread_team_data.scratch_bytes() );
+
+    Impl::g_serial_thread_team_data.scratch_assign( (void*) 0, 0, 0, 0, 0, 0 );
+  }
+
+  #if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+  #endif
+
+  Impl::g_serial_is_initialized = false;
+}
+
+const char* Serial::name() { return "Serial"; }
+
+} // namespace Kokkos
+
+#else
+void KOKKOS_CORE_SRC_IMPL_SERIAL_PREVENT_LINK_ERROR() {}
+#endif // defined( KOKKOS_ENABLE_SERIAL )
+
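
serial_resize_thread_team_data() above grows the shared scratch allocation only when a request exceeds what is already held, and it never lets a component shrink below its previous size. A toy sketch of that grow-only policy, with an illustrative two-component buffer rather than the real HostThreadTeamData layout:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Grow-only scratch buffer: reallocate only when a request exceeds the current
// capacity, and carry forward the larger of the old and new sizes.
struct Scratch {
  std::size_t pool_bytes = 0;
  std::size_t team_bytes = 0;
  std::vector<char> buffer;

  void resize(std::size_t pool, std::size_t team) {
    pool = std::max<std::size_t>(pool, 512);  // enforce the same minimums as above
    team = std::max<std::size_t>(team, 512);
    if (pool > pool_bytes || team > team_bytes) {
      pool_bytes = std::max(pool_bytes, pool);
      team_bytes = std::max(team_bytes, team);
      buffer.assign(pool_bytes + team_bytes, 0);  // one backing allocation
    }
  }
};
```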
diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial_Task.cpp b/packages/kokkos/core/src/impl/Kokkos_Serial_Task.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7b85909ed53feeb3edb5d2da479ca0a6c8e24682
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Serial_Task.cpp
@@ -0,0 +1,162 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_SERIAL ) && defined( KOKKOS_ENABLE_TASKDAG )
+
+#include <Kokkos_Core.hpp>
+
+#include <impl/Kokkos_Serial_Task.hpp>
+#include <impl/Kokkos_TaskQueue_impl.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template class TaskQueue< Kokkos::Serial > ;
+
+void TaskQueueSpecialization< Kokkos::Serial >::execute
+  ( TaskQueue< Kokkos::Serial > * const queue )
+{
+  using execution_space = Kokkos::Serial ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< void , void , void > ;
+  using Member          = Impl::HostThreadTeamMember< execution_space > ;
+
+  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+  // Set default buffers
+  serial_resize_thread_team_data( 0   /* global reduce buffer */
+                                , 512 /* team reduce buffer */
+                                , 0   /* team shared buffer */
+                                , 0   /* thread local buffer */
+                                );
+
+  Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
+
+  Member exec( *data );
+
+  // Loop until all queues are empty
+  while ( 0 < queue->m_ready_count ) {
+
+    task_root_type * task = end ;
+
+    for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+      for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+        task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
+      }
+    }
+
+    if ( end != task ) {
+
+      // pop_ready_task left task->m_next == lock,
+      // i.e. the task is in the executing state
+
+      (*task->m_apply)( task , & exec );
+
+#if 0
+  printf( "TaskQueue<Serial>::executed: 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+#endif
+
+      // If the task respawned then it is re-enqueued; otherwise the task is complete
+      // and all tasks waiting on it are updated.
+      queue->complete( task );
+    }
+    else if ( 0 != queue->m_ready_count ) {
+      Kokkos::abort("TaskQueue<Serial>::execute ERROR: ready_count");
+    }
+  }
+}
+
+void TaskQueueSpecialization< Kokkos::Serial > ::
+  iff_single_thread_recursive_execute(
+    TaskQueue< Kokkos::Serial > * const queue )
+{
+  using execution_space = Kokkos::Serial ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< void , void , void > ;
+  using Member          = Impl::HostThreadTeamMember< execution_space > ;
+
+  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+  Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
+
+  Member exec( *data );
+
+  // Loop until no runnable task
+
+  task_root_type * task = end ;
+
+  do {
+
+    task = end ;
+
+    for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+      for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+        task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
+      }
+    }
+
+    if ( end == task ) break ;
+
+    (*task->m_apply)( task , & exec );
+
+    queue->complete( task );
+
+  } while(1);
+}
+
+}} /* namespace Kokkos::Impl */
+
+#else
+void KOKKOS_CORE_SRC_IMPL_SERIAL_TASK_PREVENT_LINK_ERROR() {}
+#endif /* #if defined( KOKKOS_ENABLE_SERIAL ) && defined( KOKKOS_ENABLE_TASKDAG ) */
+
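
Both loops above scan the NumQueue priority levels, each with two ready queues, and take the first task whose pop succeeds; the EndTag sentinel means nothing was ready on that pass. A toy sketch of that scan, with a std::deque standing in for pop_ready_task and an illustrative sentinel value:

```cpp
#include <array>
#include <cstdint>
#include <deque>

struct Task {};  // stand-in for TaskBase

// Illustrative end-of-queue sentinel, analogous to task_root_type::EndTag.
static Task* const END = reinterpret_cast<Task*>(~std::uintptr_t(0));

constexpr int NumQueue = 3;
using ReadyQueues = std::array<std::array<std::deque<Task*>, 2>, NumQueue>;

// Mirrors the nested scan in execute(): stop at the first queue that yields a task.
Task* pop_first_ready(ReadyQueues& ready) {
  Task* task = END;
  for (int i = 0; i < NumQueue && END == task; ++i) {
    for (int j = 0; j < 2 && END == task; ++j) {
      if (!ready[i][j].empty()) {
        task = ready[i][j].front();
        ready[i][j].pop_front();
      }
    }
  }
  return task;  // END means no task was runnable on this pass
}
```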
diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp b/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2fec5dfb897bb4c4e86a3976076be44cfdf5371e
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
@@ -0,0 +1,92 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_SERIAL_TASK_HPP
+#define KOKKOS_IMPL_SERIAL_TASK_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_TASKDAG )
+
+#include <impl/Kokkos_TaskQueue.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template<>
+class TaskQueueSpecialization< Kokkos::Serial >
+{
+public:
+
+  using execution_space = Kokkos::Serial ;
+  using memory_space    = Kokkos::HostSpace ;
+  using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
+  using task_base_type  = Kokkos::Impl::TaskBase< void , void , void > ;
+  using member_type     = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
+
+  static
+  void iff_single_thread_recursive_execute( queue_type * const );
+
+  static
+  void execute( queue_type * const );
+
+  template< typename TaskType >
+  static
+  typename TaskType::function_type
+  get_function_pointer() { return TaskType::apply ; }
+};
+
+extern template class TaskQueue< Kokkos::Serial > ;
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
+#endif /* #ifndef KOKKOS_IMPL_SERIAL_TASK_HPP */
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp b/packages/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..07a207b556316f34af0cd514986668c0809cfd33
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp
@@ -0,0 +1,101 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP
+#define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType ,
+                   Kokkos::WorkGraphPolicy< Traits ... > ,
+                   Kokkos::Serial
+                 >
+{
+private:
+
+  typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
+
+  Policy       m_policy ;
+  FunctorType  m_functor ;
+
+  template< class TagType >
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_one( const std::int32_t w ) const noexcept
+    { m_functor( w ); }
+
+  template< class TagType >
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_one( const std::int32_t w ) const noexcept
+    { const TagType t{}; m_functor( t , w ); }
+
+public:
+
+  inline
+  void execute() const noexcept
+    {
+      // Spin until COMPLETED_TOKEN.
+      // END_TOKEN indicates no work is currently available.
+        
+      for ( std::int32_t w = Policy::END_TOKEN ;
+            Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
+        if ( Policy::END_TOKEN != w ) {
+          exec_one< typename Policy::work_tag >( w );
+          m_policy.completed_work(w);
+        }
+      }
+    }
+
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy      & arg_policy )
+    : m_policy( arg_policy )
+    , m_functor( arg_functor )
+    {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP */
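
The execute() loop above spins on pop_work(): END_TOKEN means no work item is runnable right now (its predecessors have not all completed), while COMPLETED_TOKEN ends the loop once every item is done. A toy sketch of the same token protocol, with illustrative token values and a trivial queue in place of WorkGraphPolicy:

```cpp
#include <cstdint>
#include <queue>

// Illustrative tokens; the real values live inside WorkGraphPolicy.
enum : std::int32_t { END_TOKEN = -1, COMPLETED_TOKEN = -2 };

struct ToyWorkQueue {
  std::queue<std::int32_t> ready;  // items whose predecessors have completed
  int outstanding = 0;             // items not yet completed

  std::int32_t pop_work() {
    if (outstanding == 0) return COMPLETED_TOKEN;
    if (ready.empty())    return END_TOKEN;  // nothing runnable right now
    const std::int32_t w = ready.front();
    ready.pop();
    return w;
  }
  // In the real policy this is where completing an item makes successors ready.
  void completed_work(std::int32_t) { --outstanding; }
};

// Same shape as ParallelFor::execute(): skip END_TOKEN, stop on COMPLETED_TOKEN.
template <class Functor>
void run(ToyWorkQueue& q, Functor&& f) {
  for (std::int32_t w; COMPLETED_TOKEN != (w = q.pop_work()); ) {
    if (END_TOKEN != w) {
      f(w);
      q.completed_work(w);
    }
  }
}
```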
diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4452bb42731cf797853e0b019bb74c60f35367e6
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
@@ -0,0 +1,326 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+__thread int SharedAllocationRecord<void, void>::t_tracking_enabled = 1;
+
+bool
+SharedAllocationRecord< void , void >::
+is_sane( SharedAllocationRecord< void , void > * arg_record )
+{
+  constexpr static SharedAllocationRecord * zero = 0 ;
+
+  SharedAllocationRecord * const root = arg_record ? arg_record->m_root : 0 ;
+
+  bool ok = root != 0 && root->use_count() == 0 ;
+
+  if ( ok ) {
+    SharedAllocationRecord * root_next = 0 ;
+
+    // Lock the list:
+    while ( ( root_next = Kokkos::atomic_exchange( & root->m_next , zero ) ) == zero );
+
+    for ( SharedAllocationRecord * rec = root_next ; ok && rec != root ; rec = rec->m_next ) {
+      const bool ok_non_null  = rec && rec->m_prev && ( rec == root || rec->m_next );
+      const bool ok_root      = ok_non_null && rec->m_root == root ;
+      const bool ok_prev_next = ok_non_null && ( rec->m_prev != root ? rec->m_prev->m_next == rec : root_next == rec );
+      const bool ok_next_prev = ok_non_null && rec->m_next->m_prev == rec ;
+      const bool ok_count     = ok_non_null && 0 <= rec->use_count() ;
+
+      ok = ok_root && ok_prev_next && ok_next_prev && ok_count ;
+
+      if ( ! ok ) {
+        // Formatting dependent on sizeof(uintptr_t)
+        const char * format_string;
+
+        if (sizeof(uintptr_t) == sizeof(unsigned long)) {
+          format_string = "Kokkos::Impl::SharedAllocationRecord failed is_sane: rec(0x%.12lx){ m_count(%d) m_root(0x%.12lx) m_next(0x%.12lx) m_prev(0x%.12lx) m_next->m_prev(0x%.12lx) m_prev->m_next(0x%.12lx) }\n";
+        }
+        else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
+          format_string = "Kokkos::Impl::SharedAllocationRecord failed is_sane: rec(0x%.12llx){ m_count(%d) m_root(0x%.12llx) m_next(0x%.12llx) m_prev(0x%.12llx) m_next->m_prev(0x%.12llx) m_prev->m_next(0x%.12llx) }\n";
+        }
+
+        fprintf(stderr
+              , format_string
+              , reinterpret_cast< uintptr_t >( rec )
+              , rec->use_count()
+              , reinterpret_cast< uintptr_t >( rec->m_root )
+              , reinterpret_cast< uintptr_t >( rec->m_next )
+              , reinterpret_cast< uintptr_t >( rec->m_prev )
+              , reinterpret_cast< uintptr_t >( rec->m_next != NULL ? rec->m_next->m_prev : NULL )
+              , reinterpret_cast< uintptr_t >( rec->m_prev != rec->m_root ? rec->m_prev->m_next : root_next )
+              );
+      }
+
+    }
+
+    if ( zero != Kokkos::atomic_exchange( & root->m_next , root_next ) ) {
+      Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::SharedAllocationRecord failed is_sane unlocking");
+    }
+  }
+
+  return ok ;
+}
+
+SharedAllocationRecord<void,void> *
+SharedAllocationRecord<void,void>::find( SharedAllocationRecord<void,void> * const arg_root , void * const arg_data_ptr )
+{
+  constexpr static SharedAllocationRecord * zero = 0 ;
+
+  SharedAllocationRecord * root_next = 0 ;
+
+  // Lock the list:
+  while ( ( root_next = Kokkos::atomic_exchange( & arg_root->m_next , zero ) ) == zero );
+
+  // Iterate searching for the record with this data pointer
+
+  SharedAllocationRecord * r = root_next ;
+
+  while ( ( r != arg_root ) && ( r->data() != arg_data_ptr ) ) { r = r->m_next ; }
+
+  if ( r == arg_root ) { r = 0 ; }
+
+  if ( zero != Kokkos::atomic_exchange( & arg_root->m_next , root_next ) ) {
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::SharedAllocationRecord failed locking/unlocking");
+  }
+
+  return r ;
+}
+
+
+/**\brief  Construct and insert into 'arg_root' tracking set.
+ *         use_count is zero.
+ */
+SharedAllocationRecord< void , void >::
+SharedAllocationRecord( SharedAllocationRecord<void,void> * arg_root
+                      , SharedAllocationHeader            * arg_alloc_ptr
+                      , size_t                              arg_alloc_size
+                      , SharedAllocationRecord< void , void >::function_type  arg_dealloc
+                      )
+  : m_alloc_ptr(  arg_alloc_ptr )
+  , m_alloc_size( arg_alloc_size )
+  , m_dealloc(    arg_dealloc )
+  , m_root( arg_root )
+  , m_prev( 0 )
+  , m_next( 0 )
+  , m_count( 0 )
+{
+  constexpr static SharedAllocationRecord * zero = 0 ;
+
+  if ( 0 != arg_alloc_ptr ) {
+
+    // Insert into the root double-linked list for tracking
+    //
+    // before:  arg_root->m_next == next ; next->m_prev == arg_root
+    // after:   arg_root->m_next == this ; this->m_prev == arg_root ;
+    //              this->m_next == next ; next->m_prev == this
+
+    m_prev = m_root ;
+
+    // Read root->m_next and lock by setting to zero
+    while ( ( m_next = Kokkos::atomic_exchange( & m_root->m_next , zero ) ) == zero );
+
+    m_next->m_prev = this ;
+
+    // memory fence before completing insertion into linked list
+    Kokkos::memory_fence();
+
+    if ( zero != Kokkos::atomic_exchange( & m_root->m_next , this ) ) {
+      Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::SharedAllocationRecord failed locking/unlocking");
+    }
+  }
+  else {
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::SharedAllocationRecord given NULL allocation");
+  }
+}
+
+void
+SharedAllocationRecord< void , void >::
+increment( SharedAllocationRecord< void , void > * arg_record )
+{
+  const int old_count = Kokkos::atomic_fetch_add( & arg_record->m_count , 1 );
+
+  if ( old_count < 0 ) { // Error
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::SharedAllocationRecord failed increment");
+  }
+}
+
+SharedAllocationRecord< void , void > *
+SharedAllocationRecord< void , void >::
+decrement( SharedAllocationRecord< void , void > * arg_record )
+{
+  constexpr static SharedAllocationRecord * zero = 0 ;
+
+  const int old_count = Kokkos::atomic_fetch_add( & arg_record->m_count , -1 );
+
+#if 0
+  if ( old_count <= 1 ) {
+    fprintf(stderr,"Kokkos::Impl::SharedAllocationRecord '%s' at 0x%lx delete count = %d\n", arg_record->m_alloc_ptr->m_label , (unsigned long) arg_record , old_count );
+    fflush(stderr);
+  }
+#endif
+
+
+  if ( old_count == 1 ) {
+
+    // before:  arg_record->m_prev->m_next == arg_record  &&
+    //          arg_record->m_next->m_prev == arg_record
+    //
+    // after:   arg_record->m_prev->m_next == arg_record->m_next  &&
+    //          arg_record->m_next->m_prev == arg_record->m_prev
+
+    SharedAllocationRecord * root_next = 0 ;
+
+    // Lock the list:
+    while ( ( root_next = Kokkos::atomic_exchange( & arg_record->m_root->m_next , zero ) ) == zero );
+
+    arg_record->m_next->m_prev = arg_record->m_prev ;
+
+    if ( root_next != arg_record ) {
+      arg_record->m_prev->m_next = arg_record->m_next ;
+    }
+    else {
+      // before:  arg_record->m_root == arg_record->m_prev
+      // after:   arg_record->m_root == arg_record->m_next
+      root_next = arg_record->m_next ;
+    }
+
+    Kokkos::memory_fence();
+
+    // Unlock the list:
+    if ( zero != Kokkos::atomic_exchange( & arg_record->m_root->m_next , root_next ) ) {
+      Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::SharedAllocationRecord failed decrement unlocking");
+    }
+
+    arg_record->m_next = 0 ;
+    arg_record->m_prev = 0 ;
+
+    function_type d = arg_record->m_dealloc ;
+    (*d)( arg_record );
+    arg_record = 0 ;
+  }
+  else if ( old_count < 1 ) { // Error
+    fprintf(stderr,"Kokkos::Impl::SharedAllocationRecord '%s' failed decrement count = %d\n", arg_record->m_alloc_ptr->m_label , old_count );
+    fflush(stderr);
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::SharedAllocationRecord failed decrement count");
+  }
+
+  return arg_record ;
+}
+
+void
+SharedAllocationRecord< void , void >::
+print_host_accessible_records( std::ostream & s
+                             , const char * const space_name
+                             , const SharedAllocationRecord * const root
+                             , const bool detail )
+{
+  const SharedAllocationRecord< void , void > * r = root ;
+
+  char buffer[256] ;
+
+  if ( detail ) {
+    do {
+      //Formatting dependent on sizeof(uintptr_t)
+      const char * format_string;
+
+      if (sizeof(uintptr_t) == sizeof(unsigned long)) {
+        format_string = "%s addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
+      }
+      else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
+        format_string = "%s addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ 0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
+      }
+
+      snprintf( buffer , 256
+              , format_string
+              , space_name
+              , reinterpret_cast<uintptr_t>( r )
+              , reinterpret_cast<uintptr_t>( r->m_prev )
+              , reinterpret_cast<uintptr_t>( r->m_next )
+              , reinterpret_cast<uintptr_t>( r->m_alloc_ptr )
+              , r->m_alloc_size
+              , r->use_count()
+              , reinterpret_cast<uintptr_t>( r->m_dealloc )
+              , r->m_alloc_ptr->m_label
+              );
+      s << buffer ;
+      r = r->m_next ;
+    } while ( r != root );
+  }
+  else {
+    do {
+      if ( r->m_alloc_ptr ) {
+        //Formatting dependent on sizeof(uintptr_t)
+        const char * format_string;
+
+        if (sizeof(uintptr_t) == sizeof(unsigned long)) {
+          format_string = "%s [ 0x%.12lx + %ld ] %s\n";
+        }
+        else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
+          format_string = "%s [ 0x%.12llx + %ld ] %s\n";
+        }
+
+        snprintf( buffer , 256
+                , format_string
+                , space_name
+                , reinterpret_cast< uintptr_t >( r->data() )
+                , r->size()
+                , r->m_alloc_ptr->m_label
+                );
+      }
+      else {
+        snprintf( buffer , 256 , "%s [ 0 + 0 ]\n" , space_name );
+      }
+      s << buffer ;
+      r = r->m_next ;
+    } while ( r != root );
+  }
+}
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
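
is_sane(), find(), the insertion constructor, and decrement() above all serialize access to the record list by using the root record's m_next pointer as a lock word: atomically swapping it to zero claims the list, and swapping the saved head back releases it. A small sketch of that idiom, assuming std::atomic in place of Kokkos::atomic_exchange:

```cpp
#include <atomic>

struct Node { std::atomic<Node*> next{nullptr}; };  // stand-in for a record's m_next

// Claim the list: spin until we observe, and simultaneously null out, the head.
Node* lock_list(Node& root) {
  Node* head = nullptr;
  while ((head = root.next.exchange(nullptr)) == nullptr) { /* spin */ }
  return head;
}

// Release the list by restoring the saved head.  Seeing anything other than
// nullptr here would mean another thread wrote while we held the lock, which
// is exactly the condition the code above turns into a runtime exception.
void unlock_list(Node& root, Node* head) {
  Node* const prev = root.next.exchange(head);
  (void)prev;  // prev != nullptr would indicate a locking protocol violation
}
```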
diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..27e3c35ec18a5a105898e75dda97792586629db5
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
@@ -0,0 +1,420 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SHARED_ALLOC_HPP
+#define KOKKOS_SHARED_ALLOC_HPP
+
+#include <cstdint>
+#include <string>
+
+namespace Kokkos {
+namespace Impl {
+
+template< class MemorySpace = void , class DestroyFunctor = void >
+class SharedAllocationRecord ;
+
+class SharedAllocationHeader {
+private:
+
+  typedef SharedAllocationRecord<void,void>  Record ;
+
+  static constexpr unsigned maximum_label_length = ( 1u << 7 /* 128 */ ) - sizeof(Record*);
+
+  template< class , class > friend class SharedAllocationRecord ;
+
+  Record * m_record ;
+  char     m_label[ maximum_label_length ];
+
+public:
+
+  /* Given a pointer to the user's memory, get a pointer to the header */
+  KOKKOS_INLINE_FUNCTION static
+  const SharedAllocationHeader * get_header( void * alloc_ptr )
+    { return reinterpret_cast<SharedAllocationHeader*>( reinterpret_cast<char*>(alloc_ptr) - sizeof(SharedAllocationHeader) ); }
+
+  KOKKOS_INLINE_FUNCTION
+  const char* label() const { return m_label; }
+};
+
+template<>
+class SharedAllocationRecord< void , void > {
+protected:
+
+  static_assert( sizeof(SharedAllocationHeader) == ( 1u << 7 /* 128 */ ) , "sizeof(SharedAllocationHeader) != 128" );
+
+  template< class , class > friend class SharedAllocationRecord ;
+
+  typedef void (* function_type )( SharedAllocationRecord<void,void> * );
+
+  SharedAllocationHeader * const m_alloc_ptr ;
+  size_t                   const m_alloc_size ;
+  function_type            const m_dealloc ;
+  SharedAllocationRecord * const m_root ;
+  SharedAllocationRecord *       m_prev ;
+  SharedAllocationRecord *       m_next ;
+  int                            m_count ;
+
+  SharedAllocationRecord( SharedAllocationRecord && ) = delete ;
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( SharedAllocationRecord && ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  /**\brief  Construct and insert into 'arg_root' tracking set.
+   *         use_count is zero.
+   */
+  SharedAllocationRecord( SharedAllocationRecord * arg_root
+                        , SharedAllocationHeader * arg_alloc_ptr
+                        , size_t                   arg_alloc_size
+                        , function_type            arg_dealloc
+                        );
+private:
+  
+  static __thread int t_tracking_enabled;
+
+public:
+  inline std::string get_label() const { return std::string("Unmanaged"); }
+
+  static int tracking_enabled() { return t_tracking_enabled; }
+
+  /**\brief A host process thread claims and disables the
+   *        shared allocation tracking flag.
+   */
+  static void tracking_disable() { t_tracking_enabled = 0; }
+
+  /**\brief A host process thread releases and enables the
+   *        shared allocation tracking flag.
+   */
+  static void tracking_enable() { t_tracking_enabled = 1; }
+
+  ~SharedAllocationRecord() = default ;
+
+  SharedAllocationRecord()
+    : m_alloc_ptr( 0 )
+    , m_alloc_size( 0 )
+    , m_dealloc( 0 )
+    , m_root( this )
+    , m_prev( this )
+    , m_next( this )
+    , m_count( 0 )
+    {}
+
+  static constexpr unsigned maximum_label_length = SharedAllocationHeader::maximum_label_length ;
+
+  KOKKOS_INLINE_FUNCTION
+  const SharedAllocationHeader * head() const { return m_alloc_ptr ; }
+
+  /* User's memory begins at the end of the header */
+  KOKKOS_INLINE_FUNCTION
+  void * data() const { return reinterpret_cast<void*>( m_alloc_ptr + 1 ); }
+
+  /* User's memory begins at the end of the header */
+  size_t size() const { return m_alloc_size - sizeof(SharedAllocationHeader) ; }
+
+  /* Cannot be 'constexpr' because 'm_count' is volatile */
+  int use_count() const { return *static_cast<const volatile int *>(&m_count); }
+
+  /* Increment use count */
+  static void increment( SharedAllocationRecord * );
+
+  /* Decrement use count. If 1->0 then remove from the tracking list and invoke m_dealloc */
+  static SharedAllocationRecord * decrement( SharedAllocationRecord * );
+
+  /* Given a root record and data pointer find the record */
+  static SharedAllocationRecord * find( SharedAllocationRecord * const , void * const );
+
+  /*  Sanity check for the whole set of records to which the input record belongs.
+   *  Locks the set's insert/erase operations until the sanity check is complete.
+   */
+  static bool is_sane( SharedAllocationRecord * );
+
+  /*  Print host-accessible records */
+  static void print_host_accessible_records( std::ostream &
+                                           , const char * const space_name
+                                           , const SharedAllocationRecord * const root
+                                           , const bool detail );
+};
+
+namespace {
+
+/* The address of this function is taken, so make sure it is unique */
+template < class MemorySpace , class DestroyFunctor >
+void deallocate( SharedAllocationRecord<void,void> * record_ptr )
+{
+  typedef SharedAllocationRecord< MemorySpace , void > base_type ;
+  typedef SharedAllocationRecord< MemorySpace , DestroyFunctor > this_type ;
+
+  this_type * const ptr = static_cast< this_type * >(
+                          static_cast< base_type * >( record_ptr ) );
+
+  ptr->m_destroy.destroy_shared_allocation();
+
+  delete ptr ;
+}
+
+}
+
+/*
+ *  Memory space specialization of SharedAllocationRecord< Space , void > requires :
+ *
+ *  SharedAllocationRecord< Space , void > : public SharedAllocationRecord< void , void >
+ *  {
+ *    // delete allocated user memory via static_cast to this type.
+ *    static void deallocate( const SharedAllocationRecord<void,void> * );
+ *    Space m_space ;
+ *  }
+ */
+template< class MemorySpace , class DestroyFunctor >
+class SharedAllocationRecord : public SharedAllocationRecord< MemorySpace , void >
+{
+private:
+
+  SharedAllocationRecord( const MemorySpace & arg_space
+                        , const std::string & arg_label
+                        , const size_t        arg_alloc
+                        )
+    /*  Allocate user memory as [ SharedAllocationHeader , user_memory ] */
+    : SharedAllocationRecord< MemorySpace , void >( arg_space , arg_label , arg_alloc , & Kokkos::Impl::deallocate< MemorySpace , DestroyFunctor > )
+    , m_destroy()
+    {}
+
+  SharedAllocationRecord() = delete ;
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+public:
+
+  DestroyFunctor  m_destroy ;
+
+  // Allocate with a zero use count.  Incrementing the use count from zero to one
+  // inserts the record into the tracking list.  Decrementing the count from one to zero
+  // removes it from the tracking list and deallocates.
+  KOKKOS_INLINE_FUNCTION static
+  SharedAllocationRecord * allocate( const MemorySpace & arg_space
+                                   , const std::string & arg_label
+                                   , const size_t        arg_alloc
+                                   )
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      return new SharedAllocationRecord( arg_space , arg_label , arg_alloc );
+#else
+      return (SharedAllocationRecord *) 0 ;
+#endif
+    }
+};
+
+template< class MemorySpace >
+class SharedAllocationRecord<MemorySpace,void> : public SharedAllocationRecord< void , void > {};
+
+union SharedAllocationTracker {
+private:
+
+  typedef SharedAllocationRecord<void,void>  Record ;
+
+  enum : uintptr_t { DO_NOT_DEREF_FLAG = 0x01ul };
+
+  // The allocation record resides in Host memory space
+  uintptr_t m_record_bits ;
+  Record  * m_record ;
+
+public:
+
+  // Use macros instead of inline functions to reduce
+  // pressure on compiler optimization by reducing the
+  // number of symbols and inline functions.
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_ENABLED	\
+  Record::tracking_enabled()
+
+#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT	\
+  if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::increment( m_record );
+
+#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT	\
+  if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::decrement( m_record );
+
+#else
+
+#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_ENABLED  0
+
+#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT /* */
+
+#define KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT /* */
+
+#endif
+
+#define KOKKOS_IMPL_SHARED_ALLOCATION_CARRY_RECORD_BITS(rhs, override_tracking) \
+  (((!override_tracking) || (rhs.m_record_bits & DO_NOT_DEREF_FLAG) \
+    || (!KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_ENABLED)) \
+  ? rhs.m_record_bits | DO_NOT_DEREF_FLAG \
+  : rhs.m_record_bits)
+
+  /** \brief  Assign a specialized record */
+  inline
+  void assign_allocated_record_to_uninitialized( Record * arg_record )
+    {
+      if ( arg_record ) {
+        Record::increment( m_record = arg_record );
+      }
+      else {
+        m_record_bits = DO_NOT_DEREF_FLAG ;
+      }
+    }
+
+  template< class MemorySpace >
+  constexpr
+  SharedAllocationRecord< MemorySpace , void > *
+  get_record() const noexcept
+    {
+      return ( m_record_bits & DO_NOT_DEREF_FLAG )
+             ? (SharedAllocationRecord< MemorySpace,void>*) 0
+             : static_cast<SharedAllocationRecord<MemorySpace,void>*>(m_record);
+    }
+
+  template< class MemorySpace >
+  std::string get_label() const
+    {
+      return ( m_record_bits == DO_NOT_DEREF_FLAG )
+             ? std::string()
+             : reinterpret_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record_bits & ~DO_NOT_DEREF_FLAG )->get_label()
+             ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  int use_count() const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      Record * const tmp = reinterpret_cast<Record*>( m_record_bits & ~DO_NOT_DEREF_FLAG );
+      return ( tmp ? tmp->use_count() : 0 );
+#else
+      return 0 ;
+#endif
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  bool has_record() const {
+    return (m_record_bits & (~DO_NOT_DEREF_FLAG)) != 0;
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void clear()
+    {
+      // If this is tracking then must decrement
+      KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT
+      // Reset to default constructed value.
+      m_record_bits = DO_NOT_DEREF_FLAG ;
+    }
+
+  // Copy:
+  KOKKOS_FORCEINLINE_FUNCTION
+  ~SharedAllocationTracker()
+    { KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  constexpr SharedAllocationTracker()
+    : m_record_bits( DO_NOT_DEREF_FLAG ) {}
+
+  // Move:
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  SharedAllocationTracker( SharedAllocationTracker && rhs )
+    : m_record_bits( rhs.m_record_bits )
+    { rhs.m_record_bits = DO_NOT_DEREF_FLAG ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  SharedAllocationTracker & operator = ( SharedAllocationTracker && rhs )
+    {
+      auto swap_tmp = m_record_bits;
+      m_record_bits = rhs.m_record_bits;
+      rhs.m_record_bits = swap_tmp;
+      return *this ;
+    }
+
+  // Copy:
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  SharedAllocationTracker( const SharedAllocationTracker & rhs )
+    : m_record_bits( KOKKOS_IMPL_SHARED_ALLOCATION_CARRY_RECORD_BITS(rhs, true) )
+    {
+      KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT
+    }
+
+  /** \brief  Copy construction may disable tracking. */
+  KOKKOS_FORCEINLINE_FUNCTION
+  SharedAllocationTracker( const SharedAllocationTracker & rhs
+                         , const bool enable_tracking )
+    : m_record_bits( KOKKOS_IMPL_SHARED_ALLOCATION_CARRY_RECORD_BITS(rhs, enable_tracking) )
+    { KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  SharedAllocationTracker & operator = ( const SharedAllocationTracker & rhs )
+    {
+      // If this is tracking then must decrement
+      KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT
+      m_record_bits = KOKKOS_IMPL_SHARED_ALLOCATION_CARRY_RECORD_BITS(rhs, true);
+      KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT
+      return *this ;
+    }
+
+  /** \brief  Copy assignment may disable tracking */
+  KOKKOS_FORCEINLINE_FUNCTION
+  void assign( const SharedAllocationTracker & rhs
+             , const bool enable_tracking )
+    {
+      KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT
+      m_record_bits = KOKKOS_IMPL_SHARED_ALLOCATION_CARRY_RECORD_BITS(rhs, enable_tracking);
+      KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT
+    }
+
+#undef KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_ENABLED
+#undef KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_INCREMENT
+#undef KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT
+#undef KOKKOS_IMPL_SHARED_ALLOCATION_CARRY_RECORD_BITS
+
+};
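+
+// Illustrative usage sketch (the handle type and 'record' pointer below are
+// hypothetical, not part of this header): a View-like object holds a
+// SharedAllocationTracker by value and relies on the copy / move / clear
+// operations above for reference counting.
+//
+//   SharedAllocationTracker a ;                   // empty, DO_NOT_DEREF_FLAG set
+//   a.assign_allocated_record_to_uninitialized( record );  // use count -> 1
+//   SharedAllocationTracker b( a );               // copy: use count -> 2
+//   SharedAllocationTracker c( a , false );       // copy with tracking disabled
+//   b.clear();                                    // use count -> 1, b reset to empty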
+
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#endif
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp b/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a647e89e34bc932b093c5bcb4d238b1b9d958bce
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp
@@ -0,0 +1,162 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_Spinwait.hpp>
+#include <impl/Kokkos_BitOps.hpp>
+
+#if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 )
+  #include <thread>
+#elif !defined( _WIN32 )
+  #include <sched.h>
+  #include <time.h>
+#else
+  #include <process.h>
+  #include <winsock2.h>
+  #include <windows.h>
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+void host_thread_yield( const uint32_t i , const WaitMode mode )
+{
+  static constexpr uint32_t sleep_limit = 1 << 13 ;
+  static constexpr uint32_t yield_limit = 1 << 12 ;
+
+  const int c = Kokkos::Impl::bit_scan_reverse(i);
+
+  if ( WaitMode::ROOT != mode ) {
+    if ( sleep_limit < i ) {
+
+      // Attempt to put the thread to sleep for 'c' microseconds
+
+      #if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 )
+        auto start = std::chrono::high_resolution_clock::now();
+        std::this_thread::yield();
+        std::this_thread::sleep_until( start + std::chrono::nanoseconds( c * 1000 ) );
+      #else
+        timespec req ;
+        req.tv_sec  = 0 ;
+        req.tv_nsec = 1000 * c ;
+        nanosleep( &req, nullptr );
+      #endif
+    }
+
+    else if ( mode == WaitMode::PASSIVE || yield_limit < i ) {
+
+      // Attempt to yield thread resources to runtime
+
+      #if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 )
+        std::this_thread::yield();
+      #else
+        sched_yield();
+      #endif
+    }
+
+    #if defined( KOKKOS_ENABLE_ASM )
+
+    else if ( (1u<<4) < i ) {
+
+      // Insert a few no-ops to quiet the thread:
+
+      for ( int k = 0 ; k < c ; ++k ) {
+        #if defined( __amd64 ) || defined( __amd64__ ) || \
+              defined( __x86_64 ) || defined( __x86_64__ )
+          #if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
+            asm volatile( "nop\n" );
+          #else
+            __asm__ __volatile__( "nop\n" );
+          #endif
+        #elif defined(__PPC64__)
+            asm volatile( "nop\n" );
+        #endif
+      }
+    }
+    #endif /* defined( KOKKOS_ENABLE_ASM ) */
+  }
+  #if defined( KOKKOS_ENABLE_ASM )
+  else if ( (1u<<3) < i ) {
+    // no-ops for root thread
+    for ( int k = 0 ; k < c ; ++k ) {
+      #if defined( __amd64 ) || defined( __amd64__ ) || \
+            defined( __x86_64 ) || defined( __x86_64__ )
+        #if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
+          asm volatile( "nop\n" );
+        #else
+          __asm__ __volatile__( "nop\n" );
+        #endif
+      #elif defined(__PPC64__)
+          asm volatile( "nop\n" );
+      #endif
+    }
+  }
+
+  {
+    // Insert memory pause
+    #if defined( __amd64 )  || defined( __amd64__ ) || \
+        defined( __x86_64 ) || defined( __x86_64__ )
+      #if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
+        asm volatile( "pause\n":::"memory" );
+      #else
+        __asm__ __volatile__( "pause\n":::"memory" );
+      #endif
+    #elif defined(__PPC64__)
+      asm volatile( "or 27, 27, 27" ::: "memory" );
+    #endif
+  }
+
+  #endif /* defined( KOKKOS_ENABLE_ASM ) */
+}
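+
+// Illustrative summary of the backoff schedule above (assuming
+// KOKKOS_ENABLE_ASM is defined), where c = bit_scan_reverse(i) is the index of
+// the highest set bit of i:
+//
+//   WaitMode::ACTIVE  : i <= 2^4 issues only the trailing 'pause';
+//                       2^4  < i <= 2^12 adds c no-op instructions;
+//                       2^12 < i <= 2^13 yields to the OS scheduler;
+//                       2^13 < i sleeps for roughly c microseconds.
+//   WaitMode::PASSIVE : yields whenever it does not sleep.
+//   WaitMode::ROOT    : never sleeps or yields; only no-ops and 'pause'.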
+
+}} // namespace Kokkos::Impl
+
+#else
+void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {}
+#endif
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp b/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5cad7e48ccaf8c7c5a35775028745fce7d966928
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp
@@ -0,0 +1,143 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+#ifndef KOKKOS_SPINWAIT_HPP
+#define KOKKOS_SPINWAIT_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Atomic.hpp>
+
+#include <cstdint>
+
+#include <type_traits>
+
+namespace Kokkos {
+namespace Impl {
+
+enum class WaitMode : int {
+    ACTIVE   // Used for tight loops to keep threads active longest
+  , PASSIVE  // Used to quickly yield the thread to quiet down the system
+  , ROOT     // Never sleep or yield the root thread
+};
+
+
+void host_thread_yield( const uint32_t i , const WaitMode mode );
+
+template <typename T>
+typename std::enable_if< std::is_integral<T>::value, void>::type
+root_spinwait_while_equal( T const volatile & flag, const T value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ;
+  while( value == flag ) {
+    host_thread_yield(++i, WaitMode::ROOT);
+  }
+  Kokkos::load_fence();
+}
+
+template <typename T>
+typename std::enable_if< std::is_integral<T>::value, void>::type
+root_spinwait_until_equal( T const volatile & flag, const T value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ;
+  while( value != flag ) {
+    host_thread_yield(++i, WaitMode::ROOT);
+  }
+  Kokkos::load_fence();
+}
+
+template <typename T>
+typename std::enable_if< std::is_integral<T>::value, void>::type
+spinwait_while_equal( T const volatile & flag, const T value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ;
+  while( value == flag ) {
+    host_thread_yield(++i, WaitMode::ACTIVE);
+  }
+  Kokkos::load_fence();
+}
+
+template <typename T>
+typename std::enable_if< std::is_integral<T>::value, void>::type
+yield_while_equal( T const volatile & flag, const T value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ;
+  while( value == flag ) {
+    host_thread_yield(++i, WaitMode::PASSIVE);
+  }
+  Kokkos::load_fence();
+}
+
+template <typename T>
+typename std::enable_if< std::is_integral<T>::value, void>::type
+spinwait_until_equal( T const volatile & flag, const T value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ;
+  while( value != flag ) {
+    host_thread_yield(++i, WaitMode::ACTIVE);
+  }
+  Kokkos::load_fence();
+}
+
+template <typename T>
+typename std::enable_if< std::is_integral<T>::value, void>::type
+yield_until_equal( T const volatile & flag, const T value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ;
+  while( value != flag ) {
+    host_thread_yield(++i, WaitMode::PASSIVE);
+  }
+  Kokkos::load_fence();
+}
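+
+// Illustrative usage sketch (the flag and value below are hypothetical, not
+// part of this header): a thread can block on an integral flag that another
+// thread will eventually set.
+//
+//   volatile int flag = 0 ;
+//   // ... some other thread eventually executes:  flag = 1 ;
+//   Kokkos::Impl::spinwait_until_equal( flag , 1 );  // returns once flag == 1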
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_SPINWAIT_HPP */
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_StaticAssert.hpp b/packages/kokkos/core/src/impl/Kokkos_StaticAssert.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d001e0a88c56c8de3e970c096d3ee0604d897b75
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_StaticAssert.hpp
@@ -0,0 +1,78 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STATICASSERT_HPP
+#define KOKKOS_STATICASSERT_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+template < bool , class T = void >
+struct StaticAssert ;
+
+template< class T >
+struct StaticAssert< true , T > {
+  typedef T type ;
+  static const bool value = true ;
+};
+
+template < class A , class B >
+struct StaticAssertSame ;
+
+template < class A >
+struct StaticAssertSame<A,A> { typedef A type ; };
+
+template < class A , class B >
+struct StaticAssertAssignable ;
+
+template < class A >
+struct StaticAssertAssignable<A,A> { typedef A type ; };
+
+template < class A >
+struct StaticAssertAssignable< const A , A > { typedef const A type ; };
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* KOKKOS_STATICASSERT_HPP */
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Tags.hpp b/packages/kokkos/core/src/impl/Kokkos_Tags.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0cd84108baae39f707d1c499d5e30421b222ca70
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Tags.hpp
@@ -0,0 +1,89 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TAGS_HPP
+#define KOKKOS_TAGS_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+#include <Kokkos_Core_fwd.hpp>
+#include <type_traits>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+/** KOKKOS_IMPL_HAS_TYPE( Type )
+ *
+ * defines a meta-function that checks whether a type exposes an internal
+ * typedef or type alias matching Type
+ *
+ * e.g.
+ *   KOKKOS_IMPL_HAS_TYPE( array_layout );
+ *   struct Foo { using array_layout = void; };
+ *   have_array_layout<Foo>::value == 1;
+ */
+#define KOKKOS_IMPL_HAS_TYPE( TYPE ) \
+template <typename T> struct have_ ## TYPE { \
+private: \
+  template <typename U, typename = void > struct X : std::false_type {}; \
+  template <typename U> struct X<U,typename std::conditional<true,void,typename U:: TYPE >::type > : std::true_type {}; \
+public: \
+  typedef typename X<T>::type type ; \
+  enum : bool { value = type::value }; \
+};
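+
+// Illustrative usage sketch (the trait below is an example expansion, not part
+// of this header): each macro invocation generates a compile-time detection
+// trait.
+//
+//   KOKKOS_IMPL_HAS_TYPE( execution_space );
+//   struct A { using execution_space = void ; };
+//   struct B {};
+//   static_assert(  have_execution_space<A>::value , "A exposes the alias" );
+//   static_assert( !have_execution_space<B>::value , "B does not" );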
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos { namespace Impl {
+
+template <typename T>
+using is_void = std::is_same<void,T>;
+
+}} // namespace Kokkos::Impl
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c7c8890cb9cc67f69b286fda1b795d01b81d624a
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
@@ -0,0 +1,561 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_IMPL_TASKQUEUE_HPP
+#define KOKKOS_IMPL_TASKQUEUE_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_TASKDAG )
+
+#include <string>
+#include <typeinfo>
+#include <stdexcept>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class Space , typename ResultType , class FunctorType >
+class TaskBase ;
+
+template< typename Space >
+class TaskQueue ;
+
+template< typename Space >
+class TaskQueueSpecialization ;
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Base class for task management, access, and execution.
+ *
+ *  Inheritance structure to allow static_cast from the task root type
+ *  and a task's FunctorType.
+ *
+ *    // Enable a functor to access the base class
+ *    // and provide memory for result value.
+ *    TaskBase< Space , ResultType , FunctorType >
+ *      : TaskBase< void , void , void >
+ *      , FunctorType
+ *      { ... };
+ *    Followed by memory allocated for result value.
+ *
+ *
+ *  States of a task:
+ *
+ *    Constructing State, NOT IN a linked list
+ *      m_wait == 0
+ *      m_next == 0
+ *
+ *    Scheduling transition : Constructing -> Waiting
+ *      before:
+ *        m_wait == 0
+ *        m_next == this task's initial dependence, 0 if none
+ *      after:
+ *        m_wait == EndTag
+ *        m_next == EndTag
+ *
+ *    Waiting State, IN a linked list
+ *      m_apply != 0
+ *      m_queue != 0
+ *      m_ref_count > 0
+ *      m_wait == head of linked list of tasks waiting on this task
+ *      m_next == next of linked list of tasks
+ *
+ *    transition : Waiting -> Executing
+ *      before:
+ *        m_next == EndTag
+ *      after:
+ *        m_next == LockTag
+ *
+ *    Executing State, NOT IN a linked list
+ *      m_apply != 0
+ *      m_queue != 0
+ *      m_ref_count > 0
+ *      m_wait == head of linked list of tasks waiting on this task
+ *      m_next == LockTag
+ *
+ *    Respawn transition : Executing -> Executing-Respawn
+ *      before:
+ *        m_next == LockTag
+ *      after:
+ *        m_next == this task's updated dependence, 0 if none
+ *
+ *    Executing-Respawn State, NOT IN a linked list
+ *      m_apply != 0
+ *      m_queue != 0
+ *      m_ref_count > 0
+ *      m_wait == head of linked list of tasks waiting on this task
+ *      m_next == this task's updated dependence, 0 if none
+ *
+ *    transition : Executing -> Complete
+ *      before:
+ *        m_wait == head of linked list
+ *      after:
+ *        m_wait == LockTag
+ *
+ *    Complete State, NOT IN a linked list
+ *      m_wait == LockTag: cannot add dependence (<=> complete)
+ *      m_next == LockTag: not a member of a wait queue
+ *
+ */
+template<>
+class TaskBase< void , void , void >
+{
+public:
+
+  enum : int16_t   { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 };
+  enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) };
+
+  template< typename > friend class Kokkos::TaskScheduler ;
+
+  typedef TaskQueue< void > queue_type ;
+
+  typedef void (* function_type) ( TaskBase * , void * );
+
+  // sizeof(TaskBase) == 48
+
+  function_type  m_apply ;       ///< Apply function pointer
+  queue_type   * m_queue ;       ///< Pointer to queue
+  TaskBase     * m_wait ;        ///< Linked list of tasks waiting on this
+  TaskBase     * m_next ;        ///< Waiting linked-list next
+  int32_t        m_ref_count ;   ///< Reference count
+  int32_t        m_alloc_size ;  ///< Allocation size
+  int32_t        m_dep_count ;   ///< Aggregate's number of dependences
+  int16_t        m_task_type ;   ///< Type of task
+  int16_t        m_priority ;    ///< Priority of runnable task
+
+  TaskBase( TaskBase && ) = delete ;
+  TaskBase( const TaskBase & ) = delete ;
+  TaskBase & operator = ( TaskBase && ) = delete ;
+  TaskBase & operator = ( const TaskBase & ) = delete ;
+
+  KOKKOS_INLINE_FUNCTION_DEFAULTED ~TaskBase() = default ;
+
+  KOKKOS_INLINE_FUNCTION constexpr
+  TaskBase()
+    : m_apply(      0 )
+    , m_queue(      0 )
+    , m_wait(       0 )
+    , m_next(       0 )
+    , m_ref_count(  0 )
+    , m_alloc_size( 0 )
+    , m_dep_count(  0 )
+    , m_task_type(  0 )
+    , m_priority(   0 )
+    {}
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  TaskBase * volatile * aggregate_dependences() volatile
+    { return reinterpret_cast<TaskBase*volatile*>( this + 1 ); }
+
+  KOKKOS_INLINE_FUNCTION
+  bool requested_respawn()
+    {
+      // This should only be called when a task has finished executing and is
+      // in the transition to either the complete or executing-respawn state.
+      TaskBase * const lock = reinterpret_cast< TaskBase * >( LockTag );
+      return lock != m_next;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( TaskBase* dep )
+    {
+      // Precondition: lock == m_next
+
+      TaskBase * const lock = (TaskBase *) LockTag ;
+
+      // Assign dependence to m_next.  It will be processed in the subsequent
+      // call to schedule.  Error if the dependence is reset.
+      if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) {
+        Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
+      }
+
+      if ( 0 != dep ) {
+        // The future may be destroyed upon returning from this call
+        // so increment reference count to track this assignment.
+        Kokkos::atomic_increment( &(dep->m_ref_count) );
+      }
+    }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  int32_t reference_count() const
+    { return *((int32_t volatile *)( & m_ref_count )); }
+
+};
+
+static_assert( sizeof(TaskBase<void,void,void>) == 48
+             , "Verifying expected sizeof(TaskBase<void,void,void>)" );
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template< typename ResultType >
+struct TaskResult {
+
+  enum : int32_t { size = sizeof(ResultType) };
+
+  using reference_type = ResultType & ;
+
+  KOKKOS_INLINE_FUNCTION static
+  ResultType * ptr( TaskBase<void,void,void> * task )
+    {
+      return reinterpret_cast< ResultType * >
+        ( reinterpret_cast< char * >(task) + task->m_alloc_size - sizeof(ResultType) );
+    }
+
+  KOKKOS_INLINE_FUNCTION static
+  reference_type get( TaskBase<void,void,void> * task )
+    { return *ptr( task ); }
+};
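+
+// The result value occupies the final sizeof(ResultType) bytes of the task's
+// allocation (see TaskQueue::spawn_allocation_size), so ptr() advances
+// m_alloc_size bytes past the task pointer and backs up by the result size.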
+
+template<>
+struct TaskResult< void > {
+
+  enum : int32_t { size = 0 };
+
+  using reference_type = void ;
+
+  KOKKOS_INLINE_FUNCTION static
+  void * ptr( TaskBase<void,void,void> * ) { return (void*) 0 ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  reference_type get( TaskBase<void,void,void> * ) {}
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+class TaskQueue< void > {};
+
+/** \brief  Manage task allocation, deallocation, and scheduling.
+ *
+ *  Task execution is deferred to the TaskQueueSpecialization.
+ *  All other aspects of task management have shared implementation.
+ */
+template< typename ExecSpace >
+class TaskQueue : public TaskQueue<void> {
+private:
+
+  friend class TaskQueueSpecialization< ExecSpace > ;
+  friend class Kokkos::TaskScheduler< ExecSpace > ;
+
+  using execution_space = ExecSpace ;
+  using specialization  = TaskQueueSpecialization< execution_space > ;
+  using memory_space    = typename specialization::memory_space ;
+  using device_type     = Kokkos::Device< execution_space , memory_space > ;
+  using memory_pool     = Kokkos::MemoryPool< device_type > ;
+  using task_root_type  = Kokkos::Impl::TaskBase<void,void,void> ;
+
+  struct Destroy {
+    TaskQueue * m_queue ;
+    void destroy_shared_allocation();
+  };
+
+  //----------------------------------------
+
+  enum : int { NumQueue = 3 };
+
+  // Queue is organized as [ priority ][ type ]
+
+  memory_pool               m_memory ;
+  task_root_type * volatile m_ready[ NumQueue ][ 2 ];
+  long                      m_accum_alloc ; // Accumulated number of allocations
+  int                       m_count_alloc ; // Current number of allocations
+  int                       m_max_alloc ;   // Maximum number of allocations
+  int                       m_ready_count ; // Number of ready or executing
+
+  //----------------------------------------
+
+  ~TaskQueue();
+  TaskQueue() = delete ;
+  TaskQueue( TaskQueue && ) = delete ;
+  TaskQueue( TaskQueue const & ) = delete ;
+  TaskQueue & operator = ( TaskQueue && ) = delete ;
+  TaskQueue & operator = ( TaskQueue const & ) = delete ;
+
+  TaskQueue( const memory_pool & arg_memory_pool );
+
+  // Schedule a task
+  //   Precondition:
+  //     task is not executing
+  //     task->m_next is the dependence or zero
+  //   Postcondition:
+  //     task->m_next is linked list membership
+  KOKKOS_FUNCTION void schedule_runnable(  task_root_type * const );
+  KOKKOS_FUNCTION void schedule_aggregate( task_root_type * const );
+
+  // Reschedule a task
+  //   Precondition:
+  //     task is in Executing state
+  //     task->m_next == LockTag
+  //   Postcondition:
+  //     task is in Executing-Respawn state
+  //     task->m_next == 0 (no dependence)
+  KOKKOS_FUNCTION
+  void reschedule( task_root_type * );
+
+  // Complete a task
+  //   Precondition:
+  //     task is not executing
+  //     task->m_next == LockTag  =>  task is complete
+  //     task->m_next != LockTag  =>  task is respawn
+  //   Postcondition:
+  //     task->m_wait == LockTag  =>  task is complete
+  //     task->m_wait != LockTag  =>  task is waiting
+  KOKKOS_FUNCTION
+  void complete( task_root_type * );
+
+  KOKKOS_FUNCTION
+  static bool push_task( task_root_type * volatile * const
+                       , task_root_type * const );
+
+  KOKKOS_FUNCTION
+  static task_root_type * pop_ready_task( task_root_type * volatile * const );
+
+  KOKKOS_FUNCTION static
+  void decrement( task_root_type * task );
+
+public:
+
+  // If and only if the execution space is a single thread
+  // then execute ready tasks.
+  KOKKOS_INLINE_FUNCTION
+  void iff_single_thread_recursive_execute()
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      specialization::iff_single_thread_recursive_execute( this );
+#endif
+    }
+
+  void execute() { specialization::execute( this ); }
+
+  template< typename FunctorType >
+  void proc_set_apply( typename task_root_type::function_type * ptr )
+    {
+      specialization::template proc_set_apply< FunctorType >( ptr );
+    }
+
+  // Assign task pointer with reference counting of assigned tasks
+  KOKKOS_FUNCTION static
+  void assign( task_root_type ** const lhs
+             , task_root_type *  const rhs )
+    {
+#if 0
+  {
+    printf( "assign( 0x%lx { 0x%lx %d %d } , 0x%lx { 0x%lx %d %d } )\n"
+          , uintptr_t( lhs ? *lhs : 0 )
+          , uintptr_t( lhs && *lhs ? (*lhs)->m_next : 0 )
+          , int( lhs && *lhs ? (*lhs)->m_task_type : 0 )
+          , int( lhs && *lhs ? (*lhs)->m_ref_count : 0 )
+          , uintptr_t(rhs)
+          , uintptr_t( rhs ? rhs->m_next : 0 )
+          , int( rhs ? rhs->m_task_type : 0 )
+          , int( rhs ? rhs->m_ref_count : 0 )
+          );
+    fflush( stdout );
+  }
+#endif
+
+      if ( *lhs ) decrement( *lhs );
+      if ( rhs ) { Kokkos::atomic_increment( &(rhs->m_ref_count) ); }
+
+      // Force write of *lhs
+
+      *static_cast< task_root_type * volatile * >(lhs) = rhs ;
+
+      Kokkos::memory_fence();
+    }
+
+  KOKKOS_FUNCTION
+  size_t allocate_block_size( size_t n ); ///< Actual block size allocated
+
+  KOKKOS_FUNCTION
+  void * allocate( size_t n ); ///< Allocate from the memory pool
+
+  KOKKOS_FUNCTION
+  void deallocate( void * p , size_t n ); ///< Deallocate to the memory pool
+
+
+  //----------------------------------------
+  /**\brief  Allocation size for a spawned task */
+
+  template< typename FunctorType >
+  KOKKOS_FUNCTION
+  size_t spawn_allocation_size() const
+    {
+      using value_type = typename FunctorType::value_type ;
+
+      using task_type = Impl::TaskBase< execution_space
+                                      , value_type
+                                      , FunctorType > ;
+
+      enum : size_t { align = ( 1 << 4 ) , align_mask = align - 1 };
+      enum : size_t { task_size   = sizeof(task_type) };
+      enum : size_t { result_size = Impl::TaskResult< value_type >::size };
+      enum : size_t { alloc_size =
+        ( ( task_size   + align_mask ) & ~align_mask ) +
+        ( ( result_size + align_mask ) & ~align_mask ) };
+
+      return m_memory.allocate_block_size( alloc_size );
+    }
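+
+  // Worked example (illustrative): with sizeof(task_type) == 56 and an 8-byte
+  // scalar result, both terms round up to the 16-byte alignment boundary, so
+  // alloc_size == 64 + 16 == 80 bytes are requested from the memory pool.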
+
+  /**\brief  Allocation size for a when_all aggregate */
+
+  KOKKOS_FUNCTION
+  size_t when_all_allocation_size( int narg ) const
+    {
+      return m_memory.allocate_block_size( sizeof(task_root_type) + narg * sizeof(task_root_type*) );
+    }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class ExecSpace , typename ResultType , class FunctorType >
+class TaskBase
+  : public TaskBase< void , void , void >
+  , public FunctorType
+{
+private:
+
+  TaskBase() = delete ;
+  TaskBase( TaskBase && ) = delete ;
+  TaskBase( const TaskBase & ) = delete ;
+  TaskBase & operator = ( TaskBase && ) = delete ;
+  TaskBase & operator = ( const TaskBase & ) = delete ;
+
+public:
+
+  using root_type       = TaskBase< void , void , void > ;
+  using functor_type    = FunctorType ;
+  using result_type     = ResultType ;
+
+  using specialization  = TaskQueueSpecialization< ExecSpace > ;
+  using member_type     = typename specialization::member_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  void apply_functor( member_type * const member , void * )
+    { functor_type::operator()( *member ); }
+
+  template< typename T >
+  KOKKOS_INLINE_FUNCTION
+  void apply_functor( member_type * const member
+                    , T           * const result )
+    { functor_type::operator()( *member , *result ); }
+
+  KOKKOS_FUNCTION static
+  void apply( root_type * root , void * exec )
+    {
+      TaskBase    * const task   = static_cast< TaskBase * >( root );
+      member_type * const member = reinterpret_cast< member_type * >( exec );
+      result_type * const result = TaskResult< result_type >::ptr( task );
+
+      // Task may be serial or team.
+      // If team then must synchronize before querying if respawn was requested.
+      // If team then only one thread calls destructor.
+
+      const bool only_one_thread =
+#if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA)
+        0 == threadIdx.x && 0 == threadIdx.y ;
+#else
+        0 == member->team_rank();
+#endif
+
+      task->apply_functor( member , result );
+
+      member->team_barrier();
+
+      if ( only_one_thread && !(task->requested_respawn()) ) {
+        // Did not respawn, destroy the functor to free memory.
+        static_cast<functor_type*>(task)->~functor_type();
+        // Cannot destroy and deallocate the task until its dependences
+        // have been processed.
+      }
+    }
+
+  // Constructor for runnable task
+  KOKKOS_INLINE_FUNCTION constexpr
+  TaskBase( FunctorType && arg_functor )
+    : root_type() , functor_type( arg_functor ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~TaskBase() {}
+};
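+
+// Illustrative note (an assumption about how the pieces fit together, not
+// additional API): the queue's proc_set_apply stores the address of the static
+// apply() above in the type-erased root_type::m_apply slot, so executing a
+// task reduces to roughly
+//
+//   (*task->m_apply)( task , &team_member );
+//
+// which recovers the concrete TaskBase, invokes the user's functor, and
+// destroys the functor if no respawn was requested.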
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
+#endif /* #ifndef KOKKOS_IMPL_TASKQUEUE_HPP */
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5bcf672ff638ded5179528bc7473e5ab96ecd636
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
@@ -0,0 +1,682 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_TASKDAG )
+
+#define KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING 0
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+void TaskQueue< ExecSpace >::Destroy::destroy_shared_allocation()
+{
+  m_queue->~TaskQueue();
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+TaskQueue< ExecSpace >::TaskQueue
+  ( typename TaskQueue< ExecSpace >::memory_pool const & arg_memory_pool )
+  : m_memory( arg_memory_pool )
+  , m_ready()
+  , m_accum_alloc(0)
+  , m_count_alloc(0)
+  , m_max_alloc(0)
+  , m_ready_count(0)
+{
+  for ( int i = 0 ; i < NumQueue ; ++i ) {
+    m_ready[i][0] = (task_root_type *) task_root_type::EndTag ;
+    m_ready[i][1] = (task_root_type *) task_root_type::EndTag ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+TaskQueue< ExecSpace >::~TaskQueue()
+{
+  // Verify that queues are empty and ready count is zero
+
+  for ( int i = 0 ; i < NumQueue ; ++i ) {
+    for ( int j = 0 ; j < 2 ; ++j ) {
+      if ( m_ready[i][j] != (task_root_type *) task_root_type::EndTag ) {
+        Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready tasks");
+      }
+    }
+  }
+
+  if ( 0 != m_ready_count ) {
+    Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready or executing tasks");
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::decrement
+  ( TaskQueue< ExecSpace >::task_root_type * task )
+{
+  task_root_type volatile & t = *task ;
+
+  const int count = Kokkos::atomic_fetch_add(&(t.m_ref_count),-1);
+
+#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
+  if ( 1 == count ) {
+    printf( "decrement-destroy( 0x%lx { 0x%lx %d %d } )\n"
+          , uintptr_t( task )
+          , uintptr_t( task->m_next )
+          , int( task->m_task_type )
+          , int( task->m_ref_count )
+          );
+  }
+#endif
+
+  if ( ( 1 == count ) &&
+       ( t.m_next == (task_root_type *) task_root_type::LockTag ) ) {
+    // Reference count is zero and task is complete, deallocate.
+
+    TaskQueue< ExecSpace > * const queue =
+      static_cast< TaskQueue< ExecSpace > * >( t.m_queue );
+
+    queue->deallocate( task , t.m_alloc_size );
+  }
+  else if ( count <= 1 ) {
+    Kokkos::abort("TaskScheduler task has negative reference count or is incomplete" );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+size_t TaskQueue< ExecSpace >::allocate_block_size( size_t n )
+{
+  return m_memory.allocate_block_size( n );
+}
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void * TaskQueue< ExecSpace >::allocate( size_t n )
+{
+  void * const p = m_memory.allocate(n);
+
+  if ( p ) {
+    Kokkos::atomic_increment( & m_accum_alloc );
+    Kokkos::atomic_increment( & m_count_alloc );
+
+    if ( m_max_alloc < m_count_alloc ) m_max_alloc = m_count_alloc ;
+  }
+
+  return p ;
+}
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::deallocate( void * p , size_t n )
+{
+  m_memory.deallocate( p , n );
+  Kokkos::atomic_decrement( & m_count_alloc );
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+bool TaskQueue< ExecSpace >::push_task
+  ( TaskQueue< ExecSpace >::task_root_type * volatile * const queue
+  , TaskQueue< ExecSpace >::task_root_type * const task
+  )
+{
+  // Push task into a concurrently pushed and popped queue.
+  // The queue can be either a ready task queue or a waiting task queue.
+  // The queue is a linked list where 'task->m_next' form the links.
+  // Fail the push attempt if the queue is locked;
+  // otherwise retry until the push succeeds.
+
+#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
+  printf( "push_task( 0x%lx { 0x%lx } 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
+        , uintptr_t(queue)
+        , uintptr_t(*queue)
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+#endif
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+
+  task_root_type * volatile & next = task->m_next ;
+
+  if ( zero != next ) {
+    Kokkos::abort("TaskQueue::push_task ERROR: already a member of another queue" );
+  }
+
+  task_root_type * y = *queue ;
+
+  while ( lock != y ) {
+
+    next = y ;
+
+    // Do not proceed until 'next' has been stored.
+    Kokkos::memory_fence();
+
+    task_root_type * const x = y ;
+
+    y = Kokkos::atomic_compare_exchange(queue,y,task);
+
+    if ( x == y ) return true ;
+  }
+
+  // Failed, replace 'task->m_next' value since 'task' remains
+  // not a member of a queue.
+
+  next = zero ;
+
+  // Do not proceed until 'next' has been stored.
+  Kokkos::memory_fence();
+
+  return false ;
+}
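+
+// Illustrative trace of a successful push (task names are placeholders):
+// starting from *queue == A -> B -> end, push_task( queue , T ) first stores
+// T->m_next = A and then compare-exchanges the head from A to T, leaving
+// *queue == T -> A -> B -> end.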
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+typename TaskQueue< ExecSpace >::task_root_type *
+TaskQueue< ExecSpace >::pop_ready_task
+  ( TaskQueue< ExecSpace >::task_root_type * volatile * const queue )
+{
+  // Pop task from a concurrently pushed and popped ready task queue.
+  // The queue is a linked list where 'task->m_next' form the links.
+
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
+
+  // *queue is
+  //   end   => an empty queue
+  //   lock  => a locked queue
+  //   valid
+
+  // Retry until the lock is acquired or the queue is empty.
+
+  task_root_type * task = *queue ;
+
+  while ( end != task ) {
+
+    // The only possible values for the queue are
+    // (1) lock, (2) end, or (3) a valid task.
+    // Thus zero will never appear in the queue.
+    //
+    // If the queue is locked then substitute zero so that the CAS below is
+    // guaranteed to fail and simply re-reads the head of the queue.
+
+    if ( lock == task ) task = 0 ;
+
+    task_root_type * const x = task ;
+
+    task = Kokkos::atomic_compare_exchange(queue,x,lock);
+
+    if ( x == task ) {
+      // CAS succeeded and queue is locked
+      //
+      // This thread has locked the queue and removed 'task' from the queue.
+      // Extract the next entry of the queue from 'task->m_next'
+      // and mark 'task' as popped from a queue by setting
+      // 'task->m_next = lock'.
+      //
+      // Place the next entry in the head of the queue,
+      // which also unlocks the queue.
+      //
+      // This thread has exclusive access to
+      // the queue and the popped task's m_next.
+
+      task_root_type * volatile & next = task->m_next ;
+
+      *queue = next ; next = lock ;
+
+      Kokkos::memory_fence();
+
+#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
+      printf( "pop_ready_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
+            , uintptr_t(queue)
+            , uintptr_t(task)
+            , uintptr_t(task->m_wait)
+            , uintptr_t(task->m_next)
+            , int(task->m_task_type)
+            , int(task->m_priority)
+            , int(task->m_ref_count) );
+#endif
+
+      return task ;
+    }
+  }
+
+  return end ;
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::schedule_runnable
+  ( TaskQueue< ExecSpace >::task_root_type * const task )
+{
+  // Schedule a runnable task upon construction / spawn
+  // and upon completion of other tasks that 'task' is waiting on.
+  //
+  // Precondition:
+  // - called by a single thread for the input task
+  // - calling thread has exclusive access to the task
+  // - task is not a member of a queue
+  // - if runnable then task is either constructing or respawning
+  //
+  //   Constructing state:
+  //     task->m_wait == 0
+  //     task->m_next == dependence or 0
+  //   Respawn state:
+  //     task->m_wait == head of linked list: 'end' or valid task
+  //     task->m_next == dependence or 0
+  //
+  //  Task state transition:
+  //     Constructing ->  Waiting
+  //     Respawn      ->  Waiting
+  //
+  //  Postcondition on task state:
+  //     task->m_wait == head of linked list (queue)
+  //     task->m_next == member of linked list (queue)
+
+#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
+  printf( "schedule_runnable( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+#endif
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
+
+  task_root_type volatile & t = *task ;
+
+  bool respawn = false ;
+
+  //----------------------------------------
+
+  if ( zero == t.m_wait ) {
+    // Task in Constructing state
+    // - Transition to Waiting state
+    // Preconditions:
+    // - call occurs exclusively within a single thread
+
+    t.m_wait = end ;
+    // Task in Waiting state
+  }
+  else if ( lock != t.m_wait ) {
+    // Task in Executing state with Respawn request
+    // - Update dependence
+    // - Transition to Waiting state
+    respawn = true ;
+  }
+  else {
+    // Task in Complete state
+    Kokkos::abort("TaskQueue::schedule_runnable ERROR: task is complete");
+  }
+
+  //----------------------------------------
+  // Scheduling a runnable task which may have a dependence 'dep'.
+  // Extract dependence, if any, from task->m_next.
+  // If 'dep' is not null then attempt to push 'task'
+  // into the wait queue of 'dep'.
+  // If the push succeeds then 'task' may be
+  // processed or executed by another thread at any time.
+  // If the push fails then 'dep' is complete and 'task'
+  // is ready to execute.
+
+  // Exclusive access so don't need an atomic exchange
+  // task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
+  task_root_type * dep = t.m_next ; t.m_next = zero ;
+
+  Kokkos::memory_fence();
+
+  const bool is_ready =
+    ( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
+
+  if ( ( 0 != dep ) && respawn ) {
+    // Reference count for dep was incremented when
+    // respawn assigned dependency to task->m_next
+    // so that if dep completed prior to the
+    // above push_task dep would not be destroyed.
+    // dep reference count can now be decremented,
+    // which may deallocate the task.
+    TaskQueue::assign( & dep , (task_root_type *)0 );
+  }
+
+  if ( is_ready ) {
+
+    // No dependence or 'dep' is complete so push task into ready queue.
+    // Increment the ready count before pushing into ready queue
+    // to track number of ready + executing tasks.
+    // The ready count will be decremented when the task is complete.
+
+    Kokkos::atomic_increment( & m_ready_count );
+
+    task_root_type * volatile * const ready_queue =
+      & m_ready[ t.m_priority ][ t.m_task_type ];
+
+    // A push_task fails if the ready queue is locked.
+    // A ready queue is only locked during a push or pop;
+    // i.e., it is never permanently locked.
+    // Retry push to ready queue until it succeeds.
+    // When the push succeeds then 'task' may be
+    // processed or executed by another thread at any time.
+
+    while ( ! push_task( ready_queue , task ) );
+  }
+
+  //----------------------------------------
+  // Postcondition:
+  // - A runnable 'task' was pushed into a wait or ready queue.
+  // - Concurrent execution may have already popped 'task'
+  //   from a queue and processed it as appropriate.
+}
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::schedule_aggregate
+  ( TaskQueue< ExecSpace >::task_root_type * const task )
+{
+  // Schedule an aggregate task upon construction
+  // and upon completion of other tasks that 'task' is waiting on.
+  //
+  // Precondition:
+  // - called by a single thread for the input task
+  // - calling thread has exclusive access to the task
+  // - task is not a member of a queue
+  //
+  //   Constructing state:
+  //     task->m_wait == 0
+  //     task->m_next == dependence or 0
+  //
+  //  Task state transition:
+  //     Constructing ->  Waiting
+  //
+  //  Postcondition on task state:
+  //     task->m_wait == head of linked list (queue)
+  //     task->m_next == member of linked list (queue)
+
+#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
+  printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d %d }\n"
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_dep_count
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+#endif
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
+
+  task_root_type volatile & t = *task ;
+
+  //----------------------------------------
+
+  if ( zero == t.m_wait ) {
+    // Task in Constructing state
+    // - Transition to Waiting state
+    // Preconditions:
+    // - call occurs exclusively within a single thread
+
+    t.m_wait = end ;
+    // Task in Waiting state
+  }
+  else if ( lock == t.m_wait ) {
+    // Task in Complete state
+    Kokkos::abort("TaskQueue::schedule_aggregate ERROR: task is complete");
+  }
+
+  //----------------------------------------
+  // Scheduling a 'when_all' task with multiple dependences.
+  // This scheduling may be called when the 'when_all' is
+  // (1) created or
+  // (2) being removed from a completed task's wait list.
+
+  task_root_type * volatile * const aggr = t.aggregate_dependences();
+
+  // Assume the 'when_all' is complete until a dependence is
+  // found that is not complete.
+
+  bool is_complete = true ;
+
+  for ( int i = t.m_dep_count ; 0 < i && is_complete ; ) {
+
+    --i ;
+
+    // Loop dependences looking for an incomplete task.
+    // Add this task to the incomplete task's wait queue.
+
+    // Remove a task 'x' from the dependence list.
+    // The reference count of 'x' was incremented when
+    // it was assigned into the dependence list.
+
+    // Exclusive access so don't need an atomic exchange
+    // task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
+    task_root_type * x = aggr[i] ; aggr[i] = zero ;
+
+    if ( x ) {
+
+      // If x->m_wait is not locked then push succeeds
+      // and the aggregate is not complete.
+      // If the push succeeds then this when_all 'task' may be
+      // processed by another thread at any time.
+      // For example, 'x' may be completed by another
+      // thread and then re-schedule this when_all 'task'.
+
+      is_complete = ! push_task( & x->m_wait , task );
+
+      // Decrement reference count which had been incremented
+      // when 'x' was added to the dependence list.
+
+      TaskQueue::assign( & x , zero );
+    }
+  }
+
+  if ( is_complete ) {
+    // The when_all 'task' was not added to a wait queue because
+    // all dependences were complete so this aggregate is complete.
+    // Complete the when_all 'task' to schedule other tasks
+    // that are waiting for the when_all 'task' to complete.
+
+    t.m_next = lock ;
+
+    complete( task );
+
+    // '*task' may have been deleted upon completion
+  }
+
+  //----------------------------------------
+  // Postcondition:
+  // - An aggregate 'task' was either pushed to a wait queue or completed.
+  // - Concurrent execution may have already popped 'task'
+  //   from a queue and processed it as appropriate.
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::reschedule( task_root_type * task )
+{
+  // Precondition:
+  //   task is in Executing state
+  //   task->m_next == LockTag
+  //
+  // Postcondition:
+  //   task is in Executing-Respawn state
+  //   task->m_next == 0 (no dependence)
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+
+  if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) {
+    Kokkos::abort("TaskScheduler::respawn ERROR: already respawned");
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::complete
+  ( TaskQueue< ExecSpace >::task_root_type * task )
+{
+  // Complete a runnable task that has finished executing
+  // or a when_all task when all of its dependences are complete.
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
+
+#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
+  printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+#endif
+
+  task_root_type volatile & t = *task ;
+
+  const bool runnable = task_root_type::Aggregate != t.m_task_type ;
+
+  //----------------------------------------
+
+  if ( runnable && lock != t.m_next ) {
+    // A runnable task has finished executing and requested respawn.
+    // Schedule the task for subsequent execution.
+
+    schedule_runnable( task );
+  }
+  //----------------------------------------
+  else {
+    // This is either an aggregate, or a runnable task that executed
+    // and did not respawn.  Transition this task to complete.
+
+    // If 'task' is an aggregate then any of the runnable tasks that
+    // it depends upon may be attempting to complete this 'task'.
+    // Must only transition a task once to complete status.
+    // This is controlled by atomically locking the wait queue.
+
+    // Stop other tasks from adding themselves to this task's wait queue
+    // by locking the head of this task's wait queue.
+
+    task_root_type * x = Kokkos::atomic_exchange( & t.m_wait , lock );
+
+    if ( x != (task_root_type *) lock ) {
+
+      // This thread has transitioned this 'task' to complete.
+      // 'task' is no longer in a queue and is not executing
+      // so decrement the reference count from 'task's creation.
+      // If no other references to this 'task' then it will be deleted.
+
+      TaskQueue::assign( & task , zero );
+
+      // This thread has exclusive access to the wait list so
+      // the concurrency-safe pop_ready_task function is not needed.
+      // Schedule the tasks that have been waiting on the input 'task',
+      // which may have been deleted.
+
+      while ( x != end ) {
+        // Have exclusive access to 'x' until it is scheduled
+        // Set x->m_next = zero  <=  no dependence, not a respawn
+
+        task_root_type volatile & vx = *x ;
+
+        task_root_type * const next = vx.m_next ; vx.m_next = 0 ;
+
+        Kokkos::memory_fence();
+
+        if ( task_root_type::Aggregate != vx.m_task_type ) {
+          schedule_runnable( x );
+        }
+        else {
+#if !defined( __HCC_ACCELERATOR__ )
+          schedule_aggregate( x );
+#endif
+        }
+
+        x = next ;
+      }
+    }
+  }
+
+  if ( runnable ) {
+    // A runnable task was popped from a ready queue and executed.
+    // If it respawned into a ready queue then the ready count was
+    // incremented again, so decrement here whether or not it respawned.
+    Kokkos::atomic_decrement( & m_ready_count );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Timer.hpp b/packages/kokkos/core/src/impl/Kokkos_Timer.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9897225c3801a3f581f5423559cace0d6eb35dee
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Timer.hpp
@@ -0,0 +1,63 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPLWALLTIME_HPP
+#define KOKKOS_IMPLWALLTIME_HPP
+
+#include <Kokkos_Timer.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Time since construction.
+ *   Timer was promoted from the Impl namespace to the Kokkos namespace;
+ *   this file is retained for backwards compatibility.
+ */
+
+  using Kokkos::Timer ;
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Traits.hpp b/packages/kokkos/core/src/impl/Kokkos_Traits.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..475a696719a962d571899077ef49dcdef8001809
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Traits.hpp
@@ -0,0 +1,493 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSTRAITS_HPP
+#define KOKKOSTRAITS_HPP
+
+#include <cstddef>
+#include <cstdint>
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_BitOps.hpp>
+#include <string>
+#include <type_traits>
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+// Help with C++11 variadic argument packs
+
+template< unsigned I , typename ... Pack >
+struct get_type { typedef void type ; };
+
+template< typename T , typename ... Pack >
+struct get_type< 0 , T , Pack ... >
+{ typedef T type ; };
+
+template< unsigned I , typename T , typename ... Pack >
+struct get_type< I , T , Pack ... >
+{ typedef typename get_type< I - 1 , Pack ... >::type type ; };
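+
+// Example (illustrative only): get_type picks the I-th type of a pack and
+// yields 'void' when the index is out of range.
+//
+//   static_assert( std::is_same< get_type<1,int,double,float>::type , double >::value , "" );
+//   static_assert( std::is_same< get_type<3,int,double,float>::type , void   >::value , "" );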
+
+
+template< typename T , typename ... Pack >
+struct has_type { enum { value = false }; };
+
+template< typename T , typename S , typename ... Pack >
+struct has_type<T,S,Pack...>
+{
+private:
+
+  enum { self_value = std::is_same<T,S>::value };
+
+  typedef has_type<T,Pack...> next ;
+
+  static_assert( ! ( self_value && next::value )
+               , "Error: more than one member of the argument pack matches the type" );
+
+public:
+
+  enum { value = self_value || next::value };
+
+};
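+
+// Example (illustrative only): has_type reports whether T occurs in the pack;
+// a pack containing T more than once is rejected by the static_assert above.
+//
+//   static_assert(   has_type< int , double , int  , char >::value , "" );
+//   static_assert( ! has_type< int , double , long , char >::value , "" );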
+
+
+template< typename DefaultType
+        , template< typename > class Condition
+        , typename ... Pack >
+struct has_condition
+{
+  enum { value = false };
+  typedef DefaultType type ;
+};
+
+template< typename DefaultType
+        , template< typename > class Condition
+        , typename S
+        , typename ... Pack >
+struct has_condition< DefaultType , Condition , S , Pack... >
+{
+private:
+
+  enum { self_value = Condition<S>::value };
+
+  typedef has_condition< DefaultType , Condition , Pack... > next ;
+
+  static_assert( ! ( self_value && next::value )
+               , "Error: more than one member of the argument pack satisfies condition" );
+
+public:
+
+  enum { value = self_value || next::value };
+
+  typedef typename
+    std::conditional< self_value , S , typename next::type >::type
+      type ;
+};
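+
+// Example (illustrative only): has_condition extracts the unique pack member
+// satisfying the unary Condition, or DefaultType when no member does.
+//
+//   static_assert( std::is_same<
+//     has_condition< void , std::is_integral , double , int , float >::type
+//     , int >::value , "" );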
+
+
+template< class ... Args >
+struct are_integral { enum { value = true }; };
+
+template< typename T , class ... Args >
+struct are_integral<T,Args...> {
+  enum { value =
+    // Accept std::is_integral OR std::is_enum as an integral value
+    // since a simple enum value is automatically convertible to an
+    // integral value.
+    ( std::is_integral<T>::value || std::is_enum<T>::value )
+    &&
+    are_integral<Args...>::value };
+};
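+
+// Example (illustrative only; 'an_enum' is a hypothetical enum type):
+//
+//   static_assert(   are_integral< int , unsigned , an_enum >::value , "" );
+//   static_assert( ! are_integral< int , double >::value , "" );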
+
+//----------------------------------------------------------------------------
+/* C++11-conforming compile-time type traits utilities.
+ * Prefer the standard C++11 facilities when they are portably available.
+ */
+//----------------------------------------------------------------------------
+// C++11 Helpers:
+
+template < class T , T v >
+struct integral_constant
+{
+  // Declaration of 'static const' causes an unresolved linker symbol in debug
+  // static const T value = v ;
+  enum { value = T(v) };
+  typedef T value_type;
+  typedef integral_constant<T,v> type;
+  KOKKOS_INLINE_FUNCTION operator T() { return v ; }
+};
+
+typedef integral_constant<bool,false> false_type ;
+typedef integral_constant<bool,true>  true_type ;
+
+//----------------------------------------------------------------------------
+// C++11 Type relationships:
+
+template< class X , class Y > struct is_same : public false_type {};
+template< class X >           struct is_same<X,X> : public true_type {};
+
+//----------------------------------------------------------------------------
+// C++11 Type properties:
+
+template <typename T> struct is_const : public false_type {};
+template <typename T> struct is_const<const T> : public true_type {};
+template <typename T> struct is_const<const T & > : public true_type {};
+
+template <typename T> struct is_array : public false_type {};
+template <typename T> struct is_array< T[] > : public true_type {};
+template <typename T, unsigned N > struct is_array< T[N] > : public true_type {};
+
+//----------------------------------------------------------------------------
+// C++11 Type transformations:
+
+template <typename T> struct remove_const { typedef T type; };
+template <typename T> struct remove_const<const T> { typedef T type; };
+template <typename T> struct remove_const<const T & > { typedef T & type; };
+
+template <typename T> struct add_const { typedef const T type; };
+template <typename T> struct add_const<T & > { typedef const T & type; };
+template <typename T> struct add_const<const T> { typedef const T type; };
+template <typename T> struct add_const<const T & > { typedef const T & type; };
+
+template <typename T> struct remove_reference { typedef T type ; };
+template <typename T> struct remove_reference< T & > { typedef T type ; };
+template <typename T> struct remove_reference< const T & > { typedef const T type ; };
+
+template <typename T> struct remove_extent { typedef T type ; };
+template <typename T> struct remove_extent<T[]> { typedef T type ; };
+template <typename T, unsigned N > struct remove_extent<T[N]> { typedef T type ; };
+
+//----------------------------------------------------------------------------
+// C++11 Other type generators:
+
+template< bool , class T , class F >
+struct condition { typedef F type ; };
+
+template< class T , class F >
+struct condition<true,T,F> { typedef T type ; };
+
+template< bool , class = void >
+struct enable_if ;
+
+template< class T >
+struct enable_if< true , T > { typedef T type ; };
+
+//----------------------------------------------------------------------------
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Other traits
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< class , class T = void >
+struct enable_if_type { typedef T type ; };
+
+//----------------------------------------------------------------------------
+
+template< bool B >
+struct bool_ : public integral_constant<bool,B> {};
+
+template< unsigned I >
+struct unsigned_ : public integral_constant<unsigned,I> {};
+
+template< int I >
+struct int_ : public integral_constant<int,I> {};
+
+typedef bool_<true> true_;
+typedef bool_<false> false_;
+//----------------------------------------------------------------------------
+// if_
+
+template < bool Cond , typename TrueType , typename FalseType>
+struct if_c
+{
+  enum { value = Cond };
+
+  typedef FalseType type;
+
+
+  typedef typename remove_const<
+          typename remove_reference<type>::type >::type value_type ;
+
+  typedef typename add_const<value_type>::type const_value_type ;
+
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const_value_type & v ) { return v ; }
+
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( value_type & v ) { return v ; }
+
+  template< class T >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( const T & ) { value_type * ptr(0); return *ptr ; }
+
+
+  template< class T >
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const T & , const_value_type & v ) { return v ; }
+
+  template< class T >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( const T & , value_type & v ) { return v ; }
+};
+
+template <typename TrueType, typename FalseType>
+struct if_c< true , TrueType , FalseType >
+{
+  enum { value = true };
+
+  typedef TrueType type;
+
+
+  typedef typename remove_const<
+          typename remove_reference<type>::type >::type value_type ;
+
+  typedef typename add_const<value_type>::type const_value_type ;
+
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const_value_type & v ) { return v ; }
+
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( value_type & v ) { return v ; }
+
+  template< class T >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( const T & ) { value_type * ptr(0); return *ptr ; }
+
+
+  template< class F >
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const_value_type & v , const F & ) { return v ; }
+
+  template< class F >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( value_type & v , const F & ) { return v ; }
+};
+
+template< typename TrueType >
+struct if_c< false , TrueType , void >
+{
+  enum { value = false };
+
+  typedef void type ;
+  typedef void value_type ;
+};
+
+template< typename FalseType >
+struct if_c< true , void , FalseType >
+{
+  enum { value = true };
+
+  typedef void type ;
+  typedef void value_type ;
+};
+
+template <typename Cond, typename TrueType, typename FalseType>
+struct if_ : public if_c<Cond::value, TrueType, FalseType> {};
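+
+// Example (illustrative only): if_c selects a type, and 'select' returns the
+// argument whose type matches that selection.
+//
+//   typedef if_c< true , int , double > choice ;   // choice::type is int
+//   static_assert( std::is_same< choice::type , int >::value , "" );
+//   // choice::select( an_int , a_double ) returns the 'int' reference.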
+
+//----------------------------------------------------------------------------
+
+// Allows aliased types:
+template< typename T >
+struct is_integral : public integral_constant< bool ,
+  (
+    std::is_same< T ,          char >::value ||
+    std::is_same< T , unsigned char >::value ||
+    std::is_same< T ,          short int >::value ||
+    std::is_same< T , unsigned short int >::value ||
+    std::is_same< T ,          int >::value ||
+    std::is_same< T , unsigned int >::value ||
+    std::is_same< T ,          long int >::value ||
+    std::is_same< T , unsigned long int >::value ||
+    std::is_same< T ,          long long int >::value ||
+    std::is_same< T , unsigned long long int >::value ||
+
+    std::is_same< T , int8_t   >::value ||
+    std::is_same< T , int16_t  >::value ||
+    std::is_same< T , int32_t  >::value ||
+    std::is_same< T , int64_t  >::value ||
+    std::is_same< T , uint8_t  >::value ||
+    std::is_same< T , uint16_t >::value ||
+    std::is_same< T , uint32_t >::value ||
+    std::is_same< T , uint64_t >::value
+  )>
+{};
+//----------------------------------------------------------------------------
+
+template<typename T>
+struct is_label : public false_type {};
+
+template<>
+struct is_label<const char*> : public true_type {};
+
+template<>
+struct is_label<char*> : public true_type {};
+
+
+template<int N>
+struct is_label<const char[N]> : public true_type {};
+
+template<int N>
+struct is_label<char[N]> : public true_type {};
+
+
+template<>
+struct is_label<const std::string> : public true_type {};
+
+template<>
+struct is_label<std::string> : public true_type {};
+
+// These 'constexpr' functions can be used as
+// both regular functions and meta-functions.
+
+/**\brief  There exists integral 'k' such that N = 2^k */
+KOKKOS_INLINE_FUNCTION
+constexpr bool is_integral_power_of_two( const size_t N )
+{ return ( 0 < N ) && ( 0 == ( N & ( N - 1 ) ) ); }
+
+/**\brief  Return integral 'k' such that N = 2^k, assuming valid.  */
+KOKKOS_INLINE_FUNCTION
+constexpr unsigned integral_power_of_two_assume_valid( const size_t N )
+{ return N == 1 ? 0 : 1 + integral_power_of_two_assume_valid( N >> 1 ); }
+
+/**\brief  Return integral 'k' such that N = 2^k, if exists.
+ *         If does not exist return ~0u.
+ */
+KOKKOS_INLINE_FUNCTION
+constexpr unsigned integral_power_of_two( const size_t N )
+{ return is_integral_power_of_two(N) ? integral_power_of_two_assume_valid(N) : ~0u ; }
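+
+// Example (illustrative only):
+//
+//   static_assert(   is_integral_power_of_two( 8 )       , "" );
+//   static_assert( ! is_integral_power_of_two( 12 )      , "" );
+//   static_assert(   integral_power_of_two( 8 )  == 3u   , "" );
+//   static_assert(   integral_power_of_two( 12 ) == ~0u  , "" );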
+
+//----------------------------------------------------------------------------
+
+template < size_t N >
+struct is_power_of_two
+{
+  enum type { value = (N > 0) && !(N & (N-1)) };
+};
+
+template < size_t N , bool OK = is_power_of_two<N>::value >
+struct power_of_two ;
+
+template < size_t N >
+struct power_of_two<N,true>
+{
+  enum type { value = 1+ power_of_two<(N>>1),true>::value };
+};
+
+template <>
+struct power_of_two<2,true>
+{
+  enum type { value = 1 };
+};
+
+template <>
+struct power_of_two<1,true>
+{
+  enum type { value = 0 };
+};
+
+/** \brief  If power of two then return power,
+ *          otherwise return ~0u.
+ */
+KOKKOS_FORCEINLINE_FUNCTION
+unsigned power_of_two_if_valid( const unsigned N )
+{
+  unsigned p = ~0u ;
+  if ( is_integral_power_of_two ( N ) ) {
+    p = bit_scan_forward ( N ) ;
+  }
+  return p ;
+}
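+
+// Example (illustrative only): the compile-time and run-time forms agree.
+//
+//   static_assert( power_of_two<8>::value == 3 , "" );
+//   // power_of_two_if_valid( 8 )  == 3
+//   // power_of_two_if_valid( 12 ) == ~0u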
+
+//----------------------------------------------------------------------------
+
+template< typename T , T v , bool NonZero = ( v != T(0) ) >
+struct integral_nonzero_constant
+{
+  // Declaration of 'static const' causes an unresolved linker symbol in debug
+  // static const T value = v ;
+  enum { value = T(v) };
+  typedef T value_type ;
+  typedef integral_nonzero_constant<T,v> type ;
+  KOKKOS_INLINE_FUNCTION integral_nonzero_constant( const T & ) {}
+};
+
+template< typename T , T zero >
+struct integral_nonzero_constant<T,zero,false>
+{
+  const T value ;
+  typedef T value_type ;
+  typedef integral_nonzero_constant<T,0> type ;
+  KOKKOS_INLINE_FUNCTION integral_nonzero_constant( const T & v ) : value(v) {}
+};
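+
+// Example (illustrative only): with a nonzero template value the constructor
+// argument is ignored and 'value' is a compile-time constant; with a zero
+// template value 'value' is carried at run time instead.
+//
+//   integral_nonzero_constant<int,4> a( 10 );   // a.value == 4
+//   integral_nonzero_constant<int,0> b( 10 );   // b.value == 10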
+
+//----------------------------------------------------------------------------
+
+template < class C > struct is_integral_constant : public false_
+{
+  typedef void integral_type ;
+  enum { integral_value = 0 };
+};
+
+template < typename T , T v >
+struct is_integral_constant< integral_constant<T,v> > : public true_
+{
+  typedef T integral_type ;
+  enum { integral_value = v };
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOSTRAITS_HPP */
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..868b31861a7b10575b8deb137969cdb2807d8648
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
@@ -0,0 +1,416 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_IMPL_UTILITIES_HPP
+#define KOKKOS_CORE_IMPL_UTILITIES_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <cstdint>
+#include <type_traits>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos { namespace Impl {
+
+// same as std::forward
+// needed to allow perfect forwarding on the device
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+constexpr
+T&& forward( typename std::remove_reference<T>::type& arg ) noexcept
+{ return static_cast<T&&>(arg); }
+
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+constexpr
+T&& forward( typename std::remove_reference<T>::type&& arg ) noexcept
+{ return static_cast<T&&>(arg); }
+
+// same as std::move
+// needed to allow moving on the device
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+constexpr
+typename std::remove_reference<T>::type&& move( T&& arg ) noexcept
+{ return static_cast<typename std::remove_reference<T>::type&&>(arg); }
+
+// empty function to allow expanding a variadic argument pack
+template<typename... Args>
+KOKKOS_INLINE_FUNCTION
+void expand_variadic(Args &&...) {}
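+
+// Usage sketch (illustrative only; 'call_for_each' and 'f' are hypothetical):
+// expand_variadic forces evaluation of an expression for each pack element.
+// Note that the evaluation order of the arguments is unspecified.
+//
+//   template< class F , class ... Args >
+//   KOKKOS_INLINE_FUNCTION
+//   void call_for_each( F const & f , Args && ... args )
+//   { expand_variadic( ( f( forward<Args>(args) ) , 0 )... ); }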
+
+//----------------------------------------
+// C++14 integer sequence
+template< typename T , T ... Ints >
+struct integer_sequence {
+  using value_type = T ;
+  static constexpr std::size_t size() noexcept { return sizeof...(Ints); }
+};
+
+template< typename T , std::size_t N >
+struct make_integer_sequence_helper ;
+
+template< typename T , T N >
+using make_integer_sequence =
+  typename make_integer_sequence_helper<T,N>::type ;
+
+template< typename T >
+struct make_integer_sequence_helper< T , 0 >
+{ using type = integer_sequence<T> ; };
+
+template< typename T >
+struct make_integer_sequence_helper< T , 1 >
+{ using type = integer_sequence<T,0> ; };
+
+template< typename T >
+struct make_integer_sequence_helper< T , 2 >
+{ using type = integer_sequence<T,0,1> ; };
+
+template< typename T >
+struct make_integer_sequence_helper< T , 3 >
+{ using type = integer_sequence<T,0,1,2> ; };
+
+template< typename T >
+struct make_integer_sequence_helper< T , 4 >
+{ using type = integer_sequence<T,0,1,2,3> ; };
+
+template< typename T >
+struct make_integer_sequence_helper< T , 5 >
+{ using type = integer_sequence<T,0,1,2,3,4> ; };
+
+template< typename T >
+struct make_integer_sequence_helper< T , 6 >
+{ using type = integer_sequence<T,0,1,2,3,4,5> ; };
+
+template< typename T >
+struct make_integer_sequence_helper< T , 7 >
+{ using type = integer_sequence<T,0,1,2,3,4,5,6> ; };
+
+template< typename T >
+struct make_integer_sequence_helper< T , 8 >
+{ using type = integer_sequence<T,0,1,2,3,4,5,6,7> ; };
+
+template< typename X , typename Y >
+struct make_integer_sequence_concat ;
+
+template< typename T , T ... x , T ... y >
+struct make_integer_sequence_concat< integer_sequence<T,x...>
+                                   , integer_sequence<T,y...> >
+{ using type = integer_sequence< T , x ... , (sizeof...(x)+y)... > ; };
+
+template< typename T , std::size_t N >
+struct make_integer_sequence_helper {
+  using type = typename make_integer_sequence_concat
+    < typename make_integer_sequence_helper< T , N/2 >::type
+    , typename make_integer_sequence_helper< T , N - N/2 >::type
+    >::type ;
+};
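+
+// Example (illustrative only): lengths 0..8 are spelled out above; longer
+// sequences are built by concatenating two halves, with the second half
+// shifted by the length of the first.
+//
+//   static_assert( std::is_same< make_integer_sequence<int,5>
+//                              , integer_sequence<int,0,1,2,3,4> >::value , "" );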
+
+//----------------------------------------
+
+template <std::size_t... Indices>
+using index_sequence = integer_sequence<std::size_t, Indices...>;
+
+template< std::size_t N >
+using make_index_sequence = make_integer_sequence< std::size_t, N>;
+
+//----------------------------------------
+
+template <unsigned I, typename IntegerSequence>
+struct integer_sequence_at;
+
+template <unsigned I, typename T, T h0, T... tail>
+struct integer_sequence_at<I, integer_sequence<T, h0, tail...> >
+  : public integer_sequence_at<I-1u, integer_sequence<T,tail...> >
+{
+  static_assert( 8 <= I , "Reasoning Error" );
+  static_assert( I < integer_sequence<T, h0, tail...>::size(), "Error: Index out of bounds");
+};
+
+template < typename T, T h0, T... tail>
+struct integer_sequence_at<0u, integer_sequence<T,h0, tail...> >
+{
+  using type = T;
+  static constexpr T value = h0;
+};
+
+template < typename T, T h0, T h1, T... tail>
+struct integer_sequence_at<1u, integer_sequence<T, h0, h1, tail...> >
+{
+  using type = T;
+  static constexpr T value = h1;
+};
+
+template < typename T, T h0, T h1, T h2, T... tail>
+struct integer_sequence_at<2u, integer_sequence<T, h0, h1, h2, tail...> >
+{
+  using type = T;
+  static constexpr T value = h2;
+};
+
+template < typename T, T h0, T h1, T h2, T h3, T... tail>
+struct integer_sequence_at<3u, integer_sequence<T, h0, h1, h2, h3, tail...> >
+{
+  using type = T;
+  static constexpr T value = h3;
+};
+
+template < typename T, T h0, T h1, T h2, T h3, T h4, T... tail>
+struct integer_sequence_at<4u, integer_sequence<T, h0, h1, h2, h3, h4, tail...> >
+{
+  using type = T;
+  static constexpr T value = h4;
+};
+
+template < typename T, T h0, T h1, T h2, T h3, T h4, T h5, T... tail>
+struct integer_sequence_at<5u, integer_sequence<T, h0, h1, h2, h3, h4, h5, tail...> >
+{
+  using type = T;
+  static constexpr T value = h5;
+};
+
+template < typename T, T h0, T h1, T h2, T h3, T h4, T h5, T h6, T... tail>
+struct integer_sequence_at<6u, integer_sequence<T, h0, h1, h2, h3, h4, h5, h6, tail...> >
+{
+  using type = T;
+  static constexpr T value = h6;
+};
+
+template < typename T, T h0, T h1, T h2, T h3, T h4, T h5, T h6, T h7, T... tail>
+struct integer_sequence_at<7u, integer_sequence<T, h0, h1, h2, h3, h4, h5, h6, h7, tail...> >
+{
+  using type = T;
+  static constexpr T value = h7;
+};
+
+//----------------------------------------
+
+template <typename T>
+constexpr
+T at( const unsigned, integer_sequence<T> ) noexcept
+{ return ~static_cast<T>(0); }
+
+template <typename T, T h0, T... tail>
+constexpr
+T at( const unsigned i, integer_sequence<T, h0> ) noexcept
+{ return i==0u ? h0 : ~static_cast<T>(0); }
+
+template <typename T, T h0, T h1>
+constexpr
+T at( const unsigned i, integer_sequence<T, h0, h1> ) noexcept
+{ return i==0u ? h0 :
+         i==1u ? h1 : ~static_cast<T>(0);
+}
+
+template <typename T, T h0, T h1, T h2>
+constexpr
+T at( const unsigned i, integer_sequence<T, h0, h1, h2> ) noexcept
+{ return i==0u ? h0 :
+         i==1u ? h1 :
+         i==2u ? h2 : ~static_cast<T>(0);
+}
+
+template <typename T, T h0, T h1, T h2, T h3>
+constexpr
+T at( const unsigned i, integer_sequence<T, h0, h1, h2, h3> ) noexcept
+{ return i==0u ? h0 :
+         i==1u ? h1 :
+         i==2u ? h2 :
+         i==3u ? h3 : ~static_cast<T>(0);
+}
+
+template <typename T, T h0, T h1, T h2, T h3, T h4>
+constexpr
+T at( const unsigned i, integer_sequence<T, h0, h1, h2, h3, h4> ) noexcept
+{ return i==0u ? h0 :
+         i==1u ? h1 :
+         i==2u ? h2 :
+         i==3u ? h3 :
+         i==4u ? h4 : ~static_cast<T>(0);
+}
+
+template <typename T, T h0, T h1, T h2, T h3, T h4, T h5>
+constexpr
+T at( const unsigned i, integer_sequence<T, h0, h1, h2, h3, h4, h5> ) noexcept
+{ return i==0u ? h0 :
+         i==1u ? h1 :
+         i==2u ? h2 :
+         i==3u ? h3 :
+         i==4u ? h4 :
+         i==5u ? h5 : ~static_cast<T>(0);
+}
+
+template <typename T, T h0, T h1, T h2, T h3, T h4, T h5, T h6>
+constexpr
+T at( const unsigned i, integer_sequence<T, h0, h1, h2, h3, h4, h5, h6> ) noexcept
+{ return i==0u ? h0 :
+         i==1u ? h1 :
+         i==2u ? h2 :
+         i==3u ? h3 :
+         i==4u ? h4 :
+         i==5u ? h5 :
+         i==6u ? h6 : ~static_cast<T>(0);
+}
+
+template <typename T, T h0, T h1, T h2, T h3, T h4, T h5, T h6, T h7, T... tail>
+constexpr
+T at( const unsigned i, integer_sequence<T, h0, h1, h2, h3, h4, h5, h6, h7, tail...> ) noexcept
+{ return i==0u ? h0 :
+         i==1u ? h1 :
+         i==2u ? h2 :
+         i==3u ? h3 :
+         i==4u ? h4 :
+         i==5u ? h5 :
+         i==6u ? h6 :
+         i==7u ? h7 : at(i-8u, integer_sequence<T, tail...>{} );
+}
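+
+// Example (illustrative only): 'at' reads the i-th element at run time and
+// returns ~T(0) when the index is out of range.
+//
+//   at( 2u , integer_sequence<int,5,7,9>{} )   // == 9
+//   at( 3u , integer_sequence<int,5,7,9>{} )   // == ~0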
+
+//----------------------------------------
+
+
+template < typename IntegerSequence
+         , typename ResultSequence = integer_sequence<typename IntegerSequence::value_type>
+         >
+struct reverse_integer_sequence_helper;
+
+template <typename T, T h0, T... tail, T... results>
+struct reverse_integer_sequence_helper< integer_sequence<T, h0, tail...>, integer_sequence<T, results...> >
+  : public reverse_integer_sequence_helper< integer_sequence<T, tail...>, integer_sequence<T, h0, results...> >
+{};
+
+template <typename T, T... results>
+struct reverse_integer_sequence_helper< integer_sequence<T>, integer_sequence<T, results...> >
+{
+  using type = integer_sequence<T, results...>;
+};
+
+
+template <typename IntegerSequence>
+using reverse_integer_sequence = typename reverse_integer_sequence_helper<IntegerSequence>::type;
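+
+// Example (illustrative only):
+//
+//   static_assert( std::is_same<
+//     reverse_integer_sequence< integer_sequence<int,0,1,2> >
+//     , integer_sequence<int,2,1,0> >::value , "" );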
+
+//----------------------------------------
+
+template < typename IntegerSequence
+         , typename Result
+         , typename ResultSequence = integer_sequence<typename IntegerSequence::value_type>
+         >
+struct exclusive_scan_integer_sequence_helper;
+
+template <typename T, T h0, T... tail, typename Result, T... results>
+struct exclusive_scan_integer_sequence_helper
+  < integer_sequence<T, h0, tail...>
+  , Result
+  , integer_sequence<T, results...> >
+  : public exclusive_scan_integer_sequence_helper
+     < integer_sequence<T, tail...>
+     , std::integral_constant<T,Result::value+h0>
+     , integer_sequence<T, 0, (results+h0)...> >
+{};
+
+template <typename T, typename Result, T... results>
+struct exclusive_scan_integer_sequence_helper
+  < integer_sequence<T>, Result, integer_sequence<T, results...> >
+{
+  using type = integer_sequence<T, results...>;
+  static constexpr T value = Result::value ;
+};
+
+template <typename IntegerSequence>
+struct exclusive_scan_integer_sequence
+{
+  using value_type = typename IntegerSequence::value_type;
+  using helper =
+    exclusive_scan_integer_sequence_helper
+       < reverse_integer_sequence<IntegerSequence>
+       , std::integral_constant< value_type , 0 >
+       > ;
+  using type = typename helper::type ;
+  static constexpr value_type value  = helper::value ;
+};
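+
+// Example (illustrative only): the helper scans the reversed sequence so the
+// result lines up with the original ordering.  For <2,3,4>:
+//
+//   typedef exclusive_scan_integer_sequence< integer_sequence<int,2,3,4> > scan ;
+//   // scan::type  is integer_sequence<int,0,2,5>
+//   // scan::value == 9   (the total)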
+
+//----------------------------------------
+
+template < typename IntegerSequence
+         , typename Result
+         , typename ResultSequence = integer_sequence<typename IntegerSequence::value_type>
+         >
+struct inclusive_scan_integer_sequence_helper;
+
+template <typename T, T h0, T... tail, typename Result, T... results>
+struct inclusive_scan_integer_sequence_helper
+  < integer_sequence<T, h0, tail...>
+  , Result
+  , integer_sequence<T, results...> >
+  : public inclusive_scan_integer_sequence_helper
+     < integer_sequence<T, tail...>
+     , std::integral_constant<T,Result::value+h0>
+     , integer_sequence<T, h0, (results+h0)...> >
+{};
+
+template <typename T, typename Result, T... results>
+struct inclusive_scan_integer_sequence_helper
+  < integer_sequence<T>, Result, integer_sequence<T, results...> >
+{
+  using type = integer_sequence<T, results...>;
+  static constexpr T value = Result::value ;
+};
+
+template <typename IntegerSequence>
+struct inclusive_scan_integer_sequence
+{
+  using value_type = typename IntegerSequence::value_type;
+  using helper =
+    inclusive_scan_integer_sequence_helper
+       < reverse_integer_sequence<IntegerSequence>
+       , std::integral_constant< value_type , 0 >
+       > ;
+  using type = typename helper::type ;
+  static constexpr value_type value  = helper::value ;
+};
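+
+// Example (illustrative only), analogous to the exclusive scan above:
+// for integer_sequence<int,2,3,4> the inclusive scan type is
+// integer_sequence<int,2,5,9> and the value is 9.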
+
+}} // namespace Kokkos::Impl
+
+
+#endif //KOKKOS_CORE_IMPL_UTILITIES_HPP
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f23db87a98d88b0633048c8517335e2311610f2b
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp
@@ -0,0 +1,604 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP
+#define KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP
+
+#include <Kokkos_Array.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+template< class DataType , class ArrayLayout , class V , size_t N , class P >
+struct ViewDataAnalysis< DataType , ArrayLayout , Kokkos::Array<V,N,P> >
+{
+private:
+
+  typedef ViewArrayAnalysis<DataType> array_analysis ;
+
+  static_assert( std::is_same<P,void>::value , "" );
+  static_assert( std::is_same<typename array_analysis::non_const_value_type , Kokkos::Array<V,N,P> >::value , "" );
+  static_assert( std::is_scalar<V>::value , "View of Array type must be of a scalar type" );
+
+public:
+
+  typedef Kokkos::Array<>  specialize ;
+
+  typedef typename array_analysis::dimension  dimension ;
+
+private:
+
+  enum { is_const = std::is_same< typename array_analysis::value_type
+                                , typename array_analysis::const_value_type
+                                >::value };
+
+  typedef typename dimension::template append<N>::type array_scalar_dimension ;
+
+  typedef typename std::conditional< is_const , const V , V >::type  scalar_type ;
+  typedef V       non_const_scalar_type ;
+  typedef const V const_scalar_type ;
+
+public:
+
+  typedef typename array_analysis::value_type            value_type ;
+  typedef typename array_analysis::const_value_type      const_value_type ;
+  typedef typename array_analysis::non_const_value_type  non_const_value_type ;
+
+  typedef typename ViewDataType<           value_type , dimension >::type  type ;
+  typedef typename ViewDataType<     const_value_type , dimension >::type  const_type ;
+  typedef typename ViewDataType< non_const_value_type , dimension >::type  non_const_type ;
+
+  typedef typename ViewDataType<           scalar_type , array_scalar_dimension >::type  scalar_array_type ;
+  typedef typename ViewDataType<     const_scalar_type , array_scalar_dimension >::type  const_scalar_array_type ;
+  typedef typename ViewDataType< non_const_scalar_type , array_scalar_dimension >::type  non_const_scalar_array_type ;
+};
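+
+// Example (illustrative only, hypothetical element type): for
+// DataType = Kokkos::Array<double,3>* the analysis reports
+// value_type Kokkos::Array<double,3> and scalar_array_type double*[3],
+// i.e. the Array extent N is appended as one extra dimension over the
+// underlying scalar type.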
+
+}} // namespace Kokkos::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  View mapping for non-specialized data type and standard layout */
+template< class Traits >
+class ViewMapping< Traits ,
+  typename std::enable_if<(
+    std::is_same< typename Traits::specialize , Kokkos::Array<> >::value &&
+    ( std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value ||
+      std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value ||
+      std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value )
+  )>::type >
+{
+private:
+
+  template< class , class ... > friend class ViewMapping ;
+  template< class , class ... > friend class Kokkos::View ;
+
+  typedef ViewOffset< typename Traits::dimension
+                    , typename Traits::array_layout
+                    , void
+                    >  offset_type ;
+
+  typedef typename Traits::value_type::pointer handle_type ;
+
+  handle_type  m_handle ;
+  offset_type  m_offset ;
+  size_t       m_stride ;
+
+  typedef typename Traits::value_type::value_type scalar_type ;
+
+  typedef Kokkos::Array< scalar_type , ~size_t(0) , Kokkos::Array<>::contiguous >  contiguous_reference ;
+  typedef Kokkos::Array< scalar_type , ~size_t(0) , Kokkos::Array<>::strided >     strided_reference ;
+
+  enum { is_contiguous_reference =
+    ( Traits::rank == 0 ) || ( std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value ) };
+
+  enum { Array_N = Traits::value_type::size() };
+  enum { Array_S = is_contiguous_reference ? Array_N : 1 };
+
+  KOKKOS_INLINE_FUNCTION
+  ViewMapping( const handle_type & arg_handle , const offset_type & arg_offset )
+    : m_handle( arg_handle )
+    , m_offset( arg_offset )
+    , m_stride( is_contiguous_reference ? 0 : arg_offset.span() )
+    {}
+
+public:
+
+  //----------------------------------------
+  // Domain dimensions
+
+  enum { Rank = Traits::dimension::rank };
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr size_t extent( const iType & r ) const
+    { return m_offset.m_dim.extent(r); }
+
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename Traits::array_layout layout() const
+    { return m_offset.layout(); }
+
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_offset.dimension_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_offset.dimension_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_offset.dimension_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_offset.dimension_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_offset.dimension_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_offset.dimension_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_offset.dimension_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_offset.dimension_7(); }
+
+  // Is a regular layout with uniform striding for each index.
+  using is_regular = typename offset_type::is_regular ;
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_offset.stride_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_offset.stride_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_offset.stride_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_offset.stride_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_offset.stride_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_offset.stride_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_offset.stride_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_offset.stride_7(); }
+
+  //----------------------------------------
+  // Range span
+
+  /** \brief  Span of the mapped range */
+  KOKKOS_INLINE_FUNCTION constexpr size_t span() const
+    { return m_offset.span() * Array_N ; }
+
+  /** \brief  Is the mapped range span contiguous */
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const
+    { return m_offset.span_is_contiguous(); }
+
+  typedef typename std::conditional< is_contiguous_reference , contiguous_reference , strided_reference >::type  reference_type ;
+
+  typedef handle_type pointer_type ;
+
+  /** \brief  If data references are lvalue_reference then the pointer to memory can be queried */
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const
+    { return m_handle ; }
+
+  //----------------------------------------
+  // The View class performs all rank and bounds checking before
+  // calling these element reference methods.
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference() const { return reference_type( m_handle + 0 , Array_N , 0 ); }
+
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  reference( const I0 & i0 ) const
+    { return reference_type( m_handle + m_offset(i0) * Array_S , Array_N , m_stride ); }
+
+  template< typename I0 , typename I1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 ) const
+    { return reference_type( m_handle + m_offset(i0,i1) * Array_S , Array_N , m_stride ); }
+
+  template< typename I0 , typename I1 , typename I2 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 ) const
+    { return reference_type( m_handle + m_offset(i0,i1,i2) * Array_S , Array_N , m_stride ); }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 ) const
+    { return reference_type( m_handle + m_offset(i0,i1,i2,i3) * Array_S , Array_N , m_stride ); }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 ) const
+    { return reference_type( m_handle + m_offset(i0,i1,i2,i3,i4) * Array_S , Array_N , m_stride ); }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 , const I5 & i5 ) const
+    { return reference_type( m_handle + m_offset(i0,i1,i2,i3,i4,i5) * Array_S , Array_N , m_stride ); }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 , const I5 & i5 , const I6 & i6 ) const
+    { return reference_type( m_handle + m_offset(i0,i1,i2,i3,i4,i5,i6) * Array_S , Array_N , m_stride ); }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 , typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7 ) const
+    { return reference_type( m_handle + m_offset(i0,i1,i2,i3,i4,i5,i6,i7) * Array_S , Array_N , m_stride ); }
+
+  //----------------------------------------
+
+private:
+
+  enum { MemorySpanMask = 8 - 1 /* Force alignment on 8 byte boundary */ };
+  enum { MemorySpanSize = sizeof(scalar_type) };
+
+public:
+
+  /** \brief  Span, in bytes, of the referenced memory */
+  KOKKOS_INLINE_FUNCTION constexpr size_t memory_span() const
+    {
+      return ( m_offset.span() * Array_N * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
+    }
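+
+  // Example (illustrative only): the mask rounds the byte count up to a
+  // multiple of 8, e.g. a 20-byte span becomes ( 20 + 7 ) & ~7 == 24.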
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION ~ViewMapping() {}
+  KOKKOS_INLINE_FUNCTION ViewMapping() : m_handle(), m_offset(), m_stride(0) {}
+  KOKKOS_INLINE_FUNCTION ViewMapping( const ViewMapping & rhs )
+    : m_handle( rhs.m_handle ), m_offset( rhs.m_offset ), m_stride( rhs.m_stride ) {}
+  KOKKOS_INLINE_FUNCTION ViewMapping & operator = ( const ViewMapping & rhs )
+    { m_handle = rhs.m_handle ; m_offset = rhs.m_offset ; m_stride = rhs.m_stride ; return *this ; }
+
+  KOKKOS_INLINE_FUNCTION ViewMapping( ViewMapping && rhs )
+    : m_handle( rhs.m_handle ), m_offset( rhs.m_offset ), m_stride( rhs.m_stride ) {}
+  KOKKOS_INLINE_FUNCTION ViewMapping & operator = ( ViewMapping && rhs )
+    { m_handle = rhs.m_handle ; m_offset = rhs.m_offset ; m_stride = rhs.m_stride ; return *this ; }
+
+  //----------------------------------------
+
+  template< class ... Args >
+  KOKKOS_INLINE_FUNCTION
+  ViewMapping( pointer_type ptr , Args ... args )
+    : m_handle( ptr )
+    , m_offset( std::integral_constant< unsigned , 0 >() , args... )
+    , m_stride( m_offset.span() )
+    {}
+
+  //----------------------------------------
+
+  template< class ... P >
+  Kokkos::Impl::SharedAllocationRecord<> *
+  allocate_shared( Kokkos::Impl::ViewCtorProp< P... > const & arg_prop
+                 , typename Traits::array_layout const & arg_layout
+                 )
+  {
+    typedef Kokkos::Impl::ViewCtorProp< P... > alloc_prop ;
+
+    typedef typename alloc_prop::execution_space  execution_space ;
+    typedef typename Traits::memory_space         memory_space ;
+    typedef ViewValueFunctor< execution_space , scalar_type > functor_type ;
+    typedef Kokkos::Impl::SharedAllocationRecord< memory_space , functor_type > record_type ;
+
+    // Query the mapping for byte-size of allocation.
+    typedef std::integral_constant< unsigned ,
+      alloc_prop::allow_padding ? sizeof(scalar_type) : 0 > padding ;
+
+    m_offset = offset_type( padding(), arg_layout );
+
+    const size_t alloc_size =
+      ( m_offset.span() * Array_N * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
+
+    // Allocate memory from the memory space and create tracking record.
+    record_type * const record =
+      record_type::allocate( ((Kokkos::Impl::ViewCtorProp<void,memory_space> const &) arg_prop ).value
+                           , ((Kokkos::Impl::ViewCtorProp<void,std::string>  const &) arg_prop ).value
+                           , alloc_size );
+
+    if ( alloc_size ) {
+      m_handle =
+        handle_type( reinterpret_cast< pointer_type >( record->data() ) );
+
+      if ( alloc_prop::initialize ) {
+        // The functor constructs and destroys
+        record->m_destroy = functor_type( ((Kokkos::Impl::ViewCtorProp<void,execution_space> const & )arg_prop).value
+                                        , (pointer_type) m_handle
+                                        , m_offset.span() * Array_N
+                                        );
+
+        record->m_destroy.construct_shared_allocation();
+      }
+    }
+
+    return record ;
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/** \brief  Assign compatible default mappings */
+
+template< class DstTraits , class SrcTraits >
+class ViewMapping< DstTraits , SrcTraits ,
+  typename std::enable_if<(
+    std::is_same< typename DstTraits::memory_space , typename SrcTraits::memory_space >::value
+    &&
+    std::is_same< typename DstTraits::specialize , Kokkos::Array<> >::value
+    &&
+    (
+      std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
+      std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
+      std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
+    )
+    &&
+    std::is_same< typename SrcTraits::specialize , Kokkos::Array<> >::value
+    &&
+    (
+      std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
+      std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
+      std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
+    )
+  )>::type >
+{
+public:
+
+  enum { is_assignable = true };
+
+  typedef Kokkos::Impl::SharedAllocationTracker  TrackType ;
+  typedef ViewMapping< DstTraits , void >  DstType ;
+  typedef ViewMapping< SrcTraits , void >  SrcType ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void assign( DstType & dst , const SrcType & src , const TrackType & src_track )
+    {
+      static_assert( std::is_same< typename DstTraits::value_type , typename SrcTraits::value_type >::value ||
+                     std::is_same< typename DstTraits::value_type , typename SrcTraits::const_value_type >::value
+                   , "View assignment must have same value type or const = non-const" );
+
+      static_assert( ViewDimensionAssignable< typename DstTraits::dimension , typename SrcTraits::dimension >::value
+                   , "View assignment must have compatible dimensions" );
+
+      static_assert( std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value ||
+                     std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value ||
+                     ( DstTraits::dimension::rank == 0 ) ||
+                     ( DstTraits::dimension::rank == 1 && DstTraits::dimension::rank_dynamic == 1 )
+                   , "View assignment must have compatible layout or have rank <= 1" );
+
+      typedef typename DstType::offset_type  dst_offset_type ;
+
+      dst.m_offset = dst_offset_type( src.m_offset );
+      dst.m_handle = src.m_handle ;
+      dst.m_stride = src.m_stride ;
+    }
+};
+
+/** \brief Assign Array to non-Array */
+
+template< class DstTraits , class SrcTraits >
+class ViewMapping< DstTraits , SrcTraits ,
+  typename std::enable_if<(
+    std::is_same< typename DstTraits::memory_space , typename SrcTraits::memory_space >::value
+    &&
+    std::is_same< typename DstTraits::specialize , void >::value
+    &&
+    (
+      std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
+      std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
+      std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
+    )
+    &&
+    std::is_same< typename SrcTraits::specialize , Kokkos::Array<> >::value
+    &&
+    (
+      std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
+      std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
+      std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
+    )
+  )>::type >
+{
+public:
+
+  // Can only convert to View::array_type
+
+  enum { is_assignable = std::is_same< typename DstTraits::data_type ,    typename SrcTraits::scalar_array_type >::value &&
+                         std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value };
+
+  typedef Kokkos::Impl::SharedAllocationTracker  TrackType ;
+  typedef ViewMapping< DstTraits , void >  DstType ;
+  typedef ViewMapping< SrcTraits , void >  SrcType ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void assign( DstType & dst , const SrcType & src , const TrackType & src_track )
+    {
+      static_assert( is_assignable , "Can only convert to array_type" );
+
+      typedef typename DstType::offset_type  dst_offset_type ;
+
+      // Array dimension becomes the last dimension.
+      // Arguments beyond the destination rank are ignored.
+      if ( src.span_is_contiguous() ) { // not padded
+        dst.m_offset = dst_offset_type( std::integral_constant<unsigned,0>() ,
+          typename DstTraits::array_layout
+            ( ( 0 < SrcType::Rank ? src.dimension_0() : SrcTraits::value_type::size() )
+            , ( 1 < SrcType::Rank ? src.dimension_1() : SrcTraits::value_type::size() )
+            , ( 2 < SrcType::Rank ? src.dimension_2() : SrcTraits::value_type::size() )
+            , ( 3 < SrcType::Rank ? src.dimension_3() : SrcTraits::value_type::size() )
+            , ( 4 < SrcType::Rank ? src.dimension_4() : SrcTraits::value_type::size() )
+            , ( 5 < SrcType::Rank ? src.dimension_5() : SrcTraits::value_type::size() )
+            , ( 6 < SrcType::Rank ? src.dimension_6() : SrcTraits::value_type::size() )
+            , ( 7 < SrcType::Rank ? src.dimension_7() : SrcTraits::value_type::size() )
+            ) );
+      }
+      else { // is padded
+        typedef std::integral_constant<unsigned,sizeof(typename SrcTraits::value_type::value_type)> padded ;
+
+        dst.m_offset = dst_offset_type( padded() ,
+          typename DstTraits::array_layout
+            ( ( 0 < SrcType::Rank ? src.dimension_0() : SrcTraits::value_type::size() )
+            , ( 1 < SrcType::Rank ? src.dimension_1() : SrcTraits::value_type::size() )
+            , ( 2 < SrcType::Rank ? src.dimension_2() : SrcTraits::value_type::size() )
+            , ( 3 < SrcType::Rank ? src.dimension_3() : SrcTraits::value_type::size() )
+            , ( 4 < SrcType::Rank ? src.dimension_4() : SrcTraits::value_type::size() )
+            , ( 5 < SrcType::Rank ? src.dimension_5() : SrcTraits::value_type::size() )
+            , ( 6 < SrcType::Rank ? src.dimension_6() : SrcTraits::value_type::size() )
+            , ( 7 < SrcType::Rank ? src.dimension_7() : SrcTraits::value_type::size() )
+            ) );
+      }
+
+      dst.m_handle = src.m_handle ;
+    }
+};
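As a usage sketch of what this Array-to-array_type mapping enables (assuming the Kokkos::Array View specialization provided by this header is active in the build; the view name and extents below are illustrative only):

```cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Rank-1 View whose value type is a Kokkos::Array of three doubles.
    Kokkos::View<Kokkos::Array<double, 3>*> a("a", 10);

    // Conversion to the flattened array type: the Array extent becomes the
    // last dimension, as the mapping above constructs the destination layout.
    Kokkos::View<double*[3]> flat = a;

    flat(0, 1) = 42.0;  // aliases a(0)[1]
  }
  Kokkos::finalize();
  return 0;
}
```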
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template< class SrcTraits , class ... Args >
+struct ViewMapping
+  < typename std::enable_if<(
+      std::is_same< typename SrcTraits::specialize , Kokkos::Array<> >::value
+      &&
+      (
+        std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
+        std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
+        std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
+      )
+    )>::type
+  , SrcTraits
+  , Args ... >
+{
+private:
+
+  static_assert( SrcTraits::rank == sizeof...(Args) , "" );
+
+  enum : bool
+    { R0 = is_integral_extent<0,Args...>::value
+    , R1 = is_integral_extent<1,Args...>::value
+    , R2 = is_integral_extent<2,Args...>::value
+    , R3 = is_integral_extent<3,Args...>::value
+    , R4 = is_integral_extent<4,Args...>::value
+    , R5 = is_integral_extent<5,Args...>::value
+    , R6 = is_integral_extent<6,Args...>::value
+    , R7 = is_integral_extent<7,Args...>::value
+    };
+
+  enum { rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3)
+              + unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) };
+
+  // Whether right-most rank is a range.
+  enum { R0_rev = 0 == SrcTraits::rank ? false : (
+                  1 == SrcTraits::rank ? R0 : (
+                  2 == SrcTraits::rank ? R1 : (
+                  3 == SrcTraits::rank ? R2 : (
+                  4 == SrcTraits::rank ? R3 : (
+                  5 == SrcTraits::rank ? R4 : (
+                  6 == SrcTraits::rank ? R5 : (
+                  7 == SrcTraits::rank ? R6 : R7 ))))))) };
+
+  // Subview's layout
+  typedef typename std::conditional<
+      ( /* Same array layout IF */
+        ( rank == 0 ) /* output rank zero */
+        ||
+        // Output rank 1 or 2, input layout Left, range in dimension 0:
+        // the result keeps stride-one access in the first index and at most
+        // one additional stride for the second index.
+        ( rank <= 2 && R0 && std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value )
+        ||
+        // Output rank 1 or 2, input layout Right, range in dimension [InputRank-1]:
+        // the result keeps stride-one access in the last index and at most
+        // one additional stride for the other index.
+        ( rank <= 2 && R0_rev && std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value )
+      ), typename SrcTraits::array_layout , Kokkos::LayoutStride
+      >::type array_layout ;
+
+  typedef typename SrcTraits::value_type  value_type ;
+
+  typedef typename std::conditional< rank == 0 , value_type ,
+          typename std::conditional< rank == 1 , value_type * ,
+          typename std::conditional< rank == 2 , value_type ** ,
+          typename std::conditional< rank == 3 , value_type *** ,
+          typename std::conditional< rank == 4 , value_type **** ,
+          typename std::conditional< rank == 5 , value_type ***** ,
+          typename std::conditional< rank == 6 , value_type ****** ,
+          typename std::conditional< rank == 7 , value_type ******* ,
+                                                 value_type ********
+          >::type >::type >::type >::type >::type >::type >::type >::type
+     data_type ;
+
+public:
+
+  typedef Kokkos::ViewTraits
+    < data_type
+    , array_layout
+    , typename SrcTraits::device_type
+    , typename SrcTraits::memory_traits > traits_type ;
+
+  typedef Kokkos::View
+    < data_type
+    , array_layout
+    , typename SrcTraits::device_type
+    , typename SrcTraits::memory_traits > type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void assign( ViewMapping< traits_type , void > & dst
+                    , ViewMapping< SrcTraits , void > const & src
+                    , Args ... args )
+    {
+      typedef ViewMapping< traits_type , void >  DstType ;
+
+      typedef typename DstType::offset_type  dst_offset_type ;
+      typedef typename DstType::handle_type  dst_handle_type ;
+
+      const SubviewExtents< SrcTraits::rank , rank >
+        extents( src.m_offset.m_dim , args... );
+
+      dst.m_offset = dst_offset_type( src.m_offset , extents );
+      dst.m_handle = dst_handle_type( src.m_handle +
+                                      src.m_offset( extents.domain_offset(0)
+                                                  , extents.domain_offset(1)
+                                                  , extents.domain_offset(2)
+                                                  , extents.domain_offset(3)
+                                                  , extents.domain_offset(4)
+                                                  , extents.domain_offset(5)
+                                                  , extents.domain_offset(6)
+                                                  , extents.domain_offset(7)
+                                                  ) );
+    }
+};
+
+}} // namespace Kokkos::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP */
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f5b19f915fd81f26913884b158ca6be647520bf4
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
@@ -0,0 +1,274 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_IMPL_VIEW_CTOR_PROP_HPP
+#define KOKKOS_EXPERIMENTAL_IMPL_VIEW_CTOR_PROP_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/* For backward compatibility */
+
+struct ViewAllocateWithoutInitializing {
+
+  const std::string label ;
+
+  ViewAllocateWithoutInitializing() : label() {}
+
+  explicit
+  ViewAllocateWithoutInitializing( const std::string & arg_label ) : label( arg_label ) {}
+
+  explicit
+  ViewAllocateWithoutInitializing( const char * const  arg_label ) : label( arg_label ) {}
+};
+
+} /* namespace Kokkos */
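For reference, this backward-compatible wrapper is passed where a label would normally go. A minimal sketch (assumes a configured Kokkos build; names and sizes are illustrative):

```cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // The label is recorded, but the allocation is not filled with the
    // default value, so the usual initialization kernel is skipped.
    Kokkos::View<double*> v(Kokkos::ViewAllocateWithoutInitializing("v"), 1000);

    Kokkos::deep_copy(v, 0.0);  // initialize explicitly when required
  }
  Kokkos::finalize();
  return 0;
}
```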
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+struct WithoutInitializing_t {};
+struct AllowPadding_t {};
+struct NullSpace_t {};
+
+//----------------------------------------------------------------------------
+/**\brief Whether a type can be used for a view label */
+
+template < typename >
+struct is_view_label : public std::false_type {};
+
+template<>
+struct is_view_label< std::string > : public std::true_type {};
+
+template< unsigned N >
+struct is_view_label< char[N] > : public std::true_type {};
+
+template< unsigned N >
+struct is_view_label< const char[N] > : public std::true_type {};
+
+//----------------------------------------------------------------------------
+
+template< typename ... P >
+struct ViewCtorProp ;
+
+// Forward declare
+template< typename Specialize , typename T >
+struct CommonViewAllocProp ;
+
+/* Common value_type stored as ViewCtorProp
+ */
+template< typename Specialize , typename T >
+struct ViewCtorProp< void , CommonViewAllocProp<Specialize,T> >
+{
+  ViewCtorProp() = default ;
+  ViewCtorProp( const ViewCtorProp & ) = default ;
+  ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
+
+  using type = CommonViewAllocProp<Specialize,T> ;
+
+  KOKKOS_INLINE_FUNCTION
+  ViewCtorProp( const type & arg ) : value( arg ) {}
+  KOKKOS_INLINE_FUNCTION
+  ViewCtorProp( type && arg ) : value( arg ) {}
+
+  type value ;
+};
+
+/*  std::integral_constant<unsigned,I> are dummy arguments
+ *  that avoid duplicate base class errors
+ */
+template< unsigned I >
+struct ViewCtorProp< void , std::integral_constant<unsigned,I> >
+{
+  ViewCtorProp() = default ;
+  ViewCtorProp( const ViewCtorProp & ) = default ;
+  ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
+
+  template< typename P >
+  KOKKOS_INLINE_FUNCTION
+  ViewCtorProp( const P & ) {}
+};
+
+/* Property flags have constexpr value */
+template< typename P >
+struct ViewCtorProp
+  < typename std::enable_if<
+      std::is_same< P , AllowPadding_t >::value ||
+      std::is_same< P , WithoutInitializing_t >::value
+    >::type
+  , P
+  >
+{
+  ViewCtorProp() = default ;
+  ViewCtorProp( const ViewCtorProp & ) = default ;
+  ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
+
+  typedef P type ;
+
+  ViewCtorProp( const type & ) {}
+
+  static constexpr type value = type();
+};
+
+/* Map input label type to std::string */
+template< typename Label >
+struct ViewCtorProp
+  < typename std::enable_if< is_view_label< Label >::value >::type
+  , Label
+  >
+{
+  ViewCtorProp() = default ;
+  ViewCtorProp( const ViewCtorProp & ) = default ;
+  ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
+
+  typedef std::string type ;
+
+  ViewCtorProp( const type & arg ) : value( arg ) {}
+  ViewCtorProp( type && arg ) : value( arg ) {}
+
+  type value ;
+};
+
+template< typename Space >
+struct ViewCtorProp
+  < typename std::enable_if<
+      Kokkos::Impl::is_memory_space<Space>::value ||
+      Kokkos::Impl::is_execution_space<Space>::value
+    >::type
+  , Space
+  >
+{
+  ViewCtorProp() = default ;
+  ViewCtorProp( const ViewCtorProp & ) = default ;
+  ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
+
+  typedef Space type ;
+
+  ViewCtorProp( const type & arg ) : value( arg ) {}
+
+  type value ;
+};
+
+
+template< typename T >
+struct ViewCtorProp < void , T * >
+{
+  ViewCtorProp() = default ;
+  ViewCtorProp( const ViewCtorProp & ) = default ;
+  ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
+
+  typedef T * type ;
+
+  KOKKOS_INLINE_FUNCTION
+  ViewCtorProp( const type arg ) : value( arg ) {}
+
+  type value ;
+};
+
+
+template< typename ... P >
+struct ViewCtorProp : public ViewCtorProp< void , P > ...
+{
+private:
+
+  typedef Kokkos::Impl::has_condition< void , Kokkos::Impl::is_memory_space , P ... >
+    var_memory_space ;
+
+  typedef Kokkos::Impl::has_condition< void , Kokkos::Impl::is_execution_space , P ... >
+    var_execution_space ;
+
+  struct VOIDDUMMY{};
+
+  typedef Kokkos::Impl::has_condition< VOIDDUMMY , std::is_pointer , P ... >
+    var_pointer ;
+
+public:
+
+  /* Flags for the common properties */
+  enum { has_memory_space    = var_memory_space::value };
+  enum { has_execution_space = var_execution_space::value };
+  enum { has_pointer         = var_pointer::value };
+  enum { has_label           = Kokkos::Impl::has_type< std::string , P... >::value };
+  enum { allow_padding       = Kokkos::Impl::has_type< AllowPadding_t , P... >::value };
+  enum { initialize          = ! Kokkos::Impl::has_type< WithoutInitializing_t , P ... >::value };
+
+  typedef typename var_memory_space::type     memory_space ;
+  typedef typename var_execution_space::type  execution_space ;
+  typedef typename var_pointer::type          pointer_type ;
+
+  /*  Copy from a matching argument list.
+   *  Requires std::is_same< P , typename ViewCtorProp< void , Args >::type >::value
+   *  for each corresponding property / argument pair.
+   */
+  template< typename ... Args >
+  inline
+  ViewCtorProp( Args const & ... args )
+    : ViewCtorProp< void , P >( args ) ...
+    {}
+
+  template< typename ... Args >
+  KOKKOS_INLINE_FUNCTION
+  ViewCtorProp( pointer_type arg0 , Args const & ... args )
+    : ViewCtorProp< void , pointer_type >( arg0 )
+    , ViewCtorProp< void , typename ViewCtorProp< void , Args >::type >( args ) ...
+    {}
+
+  /* Copy from a matching property subset */
+  template< typename ... Args >
+  ViewCtorProp( ViewCtorProp< Args ... > const & arg )
+    : ViewCtorProp< void , Args >( ((ViewCtorProp<void,Args> const &) arg ) ) ...
+    {}
+};
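A compile-time sketch of the flags this property pack computes (ViewCtorProp is an implementation detail; the example assumes `<Kokkos_Core.hpp>` pulls in this header and is for illustration only):

```cpp
#include <Kokkos_Core.hpp>
#include <string>

// A label plus the WithoutInitializing flag, as the variadic ViewCtorProp
// above would aggregate them.
using Props = Kokkos::Impl::ViewCtorProp<std::string,
                                         Kokkos::Impl::WithoutInitializing_t>;

static_assert(Props::has_label,          "a std::string property acts as the label");
static_assert(!Props::initialize,        "WithoutInitializing_t suppresses the fill");
static_assert(!Props::allow_padding,     "no AllowPadding_t in the pack");
static_assert(!Props::has_memory_space,  "no memory space was supplied");

int main() { return 0; }
```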
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..39c6048958f944d6c66ce84ce86963e722d3645e
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
@@ -0,0 +1,3285 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP
+#define KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP
+
+#include <type_traits>
+#include <initializer_list>
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Pair.hpp>
+#include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_ViewCtor.hpp>
+#include <impl/Kokkos_Atomic_View.hpp>
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< unsigned I , size_t ... Args >
+struct variadic_size_t
+  { enum { value = ~size_t(0) }; };
+
+template< size_t Val , size_t ... Args >
+struct variadic_size_t< 0 , Val , Args ... >
+  { enum { value = Val }; };
+
+template< unsigned I , size_t Val , size_t ... Args >
+struct variadic_size_t< I , Val , Args ... >
+  { enum { value = variadic_size_t< I - 1 , Args ... >::value }; };
+
+template< size_t ... Args >
+struct rank_dynamic ;
+
+template<>
+struct rank_dynamic<> { enum { value = 0 }; };
+
+template< size_t Val , size_t ... Args >
+struct rank_dynamic< Val , Args... >
+{
+  enum { value = ( Val == 0 ? 1 : 0 ) + rank_dynamic< Args... >::value };
+};
+
+#define KOKKOS_IMPL_VIEW_DIMENSION( R ) \
+  template< size_t V , unsigned > struct ViewDimension ## R \
+    { \
+      enum { ArgN ## R = ( V != ~size_t(0) ? V : 1 ) }; \
+      enum { N ## R = ( V != ~size_t(0) ? V : 1 ) }; \
+      KOKKOS_INLINE_FUNCTION explicit ViewDimension ## R ( size_t ) {} \
+      ViewDimension ## R () = default ; \
+      ViewDimension ## R ( const ViewDimension ## R  & ) = default ; \
+      ViewDimension ## R & operator = ( const ViewDimension ## R  & ) = default ; \
+    }; \
+  template< unsigned RD > struct ViewDimension ## R < 0 , RD > \
+    { \
+      enum { ArgN ## R = 0 }; \
+      typename std::conditional<( RD < 3 ), size_t , unsigned >::type N ## R ; \
+      ViewDimension ## R () = default ; \
+      ViewDimension ## R ( const ViewDimension ## R  & ) = default ; \
+      ViewDimension ## R & operator = ( const ViewDimension ## R  & ) = default ; \
+      KOKKOS_INLINE_FUNCTION explicit ViewDimension ## R ( size_t V ) : N ## R ( V ) {} \
+    };
+
+KOKKOS_IMPL_VIEW_DIMENSION( 0 )
+KOKKOS_IMPL_VIEW_DIMENSION( 1 )
+KOKKOS_IMPL_VIEW_DIMENSION( 2 )
+KOKKOS_IMPL_VIEW_DIMENSION( 3 )
+KOKKOS_IMPL_VIEW_DIMENSION( 4 )
+KOKKOS_IMPL_VIEW_DIMENSION( 5 )
+KOKKOS_IMPL_VIEW_DIMENSION( 6 )
+KOKKOS_IMPL_VIEW_DIMENSION( 7 )
+
+#undef KOKKOS_IMPL_VIEW_DIMENSION
+
+template< size_t ... Vals >
+struct ViewDimension
+  : public ViewDimension0< variadic_size_t<0,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+  , public ViewDimension1< variadic_size_t<1,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+  , public ViewDimension2< variadic_size_t<2,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+  , public ViewDimension3< variadic_size_t<3,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+  , public ViewDimension4< variadic_size_t<4,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+  , public ViewDimension5< variadic_size_t<5,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+  , public ViewDimension6< variadic_size_t<6,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+  , public ViewDimension7< variadic_size_t<7,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+{
+  typedef ViewDimension0< variadic_size_t<0,Vals...>::value
+                        , rank_dynamic< Vals... >::value > D0 ;
+  typedef ViewDimension1< variadic_size_t<1,Vals...>::value
+                        , rank_dynamic< Vals... >::value > D1 ;
+  typedef ViewDimension2< variadic_size_t<2,Vals...>::value
+                        , rank_dynamic< Vals... >::value > D2 ;
+  typedef ViewDimension3< variadic_size_t<3,Vals...>::value
+                        , rank_dynamic< Vals... >::value > D3 ;
+  typedef ViewDimension4< variadic_size_t<4,Vals...>::value
+                        , rank_dynamic< Vals... >::value > D4 ;
+  typedef ViewDimension5< variadic_size_t<5,Vals...>::value
+                        , rank_dynamic< Vals... >::value > D5 ;
+  typedef ViewDimension6< variadic_size_t<6,Vals...>::value
+                        , rank_dynamic< Vals... >::value > D6 ;
+  typedef ViewDimension7< variadic_size_t<7,Vals...>::value
+                        , rank_dynamic< Vals... >::value > D7 ;
+
+  using D0::ArgN0 ;
+  using D1::ArgN1 ;
+  using D2::ArgN2 ;
+  using D3::ArgN3 ;
+  using D4::ArgN4 ;
+  using D5::ArgN5 ;
+  using D6::ArgN6 ;
+  using D7::ArgN7 ;
+
+  using D0::N0 ;
+  using D1::N1 ;
+  using D2::N2 ;
+  using D3::N3 ;
+  using D4::N4 ;
+  using D5::N5 ;
+  using D6::N6 ;
+  using D7::N7 ;
+
+  enum { rank = sizeof...(Vals) };
+  enum { rank_dynamic = Impl::rank_dynamic< Vals... >::value };
+
+  ViewDimension() = default ;
+  ViewDimension( const ViewDimension & ) = default ;
+  ViewDimension & operator = ( const ViewDimension & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr
+  ViewDimension( size_t n0 , size_t n1 , size_t n2 , size_t n3
+               , size_t n4 , size_t n5 , size_t n6 , size_t n7 )
+    : D0( n0 )
+    , D1( n1 )
+    , D2( n2 )
+    , D3( n3 )
+    , D4( n4 )
+    , D5( n5 )
+    , D6( n6 )
+    , D7( n7 )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_t extent( const unsigned r ) const
+    {
+      return r == 0 ? N0 : (
+             r == 1 ? N1 : (
+             r == 2 ? N2 : (
+             r == 3 ? N3 : (
+             r == 4 ? N4 : (
+             r == 5 ? N5 : (
+             r == 6 ? N6 : (
+             r == 7 ? N7 : 0 )))))));
+    }
+
+  template< size_t N >
+  struct prepend { typedef ViewDimension< N , Vals... > type ; };
+
+  template< size_t N >
+  struct append { typedef ViewDimension< Vals... , N > type ; };
+};
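A compile-time sketch of how ViewDimension encodes extents (implementation detail, illustration only; assumes a Kokkos build providing these headers):

```cpp
#include <Kokkos_Core.hpp>

// Corresponds to a data type such as double**[5]: two dynamic extents
// followed by one static extent of 5.
using Dim = Kokkos::Impl::ViewDimension<0, 0, 5>;

static_assert(Dim::rank == 3,         "three extents in total");
static_assert(Dim::rank_dynamic == 2, "each leading 0 counts as a dynamic extent");
static_assert(Dim::ArgN2 == 5,        "the static extent is kept in ArgN2");

int main() { return 0; }
```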
+
+template< class A , class B >
+struct ViewDimensionJoin ;
+
+template< size_t ... A , size_t ... B >
+struct ViewDimensionJoin< ViewDimension< A... > , ViewDimension< B... > > {
+  typedef ViewDimension< A... , B... > type ;
+};
+
+//----------------------------------------------------------------------------
+
+template< class DstDim , class SrcDim >
+struct ViewDimensionAssignable ;
+
+template< size_t ... DstArgs , size_t ... SrcArgs >
+struct ViewDimensionAssignable< ViewDimension< DstArgs ... >
+                              , ViewDimension< SrcArgs ... > >
+{
+  typedef ViewDimension< DstArgs... > dst ;
+  typedef ViewDimension< SrcArgs... > src ;
+
+  enum { value =
+    unsigned(dst::rank) == unsigned(src::rank) && (
+      //Compile time check that potential static dimensions match
+      ( ( 1 > dst::rank_dynamic && 1 > src::rank_dynamic ) ? (size_t(dst::ArgN0) == size_t(src::ArgN0)) : true ) &&
+      ( ( 2 > dst::rank_dynamic && 2 > src::rank_dynamic ) ? (size_t(dst::ArgN1) == size_t(src::ArgN1)) : true ) &&
+      ( ( 3 > dst::rank_dynamic && 3 > src::rank_dynamic ) ? (size_t(dst::ArgN2) == size_t(src::ArgN2)) : true ) &&
+      ( ( 4 > dst::rank_dynamic && 4 > src::rank_dynamic ) ? (size_t(dst::ArgN3) == size_t(src::ArgN3)) : true ) &&
+      ( ( 5 > dst::rank_dynamic && 5 > src::rank_dynamic ) ? (size_t(dst::ArgN4) == size_t(src::ArgN4)) : true ) &&
+      ( ( 6 > dst::rank_dynamic && 6 > src::rank_dynamic ) ? (size_t(dst::ArgN5) == size_t(src::ArgN5)) : true ) &&
+      ( ( 7 > dst::rank_dynamic && 7 > src::rank_dynamic ) ? (size_t(dst::ArgN6) == size_t(src::ArgN6)) : true ) &&
+      ( ( 8 > dst::rank_dynamic && 8 > src::rank_dynamic ) ? (size_t(dst::ArgN7) == size_t(src::ArgN7)) : true )
+    )};
+
+};
+
+}} // namespace Kokkos::Impl
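A compile-time sketch of ViewDimensionAssignable: ranks must match, and any extent that is static on both sides must agree (implementation detail, illustration only):

```cpp
#include <Kokkos_Core.hpp>

using Kokkos::Impl::ViewDimension;
using Kokkos::Impl::ViewDimensionAssignable;

static_assert(ViewDimensionAssignable<ViewDimension<0, 3>,
                                      ViewDimension<0, 3>>::value,
              "same rank, matching static extent");
static_assert(!ViewDimensionAssignable<ViewDimension<0, 3>,
                                       ViewDimension<0, 4>>::value,
              "static extents 3 and 4 differ");

int main() { return 0; }
```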
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+struct ALL_t {
+  KOKKOS_INLINE_FUNCTION
+  constexpr const ALL_t & operator()() const { return *this ; }
+};
+
+}} // namespace Kokkos::Impl
+
+namespace Kokkos {
+namespace Impl {
+
+template< class T >
+struct is_integral_extent_type
+{ enum { value = std::is_same<T,Kokkos::Impl::ALL_t>::value ? 1 : 0 }; };
+
+template< class iType >
+struct is_integral_extent_type< std::pair<iType,iType> >
+{ enum { value = std::is_integral<iType>::value ? 1 : 0 }; };
+
+template< class iType >
+struct is_integral_extent_type< Kokkos::pair<iType,iType> >
+{ enum { value = std::is_integral<iType>::value ? 1 : 0 }; };
+
+// Assuming '2 == initializer_list<iType>::size()'
+template< class iType >
+struct is_integral_extent_type< std::initializer_list<iType> >
+{ enum { value = std::is_integral<iType>::value ? 1 : 0 }; };
+
+template < unsigned I , class ... Args >
+struct is_integral_extent
+{
+  // get_type is void when sizeof...(Args) <= I
+  typedef typename std::remove_cv<
+          typename std::remove_reference<
+          typename Kokkos::Impl::get_type<I,Args...
+          >::type >::type >::type type ;
+
+  enum { value = is_integral_extent_type<type>::value };
+
+  static_assert( value ||
+                 std::is_integral<type>::value ||
+                 std::is_same<type,void>::value
+               , "subview argument must be either integral or integral extent" );
+};
+
+// Rules for subview arguments and layouts matching
+
+template<class LayoutDest, class LayoutSrc, int RankDest, int RankSrc, int CurrentArg, class ... SubViewArgs>
+struct SubviewLegalArgsCompileTime;
+
+// Rules which allow LayoutLeft to LayoutLeft assignment
+
+template<int RankDest, int RankSrc, int CurrentArg, class Arg, class ... SubViewArgs>
+struct SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankDest, RankSrc, CurrentArg, Arg, SubViewArgs...> {
+  enum { value      =(((CurrentArg==RankDest-1) && (Kokkos::Impl::is_integral_extent_type<Arg>::value)) ||
+                      ((CurrentArg>=RankDest) && (std::is_integral<Arg>::value)) ||
+                      ((CurrentArg<RankDest) && (std::is_same<Arg,Kokkos::Impl::ALL_t>::value)) ||
+                      ((CurrentArg==0) && (Kokkos::Impl::is_integral_extent_type<Arg>::value))
+                     ) && (SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankDest, RankSrc, CurrentArg+1, SubViewArgs...>::value)};
+};
+
+template<int RankDest, int RankSrc, int CurrentArg, class Arg>
+struct SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankDest, RankSrc, CurrentArg, Arg> {
+  enum { value = ((CurrentArg==RankDest-1) || (std::is_integral<Arg>::value)) &&
+                 (CurrentArg==RankSrc-1) };
+};
+
+// Rules which allow LayoutRight to LayoutRight assignment
+
+template<int RankDest, int RankSrc, int CurrentArg, class Arg, class ... SubViewArgs>
+struct SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, RankDest, RankSrc, CurrentArg, Arg, SubViewArgs...> {
+  enum { value      =(((CurrentArg==RankSrc-RankDest) && (Kokkos::Impl::is_integral_extent_type<Arg>::value)) ||
+                      ((CurrentArg<RankSrc-RankDest) && (std::is_integral<Arg>::value)) ||
+                      ((CurrentArg>=RankSrc-RankDest) && (std::is_same<Arg,Kokkos::Impl::ALL_t>::value))
+                     ) && (SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, RankDest, RankSrc, CurrentArg+1, SubViewArgs...>::value)};
+};
+
+template<int RankDest, int RankSrc, int CurrentArg, class Arg>
+struct SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, RankDest, RankSrc, CurrentArg, Arg> {
+  enum { value = ((CurrentArg==RankSrc-1) && (std::is_same<Arg,Kokkos::Impl::ALL_t>::value)) };
+};
+
+// Rules which allow assignment to LayoutStride
+
+template<int RankDest, int RankSrc, int CurrentArg, class ... SubViewArgs>
+struct SubviewLegalArgsCompileTime<Kokkos::LayoutStride,Kokkos::LayoutLeft,RankDest,RankSrc,CurrentArg,SubViewArgs...> {
+  enum { value = true };
+};
+
+template<int RankDest, int RankSrc, int CurrentArg, class ... SubViewArgs>
+struct SubviewLegalArgsCompileTime<Kokkos::LayoutStride,Kokkos::LayoutRight,RankDest,RankSrc,CurrentArg,SubViewArgs...> {
+  enum { value = true };
+};
+
+template<int RankDest, int RankSrc, int CurrentArg, class ... SubViewArgs>
+struct SubviewLegalArgsCompileTime<Kokkos::LayoutStride,Kokkos::LayoutStride,RankDest,RankSrc,CurrentArg,SubViewArgs...> {
+  enum { value = true };
+};
+
+
+template< unsigned DomainRank , unsigned RangeRank >
+struct SubviewExtents {
+private:
+
+  // Cannot declare zero-length arrays
+  // '+' is used to silence GCC 7.2.0 -Wduplicated-branches warning when RangeRank=1
+  enum { InternalRangeRank = RangeRank ? RangeRank : +1u };
+
+  size_t   m_begin[  DomainRank ];
+  size_t   m_length[ InternalRangeRank ];
+  unsigned m_index[  InternalRangeRank ];
+
+  template< size_t ... DimArgs >
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( unsigned
+          , unsigned
+          , const ViewDimension< DimArgs ... > & )
+    { return true ; }
+
+  template< class T , size_t ... DimArgs , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( unsigned domain_rank
+          , unsigned range_rank
+          , const ViewDimension< DimArgs ... > & dim
+          , const T & val
+          , Args ... args )
+    {
+      const size_t v = static_cast<size_t>(val);
+
+      m_begin[ domain_rank ] = v ;
+
+      return set( domain_rank + 1 , range_rank , dim , args... )
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+             && ( v < dim.extent( domain_rank ) )
+#endif
+      ;
+    }
+
+  // ALL_t
+  template< size_t ... DimArgs , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( unsigned domain_rank
+          , unsigned range_rank
+          , const ViewDimension< DimArgs ... > & dim
+          , const Kokkos::Impl::ALL_t
+          , Args ... args )
+    {
+      m_begin[  domain_rank ] = 0 ;
+      m_length[ range_rank  ] = dim.extent( domain_rank );
+      m_index[  range_rank  ] = domain_rank ;
+
+      return set( domain_rank + 1 , range_rank + 1 , dim , args... );
+    }
+
+  // std::pair range
+  template< class T , size_t ... DimArgs , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( unsigned domain_rank
+          , unsigned range_rank
+          , const ViewDimension< DimArgs ... > & dim
+          , const std::pair<T,T> & val
+          , Args ... args )
+    {
+      const size_t b = static_cast<size_t>( val.first );
+      const size_t e = static_cast<size_t>( val.second );
+
+      m_begin[  domain_rank ] = b ;
+      m_length[ range_rank  ] = e - b ;
+      m_index[  range_rank  ] = domain_rank ;
+
+      return set( domain_rank + 1 , range_rank + 1 , dim , args... )
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+             && ( e <= b + dim.extent( domain_rank ) )
+#endif
+      ;
+    }
+
+  // Kokkos::pair range
+  template< class T , size_t ... DimArgs , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( unsigned domain_rank
+          , unsigned range_rank
+          , const ViewDimension< DimArgs ... > & dim
+          , const Kokkos::pair<T,T> & val
+          , Args ... args )
+    {
+      const size_t b = static_cast<size_t>( val.first );
+      const size_t e = static_cast<size_t>( val.second );
+
+      m_begin[  domain_rank ] = b ;
+      m_length[ range_rank  ] = e - b ;
+      m_index[  range_rank  ] = domain_rank ;
+
+      return set( domain_rank + 1 , range_rank + 1 , dim , args... )
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+             && ( e <= b + dim.extent( domain_rank ) )
+#endif
+      ;
+    }
+
+  // { begin , end } range
+  template< class T , size_t ... DimArgs , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( unsigned domain_rank
+          , unsigned range_rank
+          , const ViewDimension< DimArgs ... > & dim
+          , const std::initializer_list< T > & val
+          , Args ... args )
+    {
+      const size_t b = static_cast<size_t>( val.begin()[0] );
+      const size_t e = static_cast<size_t>( val.begin()[1] );
+
+      m_begin[  domain_rank ] = b ;
+      m_length[ range_rank  ] = e - b ;
+      m_index[  range_rank  ] = domain_rank ;
+
+      return set( domain_rank + 1 , range_rank + 1 , dim , args... )
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+             && ( val.size() == 2 )
+             && ( e <= b + dim.extent( domain_rank ) )
+#endif
+      ;
+    }
+
+  //------------------------------
+
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+
+  template< size_t ... DimArgs >
+  void error( char *
+            , int
+            , unsigned
+            , unsigned
+            , const ViewDimension< DimArgs ... > & ) const
+    {}
+
+  template< class T , size_t ... DimArgs , class ... Args >
+  void error( char * buf , int buf_len
+            , unsigned domain_rank
+            , unsigned range_rank
+            , const ViewDimension< DimArgs ... > & dim
+            , const T & val
+            , Args ... args ) const
+    {
+      const int n = std::min( buf_len ,
+        snprintf( buf , buf_len
+                , " %lu < %lu %c"
+                , static_cast<unsigned long>(val)
+                , static_cast<unsigned long>( dim.extent( domain_rank ) )
+                , int( sizeof...(Args) ? ',' : ')' ) ) );
+
+      error( buf+n, buf_len-n, domain_rank + 1 , range_rank , dim , args... );
+    }
+
+  // Kokkos::ALL_t argument
+  template< size_t ... DimArgs , class ... Args >
+  void error( char * buf , int buf_len
+            , unsigned domain_rank
+            , unsigned range_rank
+            , const ViewDimension< DimArgs ... > & dim
+            , const Kokkos::Impl::ALL_t
+            , Args ... args ) const
+    {
+      const int n = std::min( buf_len ,
+        snprintf( buf , buf_len
+                , " Kokkos::ALL %c"
+                , int( sizeof...(Args) ? ',' : ')' ) ) );
+
+      error( buf+n , buf_len-n , domain_rank + 1 , range_rank + 1 , dim , args... );
+    }
+
+  // std::pair range
+  template< class T , size_t ... DimArgs , class ... Args >
+  void error( char * buf , int buf_len
+            , unsigned domain_rank
+            , unsigned range_rank
+            , const ViewDimension< DimArgs ... > & dim
+            , const std::pair<T,T> & val
+            , Args ... args ) const
+    {
+      // d <= e - b
+      const int n = std::min( buf_len ,
+        snprintf( buf , buf_len
+                , " %lu <= %lu - %lu %c"
+                , static_cast<unsigned long>( dim.extent( domain_rank ) )
+                , static_cast<unsigned long>( val.second )
+                , static_cast<unsigned long>( val.first )
+                , int( sizeof...(Args) ? ',' : ')' ) ) );
+
+      error( buf+n , buf_len-n , domain_rank + 1 , range_rank + 1 , dim , args... );
+    }
+
+  // Kokkos::pair range
+  template< class T , size_t ... DimArgs , class ... Args >
+  void error( char * buf , int buf_len
+            , unsigned domain_rank
+            , unsigned range_rank
+            , const ViewDimension< DimArgs ... > & dim
+            , const Kokkos::pair<T,T> & val
+            , Args ... args ) const
+    {
+      // d <= e - b
+      const int n = std::min( buf_len ,
+        snprintf( buf , buf_len
+                , " %lu <= %lu - %lu %c"
+                , static_cast<unsigned long>( dim.extent( domain_rank ) )
+                , static_cast<unsigned long>( val.second )
+                , static_cast<unsigned long>( val.first )
+                , int( sizeof...(Args) ? ',' : ')' ) ) );
+
+      error( buf+n , buf_len-n , domain_rank + 1 , range_rank + 1 , dim , args... );
+    }
+
+  // { begin , end } range
+  template< class T , size_t ... DimArgs , class ... Args >
+  void error( char * buf , int buf_len
+            , unsigned domain_rank
+            , unsigned range_rank
+            , const ViewDimension< DimArgs ... > & dim
+            , const std::initializer_list< T > & val
+            , Args ... args ) const
+    {
+      // d <= e - b
+      int n = 0 ;
+      if ( val.size() == 2 ) {
+        n = std::min( buf_len ,
+          snprintf( buf , buf_len
+                  , " %lu <= %lu - %lu %c"
+                  , static_cast<unsigned long>( dim.extent( domain_rank ) )
+                  , static_cast<unsigned long>( val.begin()[0] )
+                  , static_cast<unsigned long>( val.begin()[1] )
+                  , int( sizeof...(Args) ? ',' : ')' ) ) );
+      }
+      else {
+        n = std::min( buf_len ,
+          snprintf( buf , buf_len
+                  , " { ... }.size() == %u %c"
+                  , unsigned(val.size())
+                  , int( sizeof...(Args) ? ',' : ')' ) ) );
+      }
+
+      error( buf+n , buf_len-n , domain_rank + 1 , range_rank + 1 , dim , args... );
+    }
+
+  template< size_t ... DimArgs , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  void error( const ViewDimension< DimArgs ... > & dim , Args ... args ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      enum { LEN = 1024 };
+      char buffer[ LEN ];
+
+      const int n = snprintf(buffer,LEN,"Kokkos::subview bounds error (");
+      error( buffer+n , LEN-n , 0 , 0 , dim , args... );
+
+      Kokkos::Impl::throw_runtime_exception(std::string(buffer));
+#else
+      Kokkos::abort("Kokkos::subview bounds error");
+#endif
+    }
+
+#else
+
+  template< size_t ... DimArgs , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  void error( const ViewDimension< DimArgs ... > & , Args ... ) const {}
+
+#endif
+
+public:
+
+  template< size_t ... DimArgs , class ... Args >
+  KOKKOS_INLINE_FUNCTION
+  SubviewExtents( const ViewDimension< DimArgs ... > & dim , Args ... args )
+    {
+      static_assert( DomainRank == sizeof...(DimArgs) , "" );
+      static_assert( DomainRank == sizeof...(Args) , "" );
+
+      // Verifies that each of the (at most 8) arguments is either an
+      // integral index, an integral extent, or absent.
+      static_assert( RangeRank ==
+        unsigned( is_integral_extent<0,Args...>::value ) +
+        unsigned( is_integral_extent<1,Args...>::value ) +
+        unsigned( is_integral_extent<2,Args...>::value ) +
+        unsigned( is_integral_extent<3,Args...>::value ) +
+        unsigned( is_integral_extent<4,Args...>::value ) +
+        unsigned( is_integral_extent<5,Args...>::value ) +
+        unsigned( is_integral_extent<6,Args...>::value ) +
+        unsigned( is_integral_extent<7,Args...>::value ) , "" );
+
+      if ( RangeRank == 0 ) { m_length[0] = 0 ; m_index[0] = ~0u ; }
+
+      if ( ! set( 0 , 0 , dim , args... ) ) error( dim , args... );
+    }
+
+  template < typename iType >
+  KOKKOS_FORCEINLINE_FUNCTION
+  constexpr size_t domain_offset( const iType i ) const
+    { return unsigned(i) < DomainRank ? m_begin[i] : 0 ; }
+
+  template < typename iType >
+  KOKKOS_FORCEINLINE_FUNCTION
+  constexpr size_t range_extent( const iType i ) const
+    { return unsigned(i) < InternalRangeRank ? m_length[i] : 0 ; }
+
+  template < typename iType >
+  KOKKOS_FORCEINLINE_FUNCTION
+  constexpr unsigned range_index( const iType i ) const
+    { return unsigned(i) < InternalRangeRank ? m_index[i] : ~0u ; }
+};
+
+}} // namespace Kokkos::Impl
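SubviewExtents is what ultimately backs Kokkos::subview argument handling; a minimal usage sketch (assumes a configured Kokkos build; names and extents are illustrative):

```cpp
#include <Kokkos_Core.hpp>
#include <utility>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double***> v("v", 10, 20, 30);

    // Kokkos::ALL() keeps the full extent, a pair keeps [begin, end), and a
    // plain integer drops that dimension -- the three argument kinds that
    // SubviewExtents::set() distinguishes above.
    auto s = Kokkos::subview(v, Kokkos::ALL(), std::make_pair(2, 7), 4);

    // s has rank 2 with extents 10 and 5.
  }
  Kokkos::finalize();
  return 0;
}
```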
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Given a value type and dimension generate the View data type */
+template< class T , class Dim >
+struct ViewDataType ;
+
+template< class T >
+struct ViewDataType< T , ViewDimension<> >
+{
+  typedef T type ;
+};
+
+template< class T , size_t ... Args >
+struct ViewDataType< T , ViewDimension< 0 , Args... > >
+{
+  typedef typename ViewDataType<T*,ViewDimension<Args...> >::type type ;
+};
+
+template< class T , size_t N , size_t ... Args >
+struct ViewDataType< T , ViewDimension< N , Args... > >
+{
+  typedef typename ViewDataType<T,ViewDimension<Args...> >::type type[N] ;
+};
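A compile-time sketch of ViewDataType rebuilding the multidimensional data type from a value type and a ViewDimension (implementation detail, illustration only):

```cpp
#include <Kokkos_Core.hpp>
#include <type_traits>

using Dim  = Kokkos::Impl::ViewDimension<0, 0, 5>;
using Data = Kokkos::Impl::ViewDataType<double, Dim>::type;

// Each dynamic (0) extent becomes a '*', each static extent becomes '[N]'.
static_assert(std::is_same<Data, double**[5]>::value,
              "ViewDimension<0,0,5> maps back to double**[5]");

int main() { return 0; }
```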
+
+/**\brief  Analysis of View data type.
+ *
+ *  Data type conforms to one of the following patterns :
+ *    {const} value_type [][#][#][#]
+ *    {const} value_type ***[#][#][#]
+ *  Where the sum of counts of '*' and '[#]' is at most ten.
+ *
+ *  Provide typedef for the ViewDimension<...> and value_type.
+ */
+template< class T >
+struct ViewArrayAnalysis
+{
+  typedef T                                      value_type ;
+  typedef typename std::add_const<    T >::type  const_value_type ;
+  typedef typename std::remove_const< T >::type  non_const_value_type ;
+  typedef ViewDimension<>                        static_dimension ;
+  typedef ViewDimension<>                        dynamic_dimension ;
+  typedef ViewDimension<>                        dimension ;
+};
+
+template< class T , size_t N >
+struct ViewArrayAnalysis< T[N] >
+{
+private:
+  typedef ViewArrayAnalysis< T > nested ;
+public:
+  typedef typename nested::value_type            value_type ;
+  typedef typename nested::const_value_type      const_value_type ;
+  typedef typename nested::non_const_value_type  non_const_value_type ;
+
+  typedef typename nested::static_dimension::template prepend<N>::type
+    static_dimension ;
+
+  typedef typename nested::dynamic_dimension dynamic_dimension ;
+
+  typedef typename
+    ViewDimensionJoin< dynamic_dimension , static_dimension >::type
+      dimension ;
+};
+
+template< class T >
+struct ViewArrayAnalysis< T[] >
+{
+private:
+  typedef ViewArrayAnalysis< T > nested ;
+  typedef typename nested::dimension nested_dimension ;
+public:
+  typedef typename nested::value_type            value_type ;
+  typedef typename nested::const_value_type      const_value_type ;
+  typedef typename nested::non_const_value_type  non_const_value_type ;
+
+  typedef typename nested::dynamic_dimension::template prepend<0>::type
+    dynamic_dimension ;
+
+  typedef typename nested::static_dimension static_dimension ;
+
+  typedef typename
+    ViewDimensionJoin< dynamic_dimension , static_dimension >::type
+      dimension ;
+};
+
+template< class T >
+struct ViewArrayAnalysis< T* >
+{
+private:
+  typedef ViewArrayAnalysis< T > nested ;
+public:
+  typedef typename nested::value_type            value_type ;
+  typedef typename nested::const_value_type      const_value_type ;
+  typedef typename nested::non_const_value_type  non_const_value_type ;
+
+  typedef typename nested::dynamic_dimension::template prepend<0>::type
+    dynamic_dimension ;
+
+  typedef typename nested::static_dimension static_dimension ;
+
+  typedef typename
+    ViewDimensionJoin< dynamic_dimension , static_dimension >::type
+      dimension ;
+};
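A compile-time sketch of ViewArrayAnalysis splitting a data type into its dynamic and static extents (implementation detail, illustration only):

```cpp
#include <Kokkos_Core.hpp>
#include <type_traits>

using A = Kokkos::Impl::ViewArrayAnalysis<double*[3]>;

static_assert(std::is_same<A::value_type, double>::value,
              "the scalar value type");
static_assert(std::is_same<A::dynamic_dimension,
                           Kokkos::Impl::ViewDimension<0>>::value,
              "one dynamic extent from the '*'");
static_assert(std::is_same<A::static_dimension,
                           Kokkos::Impl::ViewDimension<3>>::value,
              "one static extent from the '[3]'");
static_assert(std::is_same<A::dimension,
                           Kokkos::Impl::ViewDimension<0, 3>>::value,
              "dynamic extents precede static extents");

int main() { return 0; }
```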
+
+
+template< class DataType , class ArrayLayout , class ValueType >
+struct ViewDataAnalysis
+{
+private:
+
+  typedef ViewArrayAnalysis< DataType > array_analysis ;
+
+  // ValueType provides an opportunity for partial specialization;
+  // it must match the array analysis when this default template is used.
+  static_assert( std::is_same< ValueType , typename array_analysis::non_const_value_type >::value , "" );
+
+public:
+
+  typedef void specialize ; // No specialization
+
+  typedef typename array_analysis::dimension             dimension ;
+  typedef typename array_analysis::value_type            value_type ;
+  typedef typename array_analysis::const_value_type      const_value_type ;
+  typedef typename array_analysis::non_const_value_type  non_const_value_type ;
+
+  // Generate analogous multidimensional array specification type.
+  typedef typename ViewDataType<           value_type , dimension >::type  type ;
+  typedef typename ViewDataType<     const_value_type , dimension >::type  const_type ;
+  typedef typename ViewDataType< non_const_value_type , dimension >::type  non_const_type ;
+
+  // Generate "flattened" multidimensional array specification type.
+  typedef type            scalar_array_type ;
+  typedef const_type      const_scalar_array_type ;
+  typedef non_const_type  non_const_scalar_array_type ;
+};
+
+}} // namespace Kokkos::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template < class Dimension , class Layout , typename Enable = void >
+struct ViewOffset {
+  using is_mapping_plugin = std::false_type ;
+};
+
+//----------------------------------------------------------------------------
+// LayoutLeft AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding
+template < class Dimension >
+struct ViewOffset< Dimension , Kokkos::LayoutLeft
+                 , typename std::enable_if<( 1 >= Dimension::rank
+                                             ||
+                                             0 == Dimension::rank_dynamic
+                                           )>::type >
+{
+  using is_mapping_plugin = std::true_type ;
+  using is_regular        = std::true_type ;
+
+  typedef size_t             size_type ;
+  typedef Dimension          dimension_type ;
+  typedef Kokkos::LayoutLeft array_layout ;
+
+  dimension_type m_dim ;
+
+  //----------------------------------------
+
+  // rank 1
+  template< typename I0 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 ) const { return i0 ; }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+    { return i0 + m_dim.N0 * i1 ; }
+
+  //rank 3
+  template < typename I0, typename I1, typename I2 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const
+  {
+    return i0 + m_dim.N0 * ( i1 + m_dim.N1 * i2 );
+  }
+
+  //rank 4
+  template < typename I0, typename I1, typename I2, typename I3 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const
+  {
+    return i0 + m_dim.N0 * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * i3 ));
+  }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4 ) const
+  {
+    return i0 + m_dim.N0 * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * i4 )));
+  }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5 ) const
+  {
+    return i0 + m_dim.N0 * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * i5 ))));
+  }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6 ) const
+  {
+    return i0 + m_dim.N0 * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * (
+           i5 + m_dim.N5 * i6 )))));
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const
+  {
+    return i0 + m_dim.N0 * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * (
+           i5 + m_dim.N5 * (
+           i6 + m_dim.N6 * i7 ))))));
+  }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr array_layout layout() const
+    {
+      return array_layout( m_dim.N0 , m_dim.N1 , m_dim.N2 , m_dim.N3
+                         , m_dim.N4 , m_dim.N5 , m_dim.N6 , m_dim.N7 );
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; }
+
+  /* Cardinality of the domain index space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type size() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  /* Span of the range space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type span() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return true ; }
+
+  /* Strides of dimensions */
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_dim.N0 * m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 ; }
+
+  // Fills the stride array; the entry at index [ rank ] is the total span.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      s[0] = 1 ;
+      if ( 0 < dimension_type::rank ) { s[1] = m_dim.N0 ; }
+      if ( 1 < dimension_type::rank ) { s[2] = s[1] * m_dim.N1 ; }
+      if ( 2 < dimension_type::rank ) { s[3] = s[2] * m_dim.N2 ; }
+      if ( 3 < dimension_type::rank ) { s[4] = s[3] * m_dim.N3 ; }
+      if ( 4 < dimension_type::rank ) { s[5] = s[4] * m_dim.N4 ; }
+      if ( 5 < dimension_type::rank ) { s[6] = s[5] * m_dim.N5 ; }
+      if ( 6 < dimension_type::rank ) { s[7] = s[6] * m_dim.N6 ; }
+      if ( 7 < dimension_type::rank ) { s[8] = s[7] * m_dim.N7 ; }
+    }
+
+  //----------------------------------------
+
+  ViewOffset() = default ;
+  ViewOffset( const ViewOffset & ) = default ;
+  ViewOffset & operator = ( const ViewOffset & ) = default ;
+
+  template< unsigned TrivialScalarSize >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( std::integral_constant<unsigned,TrivialScalarSize> const &
+    , Kokkos::LayoutLeft const & arg_layout
+    )
+    : m_dim( arg_layout.dimension[0], 0, 0, 0, 0, 0, 0, 0 )
+    {}
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs )
+    : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3
+           , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 )
+    {
+      static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" );
+      // Also requires equal static dimensions ...
+    }
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs )
+    : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert(
+        ( DimRHS::rank == 0 &&
+          dimension_type::rank == 0 ) ||
+        ( DimRHS::rank == 1 &&
+          dimension_type::rank == 1 &&
+          dimension_type::rank_dynamic == 1 )
+        , "ViewOffset LayoutLeft and LayoutRight are only compatible when rank <= 1" );
+    }
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutStride , void > & rhs )
+    : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert( 
+        ( DimRHS::rank == 0 &&
+          dimension_type::rank == 0 ) ||
+        ( DimRHS::rank == 1 &&
+          dimension_type::rank == 1 &&
+          dimension_type::rank_dynamic == 1 )
+        , "ViewOffset LayoutLeft and LayoutStride are only compatible when rank <= 1" );
+      if ( rhs.m_stride.S0 != 1 ) {
+        Kokkos::abort("Kokkos::Impl::ViewOffset assignment of LayoutLeft from LayoutStride  requires stride == 1" );
+      }
+    }
+
+  //----------------------------------------
+  // Subview construction
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset(
+    const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & ,
+    const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub )
+    : m_dim( sub.range_extent(0), 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert( ( 0 == dimension_type::rank ) ||
+                     ( 1 == dimension_type::rank && 1 == dimension_type::rank_dynamic && 1 <= DimRHS::rank )
+                   , "ViewOffset subview construction requires compatible rank" );
+    }
+};
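A standalone sketch of the index formula this unpadded LayoutLeft ViewOffset implements (the helper below is hypothetical and only mirrors the rank-3 operator() above):

```cpp
#include <cstddef>
#include <cstdio>

// Column-major offset with the innermost index first and no padding,
// matching operator()(i0,i1,i2) = i0 + N0 * (i1 + N1 * i2).
std::size_t layout_left_offset(std::size_t i0, std::size_t i1, std::size_t i2,
                               std::size_t N0, std::size_t N1) {
  return i0 + N0 * (i1 + N1 * i2);
}

int main() {
  // For extents 4 x 3 x 2, element (1,2,1) lives at 1 + 4*(2 + 3*1) = 21.
  std::printf("%zu\n", layout_left_offset(1, 2, 1, 4, 3));
  return 0;
}
```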
+
+//----------------------------------------------------------------------------
+// LayoutLeft AND ( 1 < rank AND 0 < rank_dynamic ) : has padding / striding
+template < class Dimension >
+struct ViewOffset< Dimension , Kokkos::LayoutLeft
+                 , typename std::enable_if<( 1 < Dimension::rank
+                                             &&
+                                             0 < Dimension::rank_dynamic
+                                           )>::type >
+{
+  using is_mapping_plugin = std::true_type ;
+  using is_regular        = std::true_type ;
+
+  typedef size_t             size_type ;
+  typedef Dimension          dimension_type ;
+  typedef Kokkos::LayoutLeft array_layout ;
+
+  dimension_type m_dim ;
+  size_type      m_stride ;
+
+  //----------------------------------------
+
+  // rank 1
+  template< typename I0 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 ) const { return i0 ; }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+    { return i0 + m_stride * i1 ; }
+
+  //rank 3
+  template < typename I0, typename I1, typename I2 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const
+  {
+    return i0 + m_stride * ( i1 + m_dim.N1 * i2 );
+  }
+
+  //rank 4
+  template < typename I0, typename I1, typename I2, typename I3 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const
+  {
+    return i0 + m_stride * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * i3 ));
+  }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4 ) const
+  {
+    return i0 + m_stride * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * i4 )));
+  }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5 ) const
+  {
+    return i0 + m_stride * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * i5 ))));
+  }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6 ) const
+  {
+    return i0 + m_stride * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * (
+           i5 + m_dim.N5 * i6 )))));
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const
+  {
+    return i0 + m_stride * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * (
+           i5 + m_dim.N5 * (
+           i6 + m_dim.N6 * i7 ))))));
+  }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr array_layout layout() const
+    {
+      return array_layout( m_dim.N0 , m_dim.N1 , m_dim.N2 , m_dim.N3
+                         , m_dim.N4 , m_dim.N5 , m_dim.N6 , m_dim.N7 );
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; }
+
+  /* Cardinality of the domain index space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type size() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  /* Span of the range space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type span() const
+    { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return m_stride == m_dim.N0 ; }
+
+  /* Strides of dimensions */
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_stride ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_stride * m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_stride * m_dim.N1 * m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 ; }
+
+  // The stride array entry at index [ rank ] is the total length (span)
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      s[0] = 1 ;
+      if ( 0 < dimension_type::rank ) { s[1] = m_stride ; }
+      if ( 1 < dimension_type::rank ) { s[2] = s[1] * m_dim.N1 ; }
+      if ( 2 < dimension_type::rank ) { s[3] = s[2] * m_dim.N2 ; }
+      if ( 3 < dimension_type::rank ) { s[4] = s[3] * m_dim.N3 ; }
+      if ( 4 < dimension_type::rank ) { s[5] = s[4] * m_dim.N4 ; }
+      if ( 5 < dimension_type::rank ) { s[6] = s[5] * m_dim.N5 ; }
+      if ( 6 < dimension_type::rank ) { s[7] = s[6] * m_dim.N6 ; }
+      if ( 7 < dimension_type::rank ) { s[8] = s[7] * m_dim.N7 ; }
+    }
+
+  //----------------------------------------
+
+private:
+
+  template< unsigned TrivialScalarSize >
+  struct Padding {
+    enum { div = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT / ( TrivialScalarSize ? TrivialScalarSize : 1 ) };
+    enum { mod = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT % ( TrivialScalarSize ? TrivialScalarSize : 1 ) };
+
+    // If memory alignment is a multiple of the trivial scalar size then attempt to align.
+    enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
+    enum { div_ok = (div != 0) ? div : 1 }; // Guards against modulo by zero in the constexpr expression below
+
+    KOKKOS_INLINE_FUNCTION
+    static constexpr size_t stride( size_t const N )
+    {
+      return ( (align != 0) && ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && ((N % div_ok) != 0) )
+             ? N + align - ( N % div_ok ) : N ;
+    }
+  };
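+
+  // Worked illustration of the padding arithmetic above (alignment values are
+  // configuration dependent; a 64-byte MEMORY_ALIGNMENT and a small
+  // MEMORY_ALIGNMENT_THRESHOLD are assumed here purely for the example):
+  // with TrivialScalarSize == 8 (e.g. double), div == 64/8 == 8 and mod == 0,
+  // so align == 8.  A leading dimension N == 1001 is then padded to
+  // 1001 + 8 - (1001 % 8) == 1008, while an N that is already a multiple of 8
+  // (or below the threshold) is returned unchanged.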
+
+public:
+
+  ViewOffset() = default ;
+  ViewOffset( const ViewOffset & ) = default ;
+  ViewOffset & operator = ( const ViewOffset & ) = default ;
+
+  /* Enable padding for trivial scalar types with non-zero trivial scalar size */
+  template< unsigned TrivialScalarSize >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( std::integral_constant<unsigned,TrivialScalarSize> const &
+    , Kokkos::LayoutLeft const & arg_layout
+    )
+    : m_dim( arg_layout.dimension[0] , arg_layout.dimension[1]
+           , arg_layout.dimension[2] , arg_layout.dimension[3]
+           , arg_layout.dimension[4] , arg_layout.dimension[5]
+           , arg_layout.dimension[6] , arg_layout.dimension[7]
+           )
+    , m_stride( Padding<TrivialScalarSize>::stride( arg_layout.dimension[0] ) )
+    {}
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs )
+    : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3
+           , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 )
+    , m_stride( rhs.stride_1() )
+    {
+      static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" );
+      // Also requires equal static dimensions ...
+    }
+
+  //----------------------------------------
+  // Subview construction
+  // This subview must have 2 == rank and 2 == rank_dynamic
+  // because it only carries stride #0.
+  // The source dimension #0 must be non-zero for a stride-one leading dimension.
+  // At most one subsequent dimension can be non-zero.
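+  //
+  // Hypothetical usage sketch (layout deduction details elided): a slab of a
+  // rank-3 LayoutLeft view that keeps the leading dimension, e.g.
+  //   Kokkos::View<double***, Kokkos::LayoutLeft> a( "a", N0, N1, N2 );
+  //   auto s = Kokkos::subview( a, Kokkos::ALL(), Kokkos::ALL(), k );  // rank-2 result
+  // enters this constructor with range_index(1) == 1, so m_stride becomes
+  // rhs.stride_1() and dimension #0 stays stride-one.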
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs ,
+      const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub )
+    : m_dim( sub.range_extent(0)
+           , sub.range_extent(1)
+           , sub.range_extent(2)
+           , sub.range_extent(3)
+           , sub.range_extent(4)
+           , sub.range_extent(5)
+           , sub.range_extent(6)
+           , sub.range_extent(7))
+    , m_stride( ( 1 == sub.range_index(1) ? rhs.stride_1() :
+                ( 2 == sub.range_index(1) ? rhs.stride_2() :
+                ( 3 == sub.range_index(1) ? rhs.stride_3() :
+                ( 4 == sub.range_index(1) ? rhs.stride_4() :
+                ( 5 == sub.range_index(1) ? rhs.stride_5() :
+                ( 6 == sub.range_index(1) ? rhs.stride_6() :
+                ( 7 == sub.range_index(1) ? rhs.stride_7() : 0 ))))))))
+    {
+      //static_assert( ( 2 == dimension_type::rank ) &&
+      //               ( 2 == dimension_type::rank_dynamic ) &&
+      //               ( 2 <= DimRHS::rank )
+      //             , "ViewOffset subview construction requires compatible rank" );
+    }
+};
+
+//----------------------------------------------------------------------------
+// LayoutRight AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding
+template < class Dimension >
+struct ViewOffset< Dimension , Kokkos::LayoutRight
+                 , typename std::enable_if<( 1 >= Dimension::rank
+                                             ||
+                                             0 == Dimension::rank_dynamic
+                                           )>::type >
+{
+  using is_mapping_plugin = std::true_type ;
+  using is_regular        = std::true_type ;
+
+  typedef size_t              size_type ;
+  typedef Dimension           dimension_type ;
+  typedef Kokkos::LayoutRight array_layout ;
+
+  dimension_type m_dim ;
+
+  //----------------------------------------
+
+  // rank 1
+  template< typename I0 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 ) const { return i0 ; }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+    { return i1 + m_dim.N1 * i0 ; }
+
+  //rank 3
+  template < typename I0, typename I1, typename I2 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const
+  {
+    return i2 + m_dim.N2 * ( i1 + m_dim.N1 * ( i0 ));
+  }
+
+  //rank 4
+  template < typename I0, typename I1, typename I2, typename I3 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const
+  {
+    return i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * (
+           i1 + m_dim.N1 * ( i0 )));
+  }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4 ) const
+  {
+    return i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * (
+           i1 + m_dim.N1 * ( i0 ))));
+  }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5 ) const
+  {
+    return i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * (
+           i1 + m_dim.N1 * ( i0 )))));
+  }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6 ) const
+  {
+    return i6 + m_dim.N6 * (
+           i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * (
+           i1 + m_dim.N1 * ( i0 ))))));
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const
+  {
+    return i7 + m_dim.N7 * (
+           i6 + m_dim.N6 * (
+           i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * (
+           i1 + m_dim.N1 * ( i0 )))))));
+  }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr array_layout layout() const
+    {
+      return array_layout( m_dim.N0 , m_dim.N1 , m_dim.N2 , m_dim.N3
+                         , m_dim.N4 , m_dim.N5 , m_dim.N6 , m_dim.N7 );
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; }
+
+  /* Cardinality of the domain index space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type size() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  /* Span of the range space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type span() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return true ; }
+
+  /* Strides of dimensions */
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_dim.N7 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_dim.N7 * m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 * m_dim.N1 ; }
+
+  // The stride array entry at index [ rank ] is the total length (span)
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      size_type n = 1 ;
+      if ( 7 < dimension_type::rank ) { s[7] = n ; n *= m_dim.N7 ; }
+      if ( 6 < dimension_type::rank ) { s[6] = n ; n *= m_dim.N6 ; }
+      if ( 5 < dimension_type::rank ) { s[5] = n ; n *= m_dim.N5 ; }
+      if ( 4 < dimension_type::rank ) { s[4] = n ; n *= m_dim.N4 ; }
+      if ( 3 < dimension_type::rank ) { s[3] = n ; n *= m_dim.N3 ; }
+      if ( 2 < dimension_type::rank ) { s[2] = n ; n *= m_dim.N2 ; }
+      if ( 1 < dimension_type::rank ) { s[1] = n ; n *= m_dim.N1 ; }
+      if ( 0 < dimension_type::rank ) { s[0] = n ; }
+      s[dimension_type::rank] = n * m_dim.N0 ;
+    }
+
+  //----------------------------------------
+
+  ViewOffset() = default ;
+  ViewOffset( const ViewOffset & ) = default ;
+  ViewOffset & operator = ( const ViewOffset & ) = default ;
+
+  template< unsigned TrivialScalarSize >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( std::integral_constant<unsigned,TrivialScalarSize> const &
+    , Kokkos::LayoutRight const & arg_layout
+    )
+    : m_dim( arg_layout.dimension[0], 0, 0, 0, 0, 0, 0, 0 )
+    {}
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs )
+    : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3
+           , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 )
+    {
+      static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" );
+      // Also requires equal static dimensions ...
+    }
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs )
+    : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert(
+       ( DimRHS::rank == 0 &&
+         dimension_type::rank == 0 ) ||
+       ( DimRHS::rank == 1 &&
+         dimension_type::rank == 1 &&
+         dimension_type::rank_dynamic == 1 )
+      , "ViewOffset LayoutRight and LayoutLeft are only compatible when rank <= 1" );
+    }
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutStride , void > & rhs )
+    : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert(
+       ( DimRHS::rank == 0 &&
+         dimension_type::rank == 0 ) ||
+       ( DimRHS::rank == 1 &&
+         dimension_type::rank == 1 &&
+         dimension_type::rank_dynamic == 1 )
+      , "ViewOffset LayoutRight and LayoutString are only compatible when rank <= 1" );
+      if ( rhs.m_stride.S0 != 1 ) {
+        Kokkos::abort("Kokkos::Impl::ViewOffset assignment of LayoutLeft/Right from LayoutStride  requires stride == 1" );
+      }
+    }
+
+  //----------------------------------------
+  // Subview construction
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs
+    , const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub
+    )
+    : m_dim( sub.range_extent(0) , 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert( ( 0 == dimension_type::rank_dynamic ) ||
+                     ( 1 == dimension_type::rank && 1 == dimension_type::rank_dynamic && 1 <= DimRHS::rank )
+                   , "ViewOffset subview construction requires compatible rank" );
+    }
+};
+
+//----------------------------------------------------------------------------
+// LayoutRight AND ( 1 < rank AND 0 < rank_dynamic ) : has padding / striding
+template < class Dimension >
+struct ViewOffset< Dimension , Kokkos::LayoutRight
+                 , typename std::enable_if<( 1 < Dimension::rank
+                                             &&
+                                             0 < Dimension::rank_dynamic
+                                           )>::type >
+{
+  using is_mapping_plugin = std::true_type ;
+  using is_regular        = std::true_type ;
+
+  typedef size_t               size_type ;
+  typedef Dimension            dimension_type ;
+  typedef Kokkos::LayoutRight  array_layout ;
+
+  dimension_type m_dim ;
+  size_type      m_stride ;
+
+  //----------------------------------------
+
+  // rank 1
+  template< typename I0 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 ) const { return i0 ; }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+  { return i1 + i0 * m_stride ; }
+
+  //rank 3
+  template < typename I0, typename I1, typename I2 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const
+  { return i2 + m_dim.N2 * ( i1 ) + i0 * m_stride ; }
+
+  //rank 4
+  template < typename I0, typename I1, typename I2, typename I3 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const
+  {
+    return i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * ( i1 )) +
+           i0 * m_stride ;
+  }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4 ) const
+  {
+    return i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * ( i1 ))) +
+           i0 * m_stride ;
+  }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5 ) const
+  {
+    return i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * ( i1 )))) +
+           i0 * m_stride ;
+  }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6 ) const
+  {
+    return i6 + m_dim.N6 * (
+           i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * ( i1 ))))) +
+           i0 * m_stride ;
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const
+  {
+    return i7 + m_dim.N7 * (
+           i6 + m_dim.N6 * (
+           i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * ( i1 )))))) +
+           i0 * m_stride ;
+  }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr array_layout layout() const
+    {
+      return array_layout( m_dim.N0 , m_dim.N1 , m_dim.N2 , m_dim.N3
+                         , m_dim.N4 , m_dim.N5 , m_dim.N6 , m_dim.N7 );
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; }
+
+  /* Cardinality of the domain index space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type size() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  /* Span of the range space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type span() const
+    { return m_dim.N0 * m_stride ; }
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const
+    { return m_stride == m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 * m_dim.N1 ; }
+
+  /* Strides of dimensions */
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_dim.N7 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_dim.N7 * m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_stride ; }
+
+  // The stride array entry at index [ rank ] is the total length (span)
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      size_type n = 1 ;
+      if ( 7 < dimension_type::rank ) { s[7] = n ; n *= m_dim.N7 ; }
+      if ( 6 < dimension_type::rank ) { s[6] = n ; n *= m_dim.N6 ; }
+      if ( 5 < dimension_type::rank ) { s[5] = n ; n *= m_dim.N5 ; }
+      if ( 4 < dimension_type::rank ) { s[4] = n ; n *= m_dim.N4 ; }
+      if ( 3 < dimension_type::rank ) { s[3] = n ; n *= m_dim.N3 ; }
+      if ( 2 < dimension_type::rank ) { s[2] = n ; n *= m_dim.N2 ; }
+      if ( 1 < dimension_type::rank ) { s[1] = n ; }
+      if ( 0 < dimension_type::rank ) { s[0] = m_stride ; }
+      s[dimension_type::rank] = m_stride * m_dim.N0 ;
+    }
+
+  //----------------------------------------
+
+private:
+
+  template< unsigned TrivialScalarSize >
+  struct Padding {
+    enum { div = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT / ( TrivialScalarSize ? TrivialScalarSize : 1 ) };
+    enum { mod = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT % ( TrivialScalarSize ? TrivialScalarSize : 1 ) };
+
+    // If memory alignment is a multiple of the trivial scalar size then attempt to align.
+    enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
+    enum { div_ok = (div != 0) ? div : 1 }; // Guards against modulo by zero in the constexpr expression below
+
+    KOKKOS_INLINE_FUNCTION
+    static constexpr size_t stride( size_t const N )
+    {
+      return ( (align != 0) && ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && ((N % div_ok) != 0) )
+             ? N + align - ( N % div_ok ) : N ;
+    }
+  };
+
+public:
+
+  ViewOffset() = default ;
+  ViewOffset( const ViewOffset & ) = default ;
+  ViewOffset & operator = ( const ViewOffset & ) = default ;
+
+  /* Enable padding for trivial scalar types with non-zero trivial scalar size.  */
+  template< unsigned TrivialScalarSize >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( std::integral_constant<unsigned,TrivialScalarSize> const &
+    , Kokkos::LayoutRight const & arg_layout
+    )
+    : m_dim( arg_layout.dimension[0] , arg_layout.dimension[1]
+           , arg_layout.dimension[2] , arg_layout.dimension[3]
+           , arg_layout.dimension[4] , arg_layout.dimension[5]
+           , arg_layout.dimension[6] , arg_layout.dimension[7]
+           )
+    , m_stride( Padding<TrivialScalarSize>::
+                  stride( /* 2 <= rank */
+                          m_dim.N1 * ( dimension_type::rank == 2 ? 1 :
+                          m_dim.N2 * ( dimension_type::rank == 3 ? 1 :
+                          m_dim.N3 * ( dimension_type::rank == 4 ? 1 :
+                          m_dim.N4 * ( dimension_type::rank == 5 ? 1 :
+                          m_dim.N5 * ( dimension_type::rank == 6 ? 1 :
+                          m_dim.N6 * ( dimension_type::rank == 7 ? 1 : m_dim.N7 )))))) ))
+    {}
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs )
+    : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3
+           , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 )
+    , m_stride( rhs.stride_0() )
+    {
+      static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" );
+      // Also requires equal static dimensions ...
+    }
+
+  //----------------------------------------
+  // Subview construction
+  // Last dimension must be non-zero
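+  //
+  // Hypothetical usage sketch (layout deduction details elided): keeping the
+  // trailing dimensions of a rank-3 LayoutRight view, e.g.
+  //   Kokkos::View<double***, Kokkos::LayoutRight> a( "a", N0, N1, N2 );
+  //   auto s = Kokkos::subview( a, k, Kokkos::ALL(), Kokkos::ALL() );  // rank-2 result
+  // enters this constructor with range_index(0) == 1, so m_stride becomes
+  // rhs.stride_1() and the last dimension stays stride-one.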
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs
+    , const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub
+    )
+    : m_dim( sub.range_extent(0)
+           , sub.range_extent(1)
+           , sub.range_extent(2)
+           , sub.range_extent(3)
+           , sub.range_extent(4)
+           , sub.range_extent(5)
+           , sub.range_extent(6)
+           , sub.range_extent(7))
+    , m_stride( 0 == sub.range_index(0) ? rhs.stride_0() : (
+                1 == sub.range_index(0) ? rhs.stride_1() : (
+                2 == sub.range_index(0) ? rhs.stride_2() : (
+                3 == sub.range_index(0) ? rhs.stride_3() : (
+                4 == sub.range_index(0) ? rhs.stride_4() : (
+                5 == sub.range_index(0) ? rhs.stride_5() : (
+                6 == sub.range_index(0) ? rhs.stride_6() : 0 )))))))
+    {
+/*      // This subview must be 2 == rank and 2 == rank_dynamic
+      // due to only having stride #0.
+      // The source dimension #0 must be non-zero for stride-one leading dimension.
+      // At most one subsequent dimension can be non-zero.
+
+      static_assert( (( 2 == dimension_type::rank ) &&
+                      ( 2 <= DimRHS::rank )) ||
+                     ()
+                   , "ViewOffset subview construction requires compatible rank" );
+*/
+    }
+};
+
+//----------------------------------------------------------------------------
+/* A strided array layout only makes sense for 0 < rank. */
+/* rank = 0 is included for the DynRankView case. */
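+
+/* A hypothetical usage sketch: LayoutStride carries an explicit stride per
+ * dimension, e.g.
+ *   Kokkos::LayoutStride layout( 10, 1, 4, 20 );   // N0=10 S0=1, N1=4 S1=20
+ *   Kokkos::View<double**, Kokkos::LayoutStride> v( "v", layout );
+ * so v(i0,i1) maps to offset i0*1 + i1*20 (see the rank-2 operator() below). */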
+
+template< unsigned Rank >
+struct ViewStride ;
+
+template<>
+struct ViewStride<0> {
+  enum { S0 = 0 , S1 = 0 , S2 = 0 , S3 = 0 , S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t , size_t , size_t , size_t
+                      , size_t , size_t , size_t , size_t )
+    {}
+};
+
+template<>
+struct ViewStride<1> {
+  size_t S0 ;
+  enum { S1 = 0 , S2 = 0 , S3 = 0 , S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t , size_t , size_t
+                      , size_t , size_t , size_t , size_t )
+    : S0( aS0 )
+    {}
+};
+
+template<>
+struct ViewStride<2> {
+  size_t S0 , S1 ;
+  enum { S2 = 0 , S3 = 0 , S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t aS1 , size_t , size_t
+                      , size_t , size_t , size_t , size_t )
+    : S0( aS0 ) , S1( aS1 )
+    {}
+};
+
+template<>
+struct ViewStride<3> {
+  size_t S0 , S1 , S2 ;
+  enum { S3 = 0 , S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t
+                      , size_t , size_t , size_t , size_t )
+    : S0( aS0 ) , S1( aS1 ) , S2( aS2 )
+    {}
+};
+
+template<>
+struct ViewStride<4> {
+  size_t S0 , S1 , S2 , S3 ;
+  enum { S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t aS3
+                      , size_t , size_t , size_t , size_t )
+    : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) , S3( aS3 )
+    {}
+};
+
+template<>
+struct ViewStride<5> {
+  size_t S0 , S1 , S2 , S3 , S4 ;
+  enum { S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t aS3
+                      , size_t aS4 , size_t , size_t , size_t )
+    : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) , S3( aS3 )
+    , S4( aS4 )
+    {}
+};
+
+template<>
+struct ViewStride<6> {
+  size_t S0 , S1 , S2 , S3 , S4 , S5 ;
+  enum { S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t aS3
+                      , size_t aS4 , size_t aS5 , size_t , size_t )
+    : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) , S3( aS3 )
+    , S4( aS4 ) , S5( aS5 )
+    {}
+};
+
+template<>
+struct ViewStride<7> {
+  size_t S0 , S1 , S2 , S3 , S4 , S5 , S6 ;
+  enum { S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t aS3
+                      , size_t aS4 , size_t aS5 , size_t aS6 , size_t )
+    : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) , S3( aS3 )
+    , S4( aS4 ) , S5( aS5 ) , S6( aS6 )
+    {}
+};
+
+template<>
+struct ViewStride<8> {
+  size_t S0 , S1 , S2 , S3 , S4 , S5 , S6 , S7 ;
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t aS3
+                      , size_t aS4 , size_t aS5 , size_t aS6 , size_t aS7 )
+    : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) , S3( aS3 )
+    , S4( aS4 ) , S5( aS5 ) , S6( aS6 ) , S7( aS7 )
+    {}
+};
+
+template < class Dimension >
+struct ViewOffset< Dimension , Kokkos::LayoutStride
+                 , void >
+{
+private:
+  typedef ViewStride< Dimension::rank >  stride_type ;
+public:
+
+  using is_mapping_plugin = std::true_type ;
+  using is_regular        = std::true_type ;
+
+  typedef size_t                size_type ;
+  typedef Dimension             dimension_type ;
+  typedef Kokkos::LayoutStride  array_layout ;
+
+  dimension_type  m_dim ;
+  stride_type     m_stride ;
+
+  //----------------------------------------
+
+  // rank 1
+  template< typename I0 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 ) const
+  {
+    return i0 * m_stride.S0 ;
+  }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+  {
+    return i0 * m_stride.S0 +
+           i1 * m_stride.S1 ;
+  }
+
+  //rank 3
+  template < typename I0, typename I1, typename I2 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const
+  {
+    return i0 * m_stride.S0 +
+           i1 * m_stride.S1 +
+           i2 * m_stride.S2 ;
+  }
+
+  //rank 4
+  template < typename I0, typename I1, typename I2, typename I3 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const
+  {
+    return i0 * m_stride.S0 +
+           i1 * m_stride.S1 +
+           i2 * m_stride.S2 +
+           i3 * m_stride.S3 ;
+  }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4 ) const
+  {
+    return i0 * m_stride.S0 +
+           i1 * m_stride.S1 +
+           i2 * m_stride.S2 +
+           i3 * m_stride.S3 +
+           i4 * m_stride.S4 ;
+  }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5 ) const
+  {
+    return i0 * m_stride.S0 +
+           i1 * m_stride.S1 +
+           i2 * m_stride.S2 +
+           i3 * m_stride.S3 +
+           i4 * m_stride.S4 +
+           i5 * m_stride.S5 ;
+  }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6 ) const
+  {
+    return i0 * m_stride.S0 +
+           i1 * m_stride.S1 +
+           i2 * m_stride.S2 +
+           i3 * m_stride.S3 +
+           i4 * m_stride.S4 +
+           i5 * m_stride.S5 +
+           i6 * m_stride.S6 ;
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const
+  {
+    return i0 * m_stride.S0 +
+           i1 * m_stride.S1 +
+           i2 * m_stride.S2 +
+           i3 * m_stride.S3 +
+           i4 * m_stride.S4 +
+           i5 * m_stride.S5 +
+           i6 * m_stride.S6 +
+           i7 * m_stride.S7 ;
+  }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr array_layout layout() const
+    {
+      return array_layout( m_dim.N0 , m_stride.S0
+                         , m_dim.N1 , m_stride.S1
+                         , m_dim.N2 , m_stride.S2
+                         , m_dim.N3 , m_stride.S3
+                         , m_dim.N4 , m_stride.S4
+                         , m_dim.N5 , m_stride.S5
+                         , m_dim.N6 , m_stride.S6
+                         , m_dim.N7 , m_stride.S7
+                         );
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; }
+
+  /* Cardinality of the domain index space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type size() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+private:
+
+  KOKKOS_INLINE_FUNCTION
+  static constexpr size_type Max( size_type lhs , size_type rhs )
+    { return lhs < rhs ? rhs : lhs ; }
+
+public:
+
+  /* Span of the range space, largest stride * dimension */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type span() const
+    {
+      return Max( m_dim.N0 * m_stride.S0 ,
+             Max( m_dim.N1 * m_stride.S1 ,
+             Max( m_dim.N2 * m_stride.S2 ,
+             Max( m_dim.N3 * m_stride.S3 ,
+             Max( m_dim.N4 * m_stride.S4 ,
+             Max( m_dim.N5 * m_stride.S5 ,
+             Max( m_dim.N6 * m_stride.S6 ,
+                  m_dim.N7 * m_stride.S7 )))))));
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return span() == size(); }
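+
+  // Illustration with assumed values: for N0=10, S0=1, N1=4, S1=20 the span is
+  // max(10*1, 4*20) == 80 while size() == 40, so the view is not contiguous;
+  // with S1 == 10 instead, span() == size() == 40 and span_is_contiguous()
+  // returns true.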
+
+  /* Strides of dimensions */
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_stride.S0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_stride.S1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_stride.S2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_stride.S3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_stride.S4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_stride.S5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_stride.S6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return m_stride.S7 ; }
+
+  // The stride array entry at index [ rank ] is the total length (span)
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      if ( 0 < dimension_type::rank ) { s[0] = m_stride.S0 ; }
+      if ( 1 < dimension_type::rank ) { s[1] = m_stride.S1 ; }
+      if ( 2 < dimension_type::rank ) { s[2] = m_stride.S2 ; }
+      if ( 3 < dimension_type::rank ) { s[3] = m_stride.S3 ; }
+      if ( 4 < dimension_type::rank ) { s[4] = m_stride.S4 ; }
+      if ( 5 < dimension_type::rank ) { s[5] = m_stride.S5 ; }
+      if ( 6 < dimension_type::rank ) { s[6] = m_stride.S6 ; }
+      if ( 7 < dimension_type::rank ) { s[7] = m_stride.S7 ; }
+      s[dimension_type::rank] = span();
+    }
+
+  //----------------------------------------
+
+  ViewOffset() = default ;
+  ViewOffset( const ViewOffset & ) = default ;
+  ViewOffset & operator = ( const ViewOffset & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( std::integral_constant<unsigned,0> const &
+                      , Kokkos::LayoutStride const & rhs )
+    : m_dim( rhs.dimension[0] , rhs.dimension[1] , rhs.dimension[2] , rhs.dimension[3]
+           , rhs.dimension[4] , rhs.dimension[5] , rhs.dimension[6] , rhs.dimension[7] )
+    , m_stride( rhs.stride[0] , rhs.stride[1] , rhs.stride[2] , rhs.stride[3]
+              , rhs.stride[4] , rhs.stride[5] , rhs.stride[6] , rhs.stride[7] )
+    {}
+
+  template< class DimRHS , class LayoutRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , LayoutRHS , void > & rhs )
+    : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3
+           , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 )
+    , m_stride( rhs.stride_0() , rhs.stride_1() , rhs.stride_2() , rhs.stride_3()
+              , rhs.stride_4() , rhs.stride_5() , rhs.stride_6() , rhs.stride_7() )
+    {
+      static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" );
+      // Also requires equal static dimensions ...
+    }
+
+  //----------------------------------------
+  // Subview construction
+
+private:
+
+  template< class DimRHS , class LayoutRHS >
+  KOKKOS_INLINE_FUNCTION static
+  constexpr size_t stride
+    ( unsigned r , const ViewOffset< DimRHS , LayoutRHS , void > & rhs )
+    {
+      return r >  7 ? 0 : (
+             r == 0 ? rhs.stride_0() : (
+             r == 1 ? rhs.stride_1() : (
+             r == 2 ? rhs.stride_2() : (
+             r == 3 ? rhs.stride_3() : (
+             r == 4 ? rhs.stride_4() : (
+             r == 5 ? rhs.stride_5() : (
+             r == 6 ? rhs.stride_6() : rhs.stride_7() )))))));
+    }
+
+public:
+
+  template< class DimRHS , class LayoutRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( const ViewOffset< DimRHS , LayoutRHS , void > & rhs
+    , const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub
+    )
+    // range_extent(r) returns 0 when dimension_type::rank <= r
+    : m_dim( sub.range_extent(0)
+           , sub.range_extent(1)
+           , sub.range_extent(2)
+           , sub.range_extent(3)
+           , sub.range_extent(4)
+           , sub.range_extent(5)
+           , sub.range_extent(6)
+           , sub.range_extent(7)
+           )
+    // range_index(r) returns ~0u when dimension_type::rank <= r
+    , m_stride( stride( sub.range_index(0), rhs )
+              , stride( sub.range_index(1), rhs )
+              , stride( sub.range_index(2), rhs )
+              , stride( sub.range_index(3), rhs )
+              , stride( sub.range_index(4), rhs )
+              , stride( sub.range_index(5), rhs )
+              , stride( sub.range_index(6), rhs )
+              , stride( sub.range_index(7), rhs )
+              )
+    {}
+};
+
+}} // namespace Kokkos::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  ViewDataHandle provides the type of the 'data handle' which the view
+ *          uses to access data with the [] operator. It also provides
+ *          an allocate function and a function to extract a raw pointer from the
+ *          data handle. ViewDataHandle also defines an enum ReferenceAble, which
+ *          specifies whether references/pointers to elements can be taken, and a
+ *          'return_type', which is what the view operators give back.
+ *          Specialization of this struct allows three things depending
+ *          on ViewTraits and compiler options:
+ *          (i)   Use a special allocator (e.g. huge pages/small pages and pinned memory)
+ *          (ii)  Use a special data handle type (e.g. add a Cuda Texture Object)
+ *          (iii) Use special access intrinsics (e.g. texture fetch and non-caching loads)
+ */
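+
+// Hypothetical usage sketch: the memory traits of a View pick one of the
+// ViewDataHandle specializations below, e.g.
+//   Kokkos::View<double*> plain( "p", n );   // primary template: plain pointer handle
+//   Kokkos::View<double*, Kokkos::MemoryTraits<Kokkos::Atomic> > at( "a", n );
+//   at(i) += 1.0;   // return_type is AtomicDataElement, so the update is atomic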
+template< class Traits , class Enable = void >
+struct ViewDataHandle {
+
+  typedef typename Traits::value_type   value_type  ;
+  typedef typename Traits::value_type * handle_type ;
+  typedef typename Traits::value_type & return_type ;
+  typedef Kokkos::Impl::SharedAllocationTracker  track_type  ;
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( value_type * arg_data_ptr
+                           , track_type const & /*arg_tracker*/ )
+  {
+    return handle_type( arg_data_ptr );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( handle_type const arg_data_ptr
+                           , size_t offset )
+  {
+    return handle_type( arg_data_ptr + offset );
+  }
+};
+
+template< class Traits >
+struct ViewDataHandle< Traits ,
+  typename std::enable_if<( std::is_same< typename Traits::non_const_value_type
+                                        , typename Traits::value_type >::value
+                            &&
+                            std::is_same< typename Traits::specialize , void >::value
+                            &&
+                            Traits::memory_traits::Atomic
+                          )>::type >
+{
+  typedef typename Traits::value_type  value_type ;
+  typedef typename Kokkos::Impl::AtomicViewDataHandle< Traits >  handle_type ;
+  typedef typename Kokkos::Impl::AtomicDataElement< Traits >     return_type ;
+  typedef Kokkos::Impl::SharedAllocationTracker                  track_type  ;
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( value_type * arg_data_ptr
+                           , track_type const & /*arg_tracker*/ )
+  {
+    return handle_type( arg_data_ptr );
+  }
+
+  template<class SrcHandleType>
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( const SrcHandleType& arg_handle
+                           , size_t offset )
+  {
+    return handle_type( arg_handle.ptr + offset );
+  }
+};
+
+template< class Traits >
+struct ViewDataHandle< Traits ,
+  typename std::enable_if<(
+                            std::is_same< typename Traits::specialize , void >::value
+                            &&
+                            (!Traits::memory_traits::Aligned)
+                            &&
+                            Traits::memory_traits::Restrict
+#ifdef KOKKOS_ENABLE_CUDA
+                            &&
+                            (!( std::is_same< typename Traits::memory_space,Kokkos::CudaSpace>::value ||
+                                std::is_same< typename Traits::memory_space,Kokkos::CudaUVMSpace>::value ))
+#endif
+                            &&
+                            (!Traits::memory_traits::Atomic)
+                          )>::type >
+{
+  typedef typename Traits::value_type  value_type ;
+  typedef typename Traits::value_type * KOKKOS_RESTRICT handle_type ;
+  typedef typename Traits::value_type & KOKKOS_RESTRICT return_type ;
+  typedef Kokkos::Impl::SharedAllocationTracker  track_type  ;
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( value_type * arg_data_ptr
+                           , track_type const & /*arg_tracker*/ )
+  {
+    return handle_type( arg_data_ptr );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( handle_type const arg_data_ptr
+                           , size_t offset )
+  {
+    return handle_type( arg_data_ptr + offset );
+  }
+};
+
+template< class Traits >
+struct ViewDataHandle< Traits ,
+  typename std::enable_if<(
+                            std::is_same< typename Traits::specialize , void >::value
+                            &&
+                            Traits::memory_traits::Aligned
+                            &&
+                            (!Traits::memory_traits::Restrict)
+#ifdef KOKKOS_ENABLE_CUDA
+                            &&
+                            (!( std::is_same< typename Traits::memory_space,Kokkos::CudaSpace>::value ||
+                                std::is_same< typename Traits::memory_space,Kokkos::CudaUVMSpace>::value ))
+#endif
+                            &&
+                            (!Traits::memory_traits::Atomic)
+                          )>::type >
+{
+  typedef typename Traits::value_type  value_type ;
+  typedef typename Traits::value_type * KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT) handle_type ;
+  typedef typename Traits::value_type & return_type ;
+  typedef Kokkos::Impl::SharedAllocationTracker  track_type  ;
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( value_type * arg_data_ptr
+                           , track_type const & /*arg_tracker*/ )
+  {
+    if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % Impl::MEMORY_ALIGNMENT ) {
+      Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
+    }
+    return handle_type( arg_data_ptr );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( handle_type const arg_data_ptr
+                           , size_t offset )
+  {
+    if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % Impl::MEMORY_ALIGNMENT ) {
+      Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
+    }
+    return handle_type( arg_data_ptr + offset );
+  }
+};
+
+template< class Traits >
+struct ViewDataHandle< Traits ,
+  typename std::enable_if<(
+                            std::is_same< typename Traits::specialize , void >::value
+                            &&
+                            Traits::memory_traits::Aligned
+                            &&
+                            Traits::memory_traits::Restrict
+#ifdef KOKKOS_ENABLE_CUDA
+                            &&
+                            (!( std::is_same< typename Traits::memory_space,Kokkos::CudaSpace>::value ||
+                                std::is_same< typename Traits::memory_space,Kokkos::CudaUVMSpace>::value ))
+#endif
+                            &&
+                            (!Traits::memory_traits::Atomic)
+                          )>::type >
+{
+  typedef typename Traits::value_type  value_type ;
+  typedef typename Traits::value_type * KOKKOS_RESTRICT KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT) handle_type ;
+  typedef typename Traits::value_type & return_type ;
+  typedef Kokkos::Impl::SharedAllocationTracker  track_type  ;
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( value_type * arg_data_ptr
+                           , track_type const & /*arg_tracker*/ )
+  {
+    if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % Impl::MEMORY_ALIGNMENT ) {
+      Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
+    }
+    return handle_type( arg_data_ptr );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( handle_type const arg_data_ptr
+                           , size_t offset )
+  {
+    if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % Impl::MEMORY_ALIGNMENT ) {
+      Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
+    }
+    return handle_type( arg_data_ptr + offset );
+  }
+};
+}} // namespace Kokkos::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+/*
+ *  The construction, assignment to default, and destruction
+ *  are merged into a single functor.
+ *  Primarily to work around an unresolved CUDA back-end bug
+ *  that would lose the destruction cuda device function when
+ *  called from the shared memory tracking destruction.
+ *  Secondarily to have two fewer partial specializations.
+ */
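+
+// Roughly, for a freshly allocated view the non-scalar functor below amounts to
+//   parallel_for( RangePolicy<ExecSpace>(0,n), [=](size_t i){ new (ptr+i) ValueType(); } );
+// while the destruction pass replaces the placement-new with (ptr+i)->~ValueType()
+// (sketch only -- the real code dispatches through Kokkos::Impl::ParallelFor
+// and guards the profiling hooks).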
+template< class ExecSpace
+        , class ValueType
+        , bool IsScalar = std::is_scalar< ValueType >::value
+        >
+struct ViewValueFunctor ;
+
+template< class ExecSpace , class ValueType >
+struct ViewValueFunctor< ExecSpace , ValueType , false /* is_scalar */ >
+{
+  typedef Kokkos::RangePolicy< ExecSpace > PolicyType ;
+  typedef typename ExecSpace::execution_space Exec;
+
+  Exec        space ;
+  ValueType * ptr ;
+  size_t      n ;
+  bool        destroy ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i ) const
+    {
+      if ( destroy ) { (ptr+i)->~ValueType(); } // KOKKOS_IMPL_CUDA_CLANG_WORKAROUND: this line causes a ptxas error (__cxa_begin_catch) in the nested_view unit test
+      else           { new (ptr+i) ValueType(); }
+    }
+
+  ViewValueFunctor() = default ;
+  ViewValueFunctor( const ViewValueFunctor & ) = default ;
+  ViewValueFunctor & operator = ( const ViewValueFunctor & ) = default ;
+
+  ViewValueFunctor( ExecSpace   const & arg_space
+                  , ValueType * const arg_ptr
+                  , size_t      const arg_n )
+    : space( arg_space )
+    , ptr( arg_ptr )
+    , n( arg_n )
+    , destroy( false )
+    {}
+
+  void execute( bool arg )
+    {
+      destroy = arg ;
+      if ( ! space.in_parallel() ) {
+#if defined(KOKKOS_ENABLE_PROFILING)
+        uint64_t kpID = 0;
+        if(Kokkos::Profiling::profileLibraryLoaded()) {
+          Kokkos::Profiling::beginParallelFor("Kokkos::View::initialization", 0, &kpID);
+        }
+#endif
+        const Kokkos::Impl::ParallelFor< ViewValueFunctor , PolicyType >
+          closure( *this , PolicyType( 0 , n ) );
+        closure.execute();
+        space.fence();
+#if defined(KOKKOS_ENABLE_PROFILING)
+        if(Kokkos::Profiling::profileLibraryLoaded()) {
+          Kokkos::Profiling::endParallelFor(kpID);
+        }
+#endif
+      }
+      else {
+        for ( size_t i = 0 ; i < n ; ++i ) operator()(i);
+      }
+    }
+
+  void construct_shared_allocation()
+    { execute( false ); }
+
+  void destroy_shared_allocation()
+    { execute( true ); }
+};
+
+
+template< class ExecSpace , class ValueType >
+struct ViewValueFunctor< ExecSpace , ValueType , true /* is_scalar */ >
+{
+  typedef Kokkos::RangePolicy< ExecSpace > PolicyType ;
+
+  ExecSpace   space ;
+  ValueType * ptr ;
+  size_t      n ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i ) const
+    { ptr[i] = ValueType(); }
+
+  ViewValueFunctor() = default ;
+  ViewValueFunctor( const ViewValueFunctor & ) = default ;
+  ViewValueFunctor & operator = ( const ViewValueFunctor & ) = default ;
+
+  ViewValueFunctor( ExecSpace   const & arg_space
+                  , ValueType * const arg_ptr
+                  , size_t      const arg_n )
+    : space( arg_space )
+    , ptr( arg_ptr )
+    , n( arg_n )
+    {}
+
+  void construct_shared_allocation()
+    {
+      if ( ! space.in_parallel() ) {
+#if defined(KOKKOS_ENABLE_PROFILING)
+        uint64_t kpID = 0;
+        if(Kokkos::Profiling::profileLibraryLoaded()) {
+          Kokkos::Profiling::beginParallelFor("Kokkos::View::initialization", 0, &kpID);
+        }
+#endif
+        const Kokkos::Impl::ParallelFor< ViewValueFunctor , PolicyType >
+          closure( *this , PolicyType( 0 , n ) );
+        closure.execute();
+        space.fence();
+#if defined(KOKKOS_ENABLE_PROFILING)
+        if(Kokkos::Profiling::profileLibraryLoaded()) {
+          Kokkos::Profiling::endParallelFor(kpID);
+        }
+#endif
+      }
+      else {
+        for ( size_t i = 0 ; i < n ; ++i ) operator()(i);
+      }
+    }
+
+  void destroy_shared_allocation() {}
+};
+
+//----------------------------------------------------------------------------
+/** \brief  View mapping for non-specialized data type and standard layout */
+template< class Traits >
+class ViewMapping< Traits ,
+  typename std::enable_if<(
+    std::is_same< typename Traits::specialize , void >::value
+    &&
+    ViewOffset< typename Traits::dimension
+              , typename Traits::array_layout
+              , void >::is_mapping_plugin::value
+  )>::type >
+{
+private:
+
+  template< class , class ... > friend class ViewMapping ;
+  template< class , class ... > friend class Kokkos::View ;
+
+  typedef ViewOffset< typename Traits::dimension
+                    , typename Traits::array_layout
+                    , void
+                    >  offset_type ;
+
+  typedef typename ViewDataHandle< Traits >::handle_type  handle_type ;
+
+  handle_type  m_handle ;
+  offset_type  m_offset ;
+
+  KOKKOS_INLINE_FUNCTION
+  ViewMapping( const handle_type & arg_handle , const offset_type & arg_offset )
+    : m_handle( arg_handle )
+    , m_offset( arg_offset )
+    {}
+
+public:
+
+  typedef void printable_label_typedef;
+  enum { is_managed = Traits::is_managed };
+
+  //----------------------------------------
+  // Domain dimensions
+
+  enum { Rank = Traits::dimension::rank };
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr size_t extent( const iType & r ) const
+    { return m_offset.m_dim.extent(r); }
+
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename Traits::array_layout layout() const
+    { return m_offset.layout(); }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_offset.dimension_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_offset.dimension_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_offset.dimension_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_offset.dimension_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_offset.dimension_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_offset.dimension_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_offset.dimension_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_offset.dimension_7(); }
+
+  // Is a regular layout with uniform striding for each index.
+  using is_regular = typename offset_type::is_regular ;
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_offset.stride_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_offset.stride_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_offset.stride_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_offset.stride_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_offset.stride_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_offset.stride_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_offset.stride_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_offset.stride_7(); }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION void stride( iType * const s ) const { m_offset.stride(s); }
+
+  //----------------------------------------
+  // Range span
+
+  /** \brief  Span of the mapped range */
+  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_offset.span(); }
+
+  /** \brief  Is the mapped range span contiguous */
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return m_offset.span_is_contiguous(); }
+
+  typedef typename ViewDataHandle< Traits >::return_type  reference_type ;
+  typedef typename Traits::value_type *                   pointer_type ;
+
+  /** \brief  Query raw pointer to memory */
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const
+    {
+      return m_handle;
+    }
+
+  //----------------------------------------
+  // The View class performs all rank and bounds checking before
+  // calling these element reference methods.
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference() const { return m_handle[0]; }
+
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename
+    std::enable_if< std::is_integral<I0>::value &&
+                    ! std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value
+                  , reference_type >::type
+  reference( const I0 & i0 ) const { return m_handle[i0]; }
+
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename
+    std::enable_if< std::is_integral<I0>::value &&
+                    std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value
+                  , reference_type >::type
+  reference( const I0 & i0 ) const { return m_handle[ m_offset(i0) ]; }
+
+  template< typename I0 , typename I1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 ) const
+    { return m_handle[ m_offset(i0,i1) ]; }
+
+  template< typename I0 , typename I1 , typename I2 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 ) const
+    { return m_handle[ m_offset(i0,i1,i2) ]; }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 ) const
+    { return m_handle[ m_offset(i0,i1,i2,i3) ]; }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 ) const
+    { return m_handle[ m_offset(i0,i1,i2,i3,i4) ]; }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 , const I5 & i5 ) const
+    { return m_handle[ m_offset(i0,i1,i2,i3,i4,i5) ]; }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 , const I5 & i5 , const I6 & i6 ) const
+    { return m_handle[ m_offset(i0,i1,i2,i3,i4,i5,i6) ]; }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 , typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7 ) const
+    { return m_handle[ m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ]; }
+
+  //----------------------------------------
+
+private:
+
+  enum { MemorySpanMask = 8 - 1 /* Force alignment on 8 byte boundary */ };
+  enum { MemorySpanSize = sizeof(typename Traits::value_type) };
+
+public:
+
+  /** \brief  Span, in bytes, of the referenced memory */
+  KOKKOS_INLINE_FUNCTION constexpr size_t memory_span() const
+    {
+      return ( m_offset.span() * sizeof(typename Traits::value_type) + MemorySpanMask ) & ~size_t(MemorySpanMask);
+    }
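+
+  //  Illustrative arithmetic (not part of the interface): assuming a 4-byte
+  //  value_type and m_offset.span() == 3, the raw byte count 12 is rounded up
+  //  to the next 8-byte boundary, ( 12 + 7 ) & ~7 == 16; a span of 10 such
+  //  elements (40 bytes) is already aligned and stays 40.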
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION ~ViewMapping() {}
+  KOKKOS_INLINE_FUNCTION ViewMapping() : m_handle(), m_offset() {}
+  KOKKOS_INLINE_FUNCTION ViewMapping( const ViewMapping & rhs )
+    : m_handle( rhs.m_handle ), m_offset( rhs.m_offset ) {}
+  KOKKOS_INLINE_FUNCTION ViewMapping & operator = ( const ViewMapping & rhs )
+    { m_handle = rhs.m_handle ; m_offset = rhs.m_offset ; return *this ; }
+
+  KOKKOS_INLINE_FUNCTION ViewMapping( ViewMapping && rhs )
+    : m_handle( rhs.m_handle ), m_offset( rhs.m_offset ) {}
+  KOKKOS_INLINE_FUNCTION ViewMapping & operator = ( ViewMapping && rhs )
+    { m_handle = rhs.m_handle ; m_offset = rhs.m_offset ; return *this ; }
+
+  //----------------------------------------
+
+  /**\brief  Span, in bytes, of the required memory */
+  KOKKOS_INLINE_FUNCTION
+  static constexpr size_t memory_span( typename Traits::array_layout const & arg_layout )
+    {
+      typedef std::integral_constant< unsigned , 0 >  padding ;
+      return ( offset_type( padding(), arg_layout ).span() * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
+    }
+
+  /**\brief  Wrap a span of memory */
+  template< class ... P >
+  KOKKOS_INLINE_FUNCTION
+  ViewMapping( Kokkos::Impl::ViewCtorProp< P ... > const & arg_prop
+             , typename Traits::array_layout const & arg_layout
+             )
+    : m_handle( ( (Kokkos::Impl::ViewCtorProp<void,pointer_type> const &) arg_prop ).value )
+    , m_offset( std::integral_constant< unsigned , 0 >() , arg_layout )
+    {}
+
+  /**\brief  Assign data */
+  KOKKOS_INLINE_FUNCTION
+  void assign_data( pointer_type arg_ptr )
+    { m_handle = handle_type( arg_ptr ); }
+
+  //----------------------------------------
+  /*  Allocate and construct mapped array.
+   *  Allocate via shared allocation record and
+   *  return that record for allocation tracking.
+   */
+  template< class ... P >
+  Kokkos::Impl::SharedAllocationRecord<> *
+  allocate_shared( Kokkos::Impl::ViewCtorProp< P... > const & arg_prop
+                 , typename Traits::array_layout const & arg_layout )
+  {
+    typedef Kokkos::Impl::ViewCtorProp< P... > alloc_prop ;
+
+    typedef typename alloc_prop::execution_space  execution_space ;
+    typedef typename Traits::memory_space         memory_space ;
+    typedef typename Traits::value_type           value_type ;
+    typedef ViewValueFunctor< execution_space , value_type > functor_type ;
+    typedef Kokkos::Impl::SharedAllocationRecord< memory_space , functor_type > record_type ;
+
+    // Query the mapping for byte-size of allocation.
+    // If padding is allowed then pass in sizeof value type
+    // for padding computation.
+    typedef std::integral_constant
+      < unsigned
+      , alloc_prop::allow_padding ? sizeof(value_type) : 0
+      > padding ;
+
+    m_offset = offset_type( padding(), arg_layout );
+
+    const size_t alloc_size =
+      ( m_offset.span() * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
+
+    // Create the shared memory tracking record and allocate memory from the memory space
+    record_type * const record =
+      record_type::allocate( ( (Kokkos::Impl::ViewCtorProp<void,memory_space> const &) arg_prop ).value
+                           , ( (Kokkos::Impl::ViewCtorProp<void,std::string>  const &) arg_prop ).value
+                           , alloc_size );
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+    if ( alloc_size ) {
+#endif
+    m_handle = handle_type( reinterpret_cast< pointer_type >( record->data() ) );
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+    }
+#endif
+
+    //  Only initialize if the allocation is non-zero.
+    //  May be zero if one of the dimensions is zero.
+    if ( alloc_size && alloc_prop::initialize ) {
+      // Assume destruction is only required when construction is requested.
+      // The ViewValueFunctor has both value construction and destruction operators.
+      record->m_destroy = functor_type( ( (Kokkos::Impl::ViewCtorProp<void,execution_space> const &) arg_prop).value
+                                      , (value_type *) m_handle
+                                      , m_offset.span()
+                                      );
+
+      // Construct values
+      record->m_destroy.construct_shared_allocation();
+    }
+
+    return record ;
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/** \brief  Assign compatible default mappings */
+
+template< class DstTraits , class SrcTraits >
+class ViewMapping< DstTraits , SrcTraits ,
+  typename std::enable_if<(
+    /* default mappings */
+    std::is_same< typename DstTraits::specialize , void >::value
+    &&
+    std::is_same< typename SrcTraits::specialize , void >::value
+    &&
+    (
+      /* same layout */
+      std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value
+      ||
+      /* known layout */
+      (
+        (
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
+        )
+        &&
+        (
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
+        )
+      )
+    )
+  )>::type >
+{
+private:
+
+  enum { is_assignable_space =
+#if 1
+   Kokkos::Impl::MemorySpaceAccess
+     < typename DstTraits::memory_space
+     , typename SrcTraits::memory_space >::assignable };
+#else
+   std::is_same< typename DstTraits::memory_space
+               , typename SrcTraits::memory_space >::value };
+#endif
+
+  enum { is_assignable_value_type =
+    std::is_same< typename DstTraits::value_type
+                , typename SrcTraits::value_type >::value ||
+    std::is_same< typename DstTraits::value_type
+                , typename SrcTraits::const_value_type >::value };
+
+  enum { is_assignable_dimension =
+    ViewDimensionAssignable< typename DstTraits::dimension
+                           , typename SrcTraits::dimension >::value };
+
+  enum { is_assignable_layout =
+    std::is_same< typename DstTraits::array_layout
+                , typename SrcTraits::array_layout >::value ||
+    std::is_same< typename DstTraits::array_layout
+                , Kokkos::LayoutStride >::value ||
+    ( DstTraits::dimension::rank == 0 ) ||
+    ( DstTraits::dimension::rank == 1 &&
+      DstTraits::dimension::rank_dynamic == 1 )
+    };
+
+public:
+
+  enum { is_assignable = is_assignable_space &&
+                         is_assignable_value_type &&
+                         is_assignable_dimension &&
+                         is_assignable_layout };
+
+  typedef Kokkos::Impl::SharedAllocationTracker  TrackType ;
+  typedef ViewMapping< DstTraits , void >  DstType ;
+  typedef ViewMapping< SrcTraits , void >  SrcType ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void assign( DstType & dst , const SrcType & src , const TrackType & src_track )
+    {
+      static_assert( is_assignable_space
+                   , "View assignment must have compatible spaces" );
+
+      static_assert( is_assignable_value_type
+                   , "View assignment must have same value type or const = non-const" );
+
+      static_assert( is_assignable_dimension
+                   , "View assignment must have compatible dimensions" );
+
+      static_assert( is_assignable_layout
+                   , "View assignment must have compatible layout or have rank <= 1" );
+
+      typedef typename DstType::offset_type  dst_offset_type ;
+
+      if ( size_t(DstTraits::dimension::rank_dynamic) < size_t(SrcTraits::dimension::rank_dynamic) ) {
+        typedef typename DstTraits::dimension dst_dim;
+        bool assignable =
+          ( ( 1 > DstTraits::dimension::rank_dynamic && 1 <= SrcTraits::dimension::rank_dynamic ) ?
+            dst_dim::ArgN0 == src.dimension_0() : true ) &&
+          ( ( 2 > DstTraits::dimension::rank_dynamic && 2 <= SrcTraits::dimension::rank_dynamic ) ?
+            dst_dim::ArgN1 == src.dimension_1() : true ) &&
+          ( ( 3 > DstTraits::dimension::rank_dynamic && 3 <= SrcTraits::dimension::rank_dynamic ) ?
+            dst_dim::ArgN2 == src.dimension_2() : true ) &&
+          ( ( 4 > DstTraits::dimension::rank_dynamic && 4 <= SrcTraits::dimension::rank_dynamic ) ?
+            dst_dim::ArgN3 == src.dimension_3() : true ) &&
+          ( ( 5 > DstTraits::dimension::rank_dynamic && 5 <= SrcTraits::dimension::rank_dynamic ) ?
+            dst_dim::ArgN4 == src.dimension_4() : true ) &&
+          ( ( 6 > DstTraits::dimension::rank_dynamic && 6 <= SrcTraits::dimension::rank_dynamic ) ?
+            dst_dim::ArgN5 == src.dimension_5() : true ) &&
+          ( ( 7 > DstTraits::dimension::rank_dynamic && 7 <= SrcTraits::dimension::rank_dynamic ) ?
+            dst_dim::ArgN6 == src.dimension_6() : true ) &&
+          ( ( 8 > DstTraits::dimension::rank_dynamic && 8 <= SrcTraits::dimension::rank_dynamic ) ?
+            dst_dim::ArgN7 == src.dimension_7() : true )
+          ;
+        if(!assignable)
+          Kokkos::abort("View Assignment: trying to assign runtime dimension to non matching compile time dimension.");
+      }
+      dst.m_offset = dst_offset_type( src.m_offset );
+      dst.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track );
+    }
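+
+  //  For example (illustrative only): a src View<int*> may be assigned to a
+  //  dst View<int[10]>; the runtime check above aborts unless the src extent
+  //  is exactly 10.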
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Subview mapping.
+// Deduce destination view type from source view traits and subview arguments
+
+template< class SrcTraits , class ... Args >
+struct ViewMapping
+  < typename std::enable_if<(
+      std::is_same< typename SrcTraits::specialize , void >::value
+      &&
+      (
+        std::is_same< typename SrcTraits::array_layout
+                    , Kokkos::LayoutLeft >::value ||
+        std::is_same< typename SrcTraits::array_layout
+                    , Kokkos::LayoutRight >::value ||
+        std::is_same< typename SrcTraits::array_layout
+                    , Kokkos::LayoutStride >::value
+      )
+    )>::type
+  , SrcTraits
+  , Args ... >
+{
+private:
+
+  static_assert( SrcTraits::rank == sizeof...(Args) ,
+    "Subview mapping requires one argument for each dimension of source View" );
+
+  enum
+    { RZ = false
+    , R0 = bool(is_integral_extent<0,Args...>::value)
+    , R1 = bool(is_integral_extent<1,Args...>::value)
+    , R2 = bool(is_integral_extent<2,Args...>::value)
+    , R3 = bool(is_integral_extent<3,Args...>::value)
+    , R4 = bool(is_integral_extent<4,Args...>::value)
+    , R5 = bool(is_integral_extent<5,Args...>::value)
+    , R6 = bool(is_integral_extent<6,Args...>::value)
+    , R7 = bool(is_integral_extent<7,Args...>::value)
+    };
+
+  enum { rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3)
+              + unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) };
+
+  // Whether right-most rank is a range.
+  enum { R0_rev = ( 0 == SrcTraits::rank ? RZ : (
+                    1 == SrcTraits::rank ? R0 : (
+                    2 == SrcTraits::rank ? R1 : (
+                    3 == SrcTraits::rank ? R2 : (
+                    4 == SrcTraits::rank ? R3 : (
+                    5 == SrcTraits::rank ? R4 : (
+                    6 == SrcTraits::rank ? R5 : (
+                    7 == SrcTraits::rank ? R6 : R7 )))))))) };
+
+  // Subview's layout
+  typedef typename std::conditional<
+      ( /* Same array layout IF */
+        ( rank == 0 ) /* output rank zero */
+        ||
+        SubviewLegalArgsCompileTime<typename SrcTraits::array_layout, typename SrcTraits::array_layout,
+                                    rank, SrcTraits::rank, 0, Args...>::value
+        ||
+        // OutputRank 1 or 2, InputLayout Left, Interval in index 0:
+        // either the single index has stride one or the second index has a stride.
+        ( rank <= 2 && R0 && std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ) //replace with input rank
+        ||
+        // OutputRank 1 or 2, InputLayout Right, Interval in index [InputRank-1]:
+        // either the single index has stride one or the second index has a stride.
+        ( rank <= 2 && R0_rev && std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ) //replace input rank
+      ), typename SrcTraits::array_layout , Kokkos::LayoutStride
+      >::type array_layout ;
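+
+  //  Illustrative deductions (an assumed rank-3 LayoutLeft source; not exhaustive):
+  //    subview( src , Kokkos::ALL , Kokkos::ALL , 5 ) -> rank 2, layout stays LayoutLeft
+  //    subview( src , 5 , Kokkos::ALL , Kokkos::ALL ) -> rank 2, layout becomes LayoutStride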
+
+  typedef typename SrcTraits::value_type  value_type ;
+
+  typedef typename std::conditional< rank == 0 , value_type ,
+          typename std::conditional< rank == 1 , value_type * ,
+          typename std::conditional< rank == 2 , value_type ** ,
+          typename std::conditional< rank == 3 , value_type *** ,
+          typename std::conditional< rank == 4 , value_type **** ,
+          typename std::conditional< rank == 5 , value_type ***** ,
+          typename std::conditional< rank == 6 , value_type ****** ,
+          typename std::conditional< rank == 7 , value_type ******* ,
+                                                 value_type ********
+          >::type >::type >::type >::type >::type >::type >::type >::type
+     data_type ;
+
+public:
+
+  typedef Kokkos::ViewTraits
+    < data_type
+    , array_layout
+    , typename SrcTraits::device_type
+    , typename SrcTraits::memory_traits > traits_type ;
+
+  typedef Kokkos::View
+    < data_type
+    , array_layout
+    , typename SrcTraits::device_type
+    , typename SrcTraits::memory_traits > type ;
+
+  template< class MemoryTraits >
+  struct apply {
+
+    static_assert( Kokkos::Impl::is_memory_traits< MemoryTraits >::value , "" );
+
+    typedef Kokkos::ViewTraits
+      < data_type
+      , array_layout
+      , typename SrcTraits::device_type
+      , MemoryTraits > traits_type ;
+
+    typedef Kokkos::View
+      < data_type
+      , array_layout
+      , typename SrcTraits::device_type
+      , MemoryTraits > type ;
+  };
+
+  // The presumed type is 'ViewMapping< traits_type , void >'
+  // However, a compatible ViewMapping is acceptable.
+  template< class DstTraits >
+  KOKKOS_INLINE_FUNCTION
+  static void assign( ViewMapping< DstTraits , void > & dst
+                    , ViewMapping< SrcTraits , void > const & src
+                    , Args ... args )
+    {
+      static_assert(
+        ViewMapping< DstTraits , traits_type , void >::is_assignable ,
+        "Subview destination type must be compatible with subview derived type" );
+
+      typedef ViewMapping< DstTraits , void >  DstType ;
+
+      typedef typename DstType::offset_type  dst_offset_type ;
+
+      const SubviewExtents< SrcTraits::rank , rank >
+        extents( src.m_offset.m_dim , args... );
+
+      dst.m_offset = dst_offset_type( src.m_offset , extents );
+
+      dst.m_handle = ViewDataHandle< DstTraits >::assign(src.m_handle,
+          src.m_offset( extents.domain_offset(0)
+                      , extents.domain_offset(1)
+                      , extents.domain_offset(2)
+                      , extents.domain_offset(3)
+                      , extents.domain_offset(4)
+                      , extents.domain_offset(5)
+                      , extents.domain_offset(6)
+                      , extents.domain_offset(7)
+          ));
+    }
+};
+
+
+
+//----------------------------------------------------------------------------
+
+}} // namespace Kokkos::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< unsigned , class MapType >
+KOKKOS_INLINE_FUNCTION
+bool view_verify_operator_bounds( const MapType & )
+{ return true ; }
+
+template< unsigned R , class MapType , class iType , class ... Args >
+KOKKOS_INLINE_FUNCTION
+bool view_verify_operator_bounds
+  ( const MapType & map
+  , const iType   & i
+  , Args ... args
+  )
+{
+  return ( size_t(i) < map.extent(R) )
+         && view_verify_operator_bounds<R+1>( map , args ... );
+}
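+
+//  Illustrative expansion (assuming a rank-2 map):
+//    view_verify_operator_bounds<0>( map , i0 , i1 )
+//      == ( size_t(i0) < map.extent(0) ) &&
+//         ( size_t(i1) < map.extent(1) ) && true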
+
+template< unsigned , class MapType >
+inline
+void view_error_operator_bounds( char * , int , const MapType & )
+{}
+
+template< unsigned R , class MapType , class iType , class ... Args >
+inline
+void view_error_operator_bounds
+  ( char * buf
+  , int len
+  , const MapType & map
+  , const iType   & i
+  , Args ... args
+  )
+{
+  const int n =
+    snprintf(buf,len," %lu < %lu %c"
+            , static_cast<unsigned long>(i)
+            , static_cast<unsigned long>( map.extent(R) )
+            , ( sizeof...(Args) ? ',' : ')' )
+            );
+  view_error_operator_bounds<R+1>(buf+n,len-n,map,args...);
+}
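+
+//  The recursion above appends one " index < extent" clause per argument; an
+//  illustrative final message (with a hypothetical label "A") would be:
+//    View bounds error of view A ( 10 < 5 , 3 < 4 )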
+
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+/* Check #3: is the View managed as determined by the MemoryTraits? */
+template< class MapType,
+  bool is_managed = (MapType::is_managed != 0) >
+struct OperatorBoundsErrorOnDevice;
+
+template< class MapType >
+struct OperatorBoundsErrorOnDevice< MapType, false > {
+KOKKOS_INLINE_FUNCTION
+static void run(MapType const&) {
+  Kokkos::abort("View bounds error");
+}
+};
+
+template< class MapType >
+struct OperatorBoundsErrorOnDevice< MapType, true > {
+KOKKOS_INLINE_FUNCTION
+static void run(MapType const& map) {
+  SharedAllocationHeader const* const header =
+    SharedAllocationHeader::get_header((void*)(map.data()));
+  char const* const label = header->label();
+  enum { LEN = 128 };
+  char msg[LEN];
+  char const* const first_part = "View bounds error of view ";
+  char* p = msg;
+  char* const end = msg + LEN - 1;
+  for (char const* p2 = first_part; (*p2 != '\0') && (p < end); ++p, ++p2) {
+    *p = *p2;
+  }
+  for (char const* p2 = label; (*p2 != '\0') && (p < end); ++p, ++p2) {
+    *p = *p2;
+  }
+  *p = '\0';
+  Kokkos::abort(msg);
+}
+};
+
+/* Check #2: does the ViewMapping have the printable_label_typedef defined?
+   As noted above, only the non-specialized standard-layout ViewMapping
+   defines it by default.
+   The existence of this typedef implies the existence of MapType::is_managed. */
+template< class T, class Enable = void >
+struct has_printable_label_typedef : public std::false_type {};
+
+template<class T>
+struct has_printable_label_typedef<
+  T, typename enable_if_type<typename T::printable_label_typedef>::type>
+  : public std::true_type
+{};
+
+template< class MapType >
+KOKKOS_INLINE_FUNCTION
+void operator_bounds_error_on_device(
+    MapType const&,
+    std::false_type) {
+  Kokkos::abort("View bounds error");
+}
+
+template< class MapType >
+KOKKOS_INLINE_FUNCTION
+void operator_bounds_error_on_device(
+    MapType const& map,
+    std::true_type) {
+  OperatorBoundsErrorOnDevice< MapType >::run(map);
+}
+
+#endif // ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+template< class MemorySpace , class MapType , class ... Args >
+KOKKOS_INLINE_FUNCTION
+void view_verify_operator_bounds
+  ( Kokkos::Impl::SharedAllocationTracker const & tracker
+  , const MapType & map , Args ... args )
+{
+  if ( ! view_verify_operator_bounds<0>( map , args ... ) ) {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    enum { LEN = 1024 };
+    char buffer[ LEN ];
+    const std::string label = tracker.template get_label<MemorySpace>();
+    int n = snprintf(buffer,LEN,"View bounds error of view %s (",label.c_str());
+    view_error_operator_bounds<0>( buffer + n , LEN - n , map , args ... );
+    Kokkos::Impl::throw_runtime_exception(std::string(buffer));
+#else
+    /* Check #1: is there a SharedAllocationRecord?
+       (we won't use it, but if it's not there then there isn't
+        a corresponding SharedAllocationHeader containing a label).
+       This check should cover the case of Views that don't
+       have the Unmanaged trait but were initialized by pointer. */
+    if (tracker.has_record()) {
+      operator_bounds_error_on_device<MapType>(
+          map, has_printable_label_typedef<MapType>());
+    } else {
+      Kokkos::abort("View bounds error");
+    }
+#endif
+  }
+}
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP */
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewTile.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewTile.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..19f7127d57b27e037ea47f25b9f0f4ce1082a817
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewTile.hpp
@@ -0,0 +1,223 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_VIEWTILE_HPP
+#define KOKKOS_EXPERIMENTAL_VIEWTILE_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// View mapping for rank two tiled array
+
+template< class L >
+struct is_layout_tile : public std::false_type {};
+
+template< unsigned N0 , unsigned N1 >
+struct is_layout_tile< Kokkos::LayoutTileLeft<N0,N1,true> > : public std::true_type {};
+
+template< class Dimension , class Layout >
+struct ViewOffset< Dimension , Layout ,
+  typename std::enable_if<(
+    ( Dimension::rank == 2 )
+    &&
+    is_layout_tile< Layout >::value
+  )>::type >
+{
+public:
+
+  enum { SHIFT_0 = Kokkos::Impl::integral_power_of_two(Layout::N0) };
+  enum { SHIFT_1 = Kokkos::Impl::integral_power_of_two(Layout::N1) };
+  enum { SHIFT_T = SHIFT_0 + SHIFT_1 };
+  enum { MASK_0  = Layout::N0 - 1 };
+  enum { MASK_1  = Layout::N1 - 1 };
+
+  // Is an irregular layout that does not have uniform striding for each index.
+  using is_mapping_plugin = std::true_type ;
+  using is_regular        = std::false_type ;
+
+  typedef size_t     size_type ;
+  typedef Dimension  dimension_type ;
+  typedef Layout     array_layout ;
+
+  dimension_type m_dim ;
+  size_type      m_tile_N0 ;
+
+  //----------------------------------------
+
+  // Only instantiated for rank 2
+  template< typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1
+                      , int = 0 , int = 0
+                      , int = 0 , int = 0
+                      , int = 0 , int = 0
+                      ) const
+    {
+      return /* ( ( Tile offset                               ) * Tile size ) */
+                ( ( (i0>>SHIFT_0) + m_tile_N0 * (i1>>SHIFT_1) ) << SHIFT_T) +
+             /* ( Offset within tile                       ) */
+                ( (i0 & MASK_0) + ((i1 & MASK_1)<<SHIFT_0) ) ;
+    }
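+
+  //  Worked example (illustrative): with 2x2 tiles (SHIFT_0 = SHIFT_1 = 1,
+  //  SHIFT_T = 2, MASK_0 = MASK_1 = 1) on a 4x4 array, m_tile_N0 == 2 and
+  //  element (i0,i1) = (3,1) lies in tile (1,0):
+  //    tile offset  * tile size = ( 1 + 2*0 ) << 2      = 4
+  //    offset within tile       = ( 3 & 1 ) + ((1&1)<<1) = 3
+  //    linear index             = 4 + 3                  = 7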
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION constexpr
+  array_layout layout() const
+    { return array_layout( m_dim.N0 , m_dim.N1 ); }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return 1 ; }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_dim.N0 * m_dim.N1 ; }
+
+  // Strides are meaningless due to irregularity
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 0 ; }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type span() const
+    {
+      // ( TileDim0 * ( TileDim1 ) ) * TileSize
+      return ( m_tile_N0 * ( ( m_dim.N1 + MASK_1 ) >> SHIFT_1 ) ) << SHIFT_T ;
+    }
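+
+  //  Illustrative example: a 5x3 array with 2x2 tiles has m_tile_N0 == 3 and
+  //  ceil(3/2) == 2 tiles in the second dimension, so span() == 3*2*4 == 24
+  //  elements even though size() is only 15.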
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const
+    {
+      // Only if dimensions align with tile size
+      return ( m_dim.N0 & MASK_0 ) == 0 && ( m_dim.N1 & MASK_1 ) == 0 ;
+    }
+
+  //----------------------------------------
+
+  KOKKOS_FUNCTION_DEFAULTED ~ViewOffset() = default ;
+  KOKKOS_FUNCTION_DEFAULTED ViewOffset() = default ;
+  KOKKOS_FUNCTION_DEFAULTED ViewOffset( const ViewOffset & ) = default ;
+  KOKKOS_FUNCTION_DEFAULTED ViewOffset & operator = ( const ViewOffset & ) = default ;
+
+  template< unsigned TrivialScalarSize >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( std::integral_constant<unsigned,TrivialScalarSize> const & ,
+                        array_layout const arg_layout )
+    : m_dim( arg_layout.dimension[0], arg_layout.dimension[1], 0, 0, 0, 0, 0, 0 )
+    , m_tile_N0( ( arg_layout.dimension[0] + MASK_0 ) >> SHIFT_0 /* number of tiles in first dimension */ )
+    {}
+};
+
+template< typename T , unsigned N0 , unsigned N1 , class ... P
+        , typename iType0 , typename iType1
+        >
+struct ViewMapping
+  < void
+  , Kokkos::ViewTraits<T**,Kokkos::LayoutTileLeft<N0,N1,true>,P...>
+  , Kokkos::LayoutTileLeft<N0,N1,true>
+  , iType0
+  , iType1 >
+{
+  typedef Kokkos::LayoutTileLeft<N0,N1,true>  src_layout ;
+  typedef Kokkos::ViewTraits< T** , src_layout , P... > src_traits ;
+  typedef Kokkos::ViewTraits< T[N0][N1] , LayoutLeft , P ... > traits ;
+  typedef Kokkos::View< T[N0][N1] , LayoutLeft , P ... > type ;
+
+  KOKKOS_INLINE_FUNCTION static
+  void assign( ViewMapping< traits , void > & dst
+             , const ViewMapping< src_traits , void > & src
+             , const src_layout &
+             , const size_t i_tile0
+             , const size_t i_tile1
+             )
+    {
+      typedef ViewMapping< traits , void >        dst_map_type ;
+      typedef ViewMapping< src_traits , void >    src_map_type ;
+      typedef typename dst_map_type::handle_type  dst_handle_type ;
+      typedef typename dst_map_type::offset_type  dst_offset_type ;
+      typedef typename src_map_type::offset_type  src_offset_type ;
+
+      dst = dst_map_type(
+         dst_handle_type( src.m_handle +
+                        ( ( i_tile0 + src.m_offset.m_tile_N0 * i_tile1 ) << src_offset_type::SHIFT_T ) ) ,
+         dst_offset_type() );
+    }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+namespace Kokkos {
+
+template< typename T , unsigned N0 , unsigned N1 , class ... P >
+KOKKOS_INLINE_FUNCTION
+Kokkos::View< T[N0][N1] , LayoutLeft , P... >
+tile_subview( const Kokkos::View<T**,Kokkos::LayoutTileLeft<N0,N1,true>,P...> & src
+            , const size_t i_tile0
+            , const size_t i_tile1
+            )
+{
+  // Force the specialized ViewMapping for extracting a tile
+  // by using the first subview argument as the layout.
+  typedef Kokkos::LayoutTileLeft<N0,N1,true> SrcLayout ;
+
+  return Kokkos::View< T[N0][N1] , LayoutLeft , P... >
+    ( src , SrcLayout() , i_tile0 , i_tile1 );
+}
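+
+//  A minimal usage sketch (illustrative only; the view name 'a' is hypothetical):
+//
+//    Kokkos::View< double** , Kokkos::LayoutTileLeft<8,8> > a( "A" , 40 , 50 );
+//    auto tile = Kokkos::tile_subview( a , 1 , 2 );  // the 8x8 tile at tile coordinate (1,2)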
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_EXPERIMENTAL_VIEWTILE_HPP */
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp b/packages/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f36ffc8addefbd610a71be2ce86037a35270bc61
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp
@@ -0,0 +1,240 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_VOLATILE_LOAD_HPP )
+#define KOKKOS_VOLATILE_LOAD_HPP
+
+#if defined( __GNUC__ ) /* GNU C   */ || \
+    defined( __GNUG__ ) /* GNU C++ */ || \
+    defined( __clang__ )
+
+#define KOKKOS_IMPL_MAY_ALIAS __attribute__((__may_alias__))
+
+#else
+
+#define KOKKOS_IMPL_MAY_ALIAS
+
+#endif
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+T volatile_load(T const volatile * const src_ptr)
+{
+  typedef uint64_t KOKKOS_IMPL_MAY_ALIAS T64;
+  typedef uint32_t KOKKOS_IMPL_MAY_ALIAS T32;
+  typedef uint16_t KOKKOS_IMPL_MAY_ALIAS T16;
+  typedef uint8_t  KOKKOS_IMPL_MAY_ALIAS T8;
+
+  enum {
+    NUM_8  = sizeof(T),
+    NUM_16 = NUM_8 / 2,
+    NUM_32 = NUM_8 / 4,
+    NUM_64 = NUM_8 / 8
+  };
+
+  union {
+    T   const volatile * const ptr;
+    T64 const volatile * const ptr64;
+    T32 const volatile * const ptr32;
+    T16 const volatile * const ptr16;
+    T8  const volatile * const ptr8;
+  } src = {src_ptr};
+
+  T result;
+
+  union {
+    T   * const ptr;
+    T64 * const ptr64;
+    T32 * const ptr32;
+    T16 * const ptr16;
+    T8  * const ptr8;
+  } dst = {&result};
+
+  for (int i=0; i < NUM_64; ++i) {
+    dst.ptr64[i] = src.ptr64[i];
+  }
+
+  if ( NUM_64*2 < NUM_32 ) {
+    dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2];
+  }
+
+  if ( NUM_32*2 < NUM_16 ) {
+    dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2];
+  }
+
+  if ( NUM_16*2 < NUM_8 ) {
+    dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2];
+  }
+
+  return result;
+}
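+
+//  Illustrative chunking (assuming sizeof(T) == 14): the loop does one 64-bit
+//  copy for bytes 0-7, the 32-bit branch copies bytes 8-11 (index NUM_64*2 == 2),
+//  the 16-bit branch copies bytes 12-13, and the final 8-bit branch is skipped
+//  because NUM_16*2 == 14 == NUM_8.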
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * const dst_ptr, T const volatile * const src_ptr)
+{
+  typedef uint64_t KOKKOS_IMPL_MAY_ALIAS T64;
+  typedef uint32_t KOKKOS_IMPL_MAY_ALIAS T32;
+  typedef uint16_t KOKKOS_IMPL_MAY_ALIAS T16;
+  typedef uint8_t  KOKKOS_IMPL_MAY_ALIAS T8;
+
+  enum {
+    NUM_8  = sizeof(T),
+    NUM_16 = NUM_8 / 2,
+    NUM_32 = NUM_8 / 4,
+    NUM_64 = NUM_8 / 8
+  };
+
+  union {
+    T   const volatile * const ptr;
+    T64 const volatile * const ptr64;
+    T32 const volatile * const ptr32;
+    T16 const volatile * const ptr16;
+    T8  const volatile * const ptr8;
+  } src = {src_ptr};
+
+  union {
+    T   volatile * const ptr;
+    T64 volatile * const ptr64;
+    T32 volatile * const ptr32;
+    T16 volatile * const ptr16;
+    T8  volatile * const ptr8;
+  } dst = {dst_ptr};
+
+  for (int i=0; i < NUM_64; ++i) {
+    dst.ptr64[i] = src.ptr64[i];
+  }
+
+  if ( NUM_64*2 < NUM_32 ) {
+    dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2];
+  }
+
+  if ( NUM_32*2 < NUM_16 ) {
+    dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2];
+  }
+
+  if ( NUM_16*2 < NUM_8 ) {
+    dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2];
+  }
+}
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * const dst_ptr, T const * const src_ptr)
+{
+  typedef uint64_t KOKKOS_IMPL_MAY_ALIAS T64;
+  typedef uint32_t KOKKOS_IMPL_MAY_ALIAS T32;
+  typedef uint16_t KOKKOS_IMPL_MAY_ALIAS T16;
+  typedef uint8_t  KOKKOS_IMPL_MAY_ALIAS T8;
+
+  enum {
+    NUM_8  = sizeof(T),
+    NUM_16 = NUM_8 / 2,
+    NUM_32 = NUM_8 / 4,
+    NUM_64 = NUM_8 / 8
+  };
+
+  union {
+    T   const * const ptr;
+    T64 const * const ptr64;
+    T32 const * const ptr32;
+    T16 const * const ptr16;
+    T8  const * const ptr8;
+  } src = {src_ptr};
+
+  union {
+    T   volatile * const ptr;
+    T64 volatile * const ptr64;
+    T32 volatile * const ptr32;
+    T16 volatile * const ptr16;
+    T8  volatile * const ptr8;
+  } dst = {dst_ptr};
+
+  for (int i=0; i < NUM_64; ++i) {
+    dst.ptr64[i] = src.ptr64[i];
+  }
+
+  if ( NUM_64*2 < NUM_32 ) {
+    dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2];
+  }
+
+  if ( NUM_32*2 < NUM_16 ) {
+    dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2];
+  }
+
+  if ( NUM_16*2 < NUM_8 ) {
+    dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2];
+  }
+}
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * dst_ptr, T const volatile & src)
+{ volatile_store(dst_ptr, &src); }
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * dst_ptr, T const & src)
+{ volatile_store(dst_ptr, &src); }
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+T safe_load(T const * const ptr)
+{
+#if !defined( __MIC__ )
+  return *ptr;
+#else
+  return volatile_load(ptr);
+#endif
+}
+
+} // namespace Kokkos
+
+#undef KOKKOS_IMPL_MAY_ALIAS
+
+#endif
+
diff --git a/packages/kokkos/core/src/impl/Kokkos_hwloc.cpp b/packages/kokkos/core/src/impl/Kokkos_hwloc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7dc8a5356691c6609329882cfee55abaaebd9ae3
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_hwloc.cpp
@@ -0,0 +1,730 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#define DEBUG_PRINT 0
+
+#include <iostream>
+#include <sstream>
+#include <algorithm>
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Core.hpp>
+#include <Kokkos_hwloc.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace hwloc {
+
+/* Return 0 if thread spawning is asynchronous, 1 if synchronous (in which case threads_coord[0] is the process' core). */
+unsigned thread_mapping( const char * const label ,
+                         const bool allow_async ,
+                         unsigned & thread_count ,
+                         unsigned & use_numa_count ,
+                         unsigned & use_cores_per_numa ,
+                         std::pair<unsigned,unsigned> threads_coord[] )
+{
+  const bool     hwloc_avail            = Kokkos::hwloc::available();
+  const unsigned avail_numa_count       = hwloc_avail ? hwloc::get_available_numa_count() : 1 ;
+  const unsigned avail_cores_per_numa   = hwloc_avail ? hwloc::get_available_cores_per_numa() : thread_count ;
+  const unsigned avail_threads_per_core = hwloc_avail ? hwloc::get_available_threads_per_core() : 1 ;
+
+  // (numa,core) coordinate of the process:
+  const std::pair<unsigned,unsigned> proc_coord = Kokkos::hwloc::get_this_thread_coordinate();
+
+  //------------------------------------------------------------------------
+  // Defaults for unspecified inputs:
+
+  if ( ! use_numa_count ) {
+    // Default to use all NUMA regions
+    use_numa_count = ! thread_count ? avail_numa_count : (
+                       thread_count < avail_numa_count ? thread_count : avail_numa_count );
+  }
+
+  if ( ! use_cores_per_numa ) {
+    // Default to use all but one core if asynchronous, all cores if synchronous.
+    const unsigned threads_per_numa = thread_count / use_numa_count ;
+
+    use_cores_per_numa = ! threads_per_numa ? avail_cores_per_numa - ( allow_async ? 1 : 0 ) : (
+                           threads_per_numa < avail_cores_per_numa ? threads_per_numa : avail_cores_per_numa );
+  }
+
+  if ( ! thread_count ) {
+    thread_count = use_numa_count * use_cores_per_numa * avail_threads_per_core ;
+  }
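+
+  //  Illustrative defaults (assuming hwloc reports 2 NUMA regions, 8 cores per
+  //  region, 2 hardware threads per core, and no user input): use_numa_count
+  //  becomes 2, use_cores_per_numa becomes 7 when allow_async (one core is
+  //  reserved for the process), and thread_count becomes 2 * 7 * 2 == 28.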
+
+  //------------------------------------------------------------------------
+  // Input verification:
+
+  const bool valid_numa      = use_numa_count <= avail_numa_count ;
+  const bool valid_cores     = use_cores_per_numa &&
+                               use_cores_per_numa <= avail_cores_per_numa ;
+  const bool valid_threads   = thread_count &&
+                               thread_count <= use_numa_count * use_cores_per_numa * avail_threads_per_core ;
+  const bool balanced_numa   = ! ( thread_count % use_numa_count );
+  const bool balanced_cores  = ! ( thread_count % ( use_numa_count * use_cores_per_numa ) );
+
+  const bool valid_input = valid_numa && valid_cores && valid_threads && balanced_numa && balanced_cores ;
+
+  if ( ! valid_input ) {
+
+    std::ostringstream msg ;
+
+    msg << label << " HWLOC ERROR(s)" ;
+
+    if ( ! valid_threads ) {
+      msg << " : thread_count(" << thread_count
+          << ") exceeds capacity("
+          << use_numa_count * use_cores_per_numa * avail_threads_per_core
+          << ")" ;
+    }
+    if ( ! valid_numa ) {
+      msg << " : use_numa_count(" << use_numa_count
+          << ") exceeds capacity(" << avail_numa_count << ")" ;
+    }
+    if ( ! valid_cores ) {
+      msg << " : use_cores_per_numa(" << use_cores_per_numa
+          << ") exceeds capacity(" << avail_cores_per_numa << ")" ;
+    }
+    if ( ! balanced_numa ) {
+      msg << " : thread_count(" << thread_count
+          << ") imbalanced among numa(" << use_numa_count << ")" ;
+    }
+    if ( ! balanced_cores ) {
+      msg << " : thread_count(" << thread_count
+          << ") imbalanced among cores(" << use_numa_count * use_cores_per_numa << ")" ;
+    }
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  const unsigned thread_spawn_synchronous =
+    ( allow_async &&
+      1 < thread_count &&
+      ( use_numa_count     < avail_numa_count ||
+        use_cores_per_numa < avail_cores_per_numa ) )
+     ? 0 /* asynchronous */
+     : 1 /* synchronous, threads_coord[0] is process core */ ;
+
+  // Determine binding coordinates for to-be-spawned threads so that
+  // threads may be bound to cores as they are spawned.
+
+  const unsigned threads_per_core = thread_count / ( use_numa_count * use_cores_per_numa );
+
+  if ( thread_spawn_synchronous ) {
+    // Working synchronously and include process core as threads_coord[0].
+    // Swap the NUMA coordinate of the process core with 0
+    // Swap the CORE coordinate of the process core with 0
+    for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) {
+      const unsigned numa_coord = 0 == inuma ? proc_coord.first : ( proc_coord.first == inuma ? 0 : inuma );
+      for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) {
+        const unsigned core_coord = 0 == icore ? proc_coord.second : ( proc_coord.second == icore ? 0 : icore );
+        for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) {
+          threads_coord[i].first  = numa_coord ;
+          threads_coord[i].second = core_coord ;
+        }
+      }
+    }
+  }
+  else if ( use_numa_count < avail_numa_count ) {
+    // Working asynchronously and omit the process' NUMA region from the pool.
+    // Swap the NUMA coordinate of the process core with ( ( avail_numa_count - use_numa_count ) - 1 )
+    const unsigned numa_coord_swap = ( avail_numa_count - use_numa_count ) - 1 ;
+    for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) {
+      const unsigned numa_coord = proc_coord.first == inuma ? numa_coord_swap : inuma ;
+      for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) {
+        const unsigned core_coord = icore ;
+        for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) {
+          threads_coord[i].first  = numa_coord ;
+          threads_coord[i].second = core_coord ;
+        }
+      }
+    }
+  }
+  else if ( use_cores_per_numa < avail_cores_per_numa ) {
+    // Working asynchronously and omit the process' core from the pool.
+    // Swap the CORE coordinate of the process core with ( ( avail_cores_per_numa - use_cores_per_numa ) - 1 )
+    const unsigned core_coord_swap = ( avail_cores_per_numa - use_cores_per_numa ) - 1 ;
+    for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) {
+      const unsigned numa_coord = inuma ;
+      for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) {
+        const unsigned core_coord = proc_coord.second == icore ? core_coord_swap : icore ;
+        for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) {
+          threads_coord[i].first  = numa_coord ;
+          threads_coord[i].second = core_coord ;
+        }
+      }
+    }
+  }
+
+  return thread_spawn_synchronous ;
+}
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#if defined( KOKKOS_ENABLE_HWLOC )
+
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+
+/*--------------------------------------------------------------------------*/
+/* Third Party Libraries */
+
+/* Hardware locality library: http://www.open-mpi.org/projects/hwloc/ */
+#include <hwloc.h>
+
+#define  REQUIRED_HWLOC_API_VERSION  0x000010300
+
+#if HWLOC_API_VERSION < REQUIRED_HWLOC_API_VERSION
+#error "Requires  http://www.open-mpi.org/projects/hwloc/  Version 1.3 or greater"
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace hwloc {
+namespace {
+
+#if DEBUG_PRINT
+
+inline
+void print_bitmap( std::ostream & s , const hwloc_const_bitmap_t bitmap )
+{
+  s << "{" ;
+  for ( int i = hwloc_bitmap_first( bitmap ) ;
+        -1 != i ; i = hwloc_bitmap_next( bitmap , i ) ) {
+    s << " " << i ;
+  }
+  s << " }" ;
+}
+
+#endif
+
+enum { MAX_CORE = 1024 };
+
+std::pair<unsigned,unsigned> s_core_topology(0,0);
+unsigned                     s_core_capacity(0);
+hwloc_topology_t             s_hwloc_topology(0);
+hwloc_bitmap_t               s_hwloc_location(0);
+hwloc_bitmap_t               s_process_binding(0);
+hwloc_bitmap_t               s_core[ MAX_CORE ];
+bool                         s_can_bind_threads(true);
+
+struct Sentinel {
+  ~Sentinel();
+  Sentinel();
+};
+
+bool sentinel()
+{
+  static Sentinel self ;
+
+  if ( 0 == s_hwloc_topology ) {
+    std::cerr << "Kokkos::hwloc ERROR : Called after return from main()" << std::endl ;
+    std::cerr.flush();
+  }
+
+  return 0 != s_hwloc_topology ;
+}
+
+Sentinel::~Sentinel()
+{
+  hwloc_topology_destroy( s_hwloc_topology );
+  hwloc_bitmap_free( s_process_binding );
+  hwloc_bitmap_free( s_hwloc_location );
+
+  s_core_topology.first  = 0 ;
+  s_core_topology.second = 0 ;
+  s_core_capacity   = 0 ;
+  s_hwloc_topology  = 0 ;
+  s_hwloc_location  = 0 ;
+  s_process_binding = 0 ;
+}
+
+Sentinel::Sentinel()
+{
+#if defined(__MIC__)
+  static const bool remove_core_0 = true ;
+#else
+  static const bool remove_core_0 = false ;
+#endif
+
+  s_core_topology   = std::pair<unsigned,unsigned>(0,0);
+  s_core_capacity   = 0 ;
+  s_hwloc_topology  = 0 ;
+  s_hwloc_location  = 0 ;
+  s_process_binding = 0 ;
+
+  for ( unsigned i = 0 ; i < MAX_CORE ; ++i ) s_core[i] = 0 ;
+
+  hwloc_topology_init( & s_hwloc_topology );
+  hwloc_topology_load( s_hwloc_topology );
+
+  s_hwloc_location  = hwloc_bitmap_alloc();
+  s_process_binding = hwloc_bitmap_alloc();
+
+  hwloc_get_cpubind( s_hwloc_topology , s_process_binding ,  HWLOC_CPUBIND_PROCESS );
+
+  if ( hwloc_bitmap_iszero( s_process_binding ) ) {
+    if (Kokkos::show_warnings() ) {
+      std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL processing units" << std::endl;
+    }
+    const int pu_depth = hwloc_get_type_depth( s_hwloc_topology, HWLOC_OBJ_PU );
+    int num_pu = 1;
+    if ( pu_depth != HWLOC_TYPE_DEPTH_UNKNOWN ) {
+      num_pu = hwloc_get_nbobjs_by_depth( s_hwloc_topology, pu_depth );
+    }
+    else {
+      if (Kokkos::show_warnings() ) {
+        std::cerr << "WARNING: Cannot detect number of processing units -- ASSUMING 1 (serial)." << std::endl;
+      }
+      num_pu = 1;
+    }
+    hwloc_bitmap_set_range( s_process_binding, 0, num_pu-1);
+    s_can_bind_threads = false;
+  }
+
+
+  if ( remove_core_0 ) {
+
+    const hwloc_obj_t core = hwloc_get_obj_by_type( s_hwloc_topology , HWLOC_OBJ_CORE , 0 );
+
+    if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
+
+      hwloc_bitmap_t s_process_no_core_zero = hwloc_bitmap_alloc();
+
+      hwloc_bitmap_andnot( s_process_no_core_zero , s_process_binding , core->allowed_cpuset );
+
+      bool ok = 0 == hwloc_set_cpubind( s_hwloc_topology ,
+                                        s_process_no_core_zero ,
+                                        HWLOC_CPUBIND_PROCESS | HWLOC_CPUBIND_STRICT );
+
+      if ( ok ) {
+        hwloc_get_cpubind( s_hwloc_topology , s_process_binding ,  HWLOC_CPUBIND_PROCESS );
+
+        ok = 0 != hwloc_bitmap_isequal( s_process_binding , s_process_no_core_zero );
+      }
+
+      hwloc_bitmap_free( s_process_no_core_zero );
+
+      if ( Kokkos::show_warnings() && ! ok ) {
+        std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move process off of core #0" << std::endl ;
+      }
+    }
+  }
+
+  // Choose a hwloc object type for the NUMA level, which may not exist.
+
+  hwloc_obj_type_t root_type = HWLOC_OBJ_TYPE_MAX ;
+
+  {
+    // Object types to search, in order.
+    static const hwloc_obj_type_t candidate_root_type[] =
+      { HWLOC_OBJ_NODE     /* NUMA region     */
+      , HWLOC_OBJ_SOCKET   /* hardware socket */
+      , HWLOC_OBJ_MACHINE  /* local machine   */
+      };
+
+    enum { CANDIDATE_ROOT_TYPE_COUNT =
+             sizeof(candidate_root_type) / sizeof(hwloc_obj_type_t) };
+
+    for ( int k = 0 ; k < CANDIDATE_ROOT_TYPE_COUNT && HWLOC_OBJ_TYPE_MAX == root_type ; ++k ) {
+      if ( 0 < hwloc_get_nbobjs_by_type( s_hwloc_topology , candidate_root_type[k] ) ) {
+        root_type = candidate_root_type[k] ;
+      }
+    }
+  }
+
+  // Determine which of these 'root' types are available to this process.
+  // The process may have been bound (e.g., by MPI) to a subset of these root types.
+  // Determine the current location of the master (calling) process.
+
+  hwloc_bitmap_t proc_cpuset_location = hwloc_bitmap_alloc();
+
+  hwloc_get_last_cpu_location( s_hwloc_topology , proc_cpuset_location , HWLOC_CPUBIND_THREAD );
+
+  const unsigned max_root = hwloc_get_nbobjs_by_type( s_hwloc_topology , root_type );
+
+  unsigned root_base     = max_root ;
+  unsigned root_count    = 0 ;
+  unsigned core_per_root = 0 ;
+  unsigned pu_per_core   = 0 ;
+  bool     symmetric     = true ;
+
+  for ( unsigned i = 0 ; i < max_root ; ++i ) {
+
+    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , i );
+
+    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {
+
+      ++root_count ;
+
+      // Remember which root (NUMA) object the master thread is running on.
+      // This will be logical NUMA rank #0 for this process.
+
+      if ( hwloc_bitmap_intersects( proc_cpuset_location, root->allowed_cpuset ) ) {
+        root_base = i ;
+      }
+
+      // Count available cores:
+
+      const unsigned max_core =
+        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
+                                                root->allowed_cpuset ,
+                                                HWLOC_OBJ_CORE );
+
+      unsigned core_count = 0 ;
+
+      for ( unsigned j = 0 ; j < max_core ; ++j ) {
+
+        const hwloc_obj_t core =
+          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
+                                               root->allowed_cpuset ,
+                                               HWLOC_OBJ_CORE , j );
+
+        // If process' cpuset intersects core's cpuset then process can access this core.
+        // Must use intersection instead of inclusion because MPI on the
+        // Intel Phi may bind the process to only one of the core's hyperthreads.
+        //
+        // Assumption: if the process can access any hyperthread of the core
+        // then it has ownership of the entire core.
+        // This assumes that it would be performance-detrimental
+        // to spawn more than one MPI process per core and use nested threading.
+
+        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
+
+          ++core_count ;
+
+          const unsigned pu_count =
+            hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
+                                                    core->allowed_cpuset ,
+                                                    HWLOC_OBJ_PU );
+
+          if ( pu_per_core == 0 ) pu_per_core = pu_count ;
+
+          // Enforce symmetry by taking the minimum:
+
+          pu_per_core = std::min( pu_per_core , pu_count );
+
+          if ( pu_count != pu_per_core ) symmetric = false ;
+        }
+      }
+
+      if ( 0 == core_per_root ) core_per_root = core_count ;
+
+      // Enforce symmetry by taking the minimum:
+
+      core_per_root = std::min( core_per_root , core_count );
+
+      if ( core_count != core_per_root ) symmetric = false ;
+    }
+  }
+
+  s_core_topology.first  = root_count ;
+  s_core_topology.second = core_per_root ;
+  s_core_capacity        = pu_per_core ;
+
+  // Fill the 's_core' array for fast mapping from a core coordinate to the
+  // hwloc cpuset object required for thread location querying and binding.
+
+  for ( unsigned i = 0 ; i < max_root ; ++i ) {
+
+    const unsigned root_rank = ( i + root_base ) % max_root ;
+
+    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , root_rank );
+
+    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {
+
+      const unsigned max_core =
+        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
+                                                root->allowed_cpuset ,
+                                                HWLOC_OBJ_CORE );
+
+      unsigned core_count = 0 ;
+
+      for ( unsigned j = 0 ; j < max_core && core_count < core_per_root ; ++j ) {
+
+        const hwloc_obj_t core =
+          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
+                                               root->allowed_cpuset ,
+                                               HWLOC_OBJ_CORE , j );
+
+        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
+
+          s_core[ core_count + core_per_root * i ] = core->allowed_cpuset ;
+
+          ++core_count ;
+        }
+      }
+    }
+  }
+
+  hwloc_bitmap_free( proc_cpuset_location );
+
+  if ( Kokkos::show_warnings() && ! symmetric ) {
+    std::cerr << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
+              << std::endl ;
+  }
+}
+
+
+} // namespace
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+bool available()
+{ return true ; }
+
+unsigned get_available_numa_count()
+{ sentinel(); return s_core_topology.first ; }
+
+unsigned get_available_cores_per_numa()
+{ sentinel(); return s_core_topology.second ; }
+
+unsigned get_available_threads_per_core()
+{ sentinel(); return s_core_capacity ; }
+
+bool can_bind_threads()
+{ sentinel(); return s_can_bind_threads; }
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+unsigned bind_this_thread(
+  const unsigned               coordinate_count ,
+  std::pair<unsigned,unsigned> coordinate[] )
+{
+  unsigned i = 0 ;
+
+  try {
+    const std::pair<unsigned,unsigned> current = get_this_thread_coordinate();
+
+    // Match one of the requests:
+    for ( i = 0 ; i < coordinate_count && current != coordinate[i] ; ++i );
+
+    if ( coordinate_count == i ) {
+      // Match the first request (typically NUMA):
+      for ( i = 0 ; i < coordinate_count && current.first != coordinate[i].first ; ++i );
+    }
+
+    if ( coordinate_count == i ) {
+      // Match any unclaimed request:
+      for ( i = 0 ; i < coordinate_count && ~0u == coordinate[i].first  ; ++i );
+    }
+
+    if ( coordinate_count == i || ! bind_this_thread( coordinate[i] ) ) {
+       // Failed to bind:
+       i = ~0u ;
+    }
+
+    if ( i < coordinate_count ) {
+
+#if DEBUG_PRINT
+      if ( current != coordinate[i] ) {
+        std::cout << "  bind_this_thread: rebinding from ("
+                  << current.first << ","
+                  << current.second
+                  << ") to ("
+                  << coordinate[i].first << ","
+                  << coordinate[i].second
+                  << ")" << std::endl ;
+      }
+#endif
+
+      coordinate[i].first  = ~0u ;
+      coordinate[i].second = ~0u ;
+    }
+  }
+  catch( ... ) {
+    i = ~0u ;
+  }
+
+  return i ;
+}
+
+
+bool bind_this_thread( const std::pair<unsigned,unsigned> coord )
+{
+  if ( ! sentinel() ) return false ;
+
+#if DEBUG_PRINT
+
+  std::cout << "Kokkos::bind_this_thread() at " ;
+
+  hwloc_get_last_cpu_location( s_hwloc_topology ,
+                               s_hwloc_location , HWLOC_CPUBIND_THREAD );
+
+  print_bitmap( std::cout , s_hwloc_location );
+
+  std::cout << " to " ;
+
+  print_bitmap( std::cout , s_core[ coord.second + coord.first * s_core_topology.second ] );
+
+  std::cout << std::endl ;
+
+#endif
+
+  // As safe and fast as possible.
+  // Fast-lookup by caching the coordinate -> hwloc cpuset mapping in 's_core'.
+  return coord.first  < s_core_topology.first &&
+         coord.second < s_core_topology.second &&
+         0 == hwloc_set_cpubind( s_hwloc_topology ,
+                                 s_core[ coord.second + coord.first * s_core_topology.second ] ,
+                                 HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT );
+}
+
+bool unbind_this_thread()
+{
+  if ( ! sentinel() ) return false ;
+
+#define HWLOC_DEBUG_PRINT 0
+
+#if HWLOC_DEBUG_PRINT
+
+  std::cout << "Kokkos::unbind_this_thread() from " ;
+
+  hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD );
+
+  print_bitmap( std::cout , s_hwloc_location );
+
+#endif
+
+  const bool result =
+    s_hwloc_topology &&
+    0 == hwloc_set_cpubind( s_hwloc_topology ,
+                            s_process_binding ,
+                            HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT );
+
+#if HWLOC_DEBUG_PRINT
+
+  std::cout << " to " ;
+
+  hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD );
+
+  print_bitmap( std::cout , s_hwloc_location );
+
+  std::cout << std::endl ;
+
+#endif
+
+  return result ;
+
+#undef HWLOC_DEBUG_PRINT
+
+}
+
+//----------------------------------------------------------------------------
+
+std::pair<unsigned,unsigned> get_this_thread_coordinate()
+{
+  std::pair<unsigned,unsigned> coord(0u,0u);
+
+  if ( ! sentinel() ) return coord ;
+
+  const unsigned n = s_core_topology.first * s_core_topology.second ;
+
+  // Using the pre-allocated 's_hwloc_location' to avoid memory
+  // allocation by this thread.  This call is NOT thread-safe.
+  hwloc_get_last_cpu_location( s_hwloc_topology ,
+                               s_hwloc_location , HWLOC_CPUBIND_THREAD );
+
+  unsigned i = 0 ;
+
+  while ( i < n && ! hwloc_bitmap_intersects( s_hwloc_location , s_core[ i ] ) ) ++i ;
+
+  if ( i < n ) {
+    coord.first  = i / s_core_topology.second ;
+    coord.second = i % s_core_topology.second ;
+  }
+
+  return coord ;
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#else /* ! defined( KOKKOS_ENABLE_HWLOC ) */
+
+namespace Kokkos {
+namespace hwloc {
+
+bool available() { return false ; }
+bool can_bind_threads() { return false ; }
+
+unsigned get_available_numa_count() { return 1 ; }
+unsigned get_available_cores_per_numa() { return 1 ; }
+unsigned get_available_threads_per_core() { return 1 ; }
+
+unsigned bind_this_thread( const unsigned , std::pair<unsigned,unsigned>[] )
+{ return ~0 ; }
+
+bool bind_this_thread( const std::pair<unsigned,unsigned> )
+{ return false ; }
+
+bool unbind_this_thread()
+{ return true ; }
+
+std::pair<unsigned,unsigned> get_this_thread_coordinate()
+{ return std::pair<unsigned,unsigned>(0,0); }
+
+} // namespace hwloc
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
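+
+/* Illustrative usage sketch (a non-normative example; it assumes hwloc support
+ * is enabled and that coordinate (0,0) exists on the machine):
+ *
+ *   const unsigned numa  = Kokkos::hwloc::get_available_numa_count();
+ *   const unsigned cores = Kokkos::hwloc::get_available_cores_per_numa();
+ *
+ *   if ( Kokkos::hwloc::can_bind_threads() && 0 < numa && 0 < cores ) {
+ *     // Bind the calling thread to NUMA region 0, core 0.
+ *     Kokkos::hwloc::bind_this_thread( std::pair<unsigned,unsigned>( 0 , 0 ) );
+ *   }
+ */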
+
diff --git a/packages/kokkos/core/unit_test/CMakeLists.txt b/packages/kokkos/core/unit_test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..74033e27db537d2d6a3ead60cc8393af23f73093
--- /dev/null
+++ b/packages/kokkos/core/unit_test/CMakeLists.txt
@@ -0,0 +1,354 @@
+#
+# Add test-only library for gtest to be reused by all the subpackages
+#
+
+IF(NOT KOKKOS_HAS_TRILINOS)
+  IF(KOKKOS_SEPARATE_LIBS)
+    set(TEST_LINK_TARGETS kokkoscore)
+  ELSE()
+    set(TEST_LINK_TARGETS kokkos)
+  ENDIF()
+ENDIF()
+
+SET(GTEST_SOURCE_DIR ${${PARENT_PACKAGE_NAME}_SOURCE_DIR}/tpls/gtest)
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGTEST_HAS_PTHREAD=0")
+
+INCLUDE_DIRECTORIES(${GTEST_SOURCE_DIR})
+TRIBITS_ADD_LIBRARY(
+  kokkos_gtest
+  HEADERS ${GTEST_SOURCE_DIR}/gtest/gtest.h
+  SOURCES ${GTEST_SOURCE_DIR}/gtest/gtest-all.cc
+  TESTONLY
+  )
+
+#
+# Define the tests
+#
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
+
+IF(Kokkos_ENABLE_Serial)
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_Serial
+    SOURCES
+      UnitTestMainInit.cpp
+      serial/TestSerial_AtomicOperations.cpp
+      serial/TestSerial_AtomicViews.cpp
+      serial/TestSerial_Atomics.cpp
+      serial/TestSerial_Complex.cpp
+      serial/TestSerial_Init.cpp
+      serial/TestSerial_MDRange.cpp
+      serial/TestSerial_Other.cpp
+      serial/TestSerial_RangePolicy.cpp
+      serial/TestSerial_Reductions.cpp
+      serial/TestSerial_Scan.cpp
+      serial/TestSerial_SharedAlloc.cpp
+      serial/TestSerial_SubView_a.cpp
+      serial/TestSerial_SubView_b.cpp
+      serial/TestSerial_SubView_c01.cpp
+      serial/TestSerial_SubView_c02.cpp
+      serial/TestSerial_SubView_c03.cpp
+      serial/TestSerial_SubView_c04.cpp
+      serial/TestSerial_SubView_c05.cpp
+      serial/TestSerial_SubView_c06.cpp
+      serial/TestSerial_SubView_c07.cpp
+      serial/TestSerial_SubView_c08.cpp
+      serial/TestSerial_SubView_c09.cpp
+      serial/TestSerial_SubView_c10.cpp
+      serial/TestSerial_SubView_c11.cpp
+      serial/TestSerial_SubView_c12.cpp
+      serial/TestSerial_SubView_c13.cpp
+      serial/TestSerial_Team.cpp
+      serial/TestSerial_TeamReductionScan.cpp
+      serial/TestSerial_TeamScratch.cpp
+      serial/TestSerial_ViewAPI_b.cpp
+      serial/TestSerial_ViewMapping_a.cpp
+      serial/TestSerial_ViewMapping_b.cpp
+      serial/TestSerial_ViewMapping_subview.cpp
+      serial/TestSerial_ViewOfClass.cpp
+      serial/TestSerial_Crs.cpp
+      serial/TestSerial_WorkGraph.cpp
+    COMM serial mpi
+    NUM_MPI_PROCS 1
+    FAIL_REGULAR_EXPRESSION "  FAILED  "
+    TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+  )
+ENDIF()
+
+IF(Kokkos_ENABLE_Pthread)
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_Threads
+    SOURCES
+      UnitTestMainInit.cpp
+      threads/TestThreads_AtomicOperations.cpp
+      threads/TestThreads_AtomicViews.cpp
+      threads/TestThreads_Atomics.cpp
+      threads/TestThreads_Complex.cpp
+      threads/TestThreads_Init.cpp
+      threads/TestThreads_MDRange.cpp
+      threads/TestThreads_Other.cpp
+      threads/TestThreads_RangePolicy.cpp
+      threads/TestThreads_Reductions.cpp
+      threads/TestThreads_Scan.cpp
+      threads/TestThreads_SharedAlloc.cpp
+      threads/TestThreads_SubView_a.cpp
+      threads/TestThreads_SubView_b.cpp
+      threads/TestThreads_SubView_c01.cpp
+      threads/TestThreads_SubView_c02.cpp
+      threads/TestThreads_SubView_c03.cpp
+      threads/TestThreads_SubView_c04.cpp
+      threads/TestThreads_SubView_c05.cpp
+      threads/TestThreads_SubView_c06.cpp
+      threads/TestThreads_SubView_c07.cpp
+      threads/TestThreads_SubView_c08.cpp
+      threads/TestThreads_SubView_c09.cpp
+      threads/TestThreads_SubView_c10.cpp
+      threads/TestThreads_SubView_c11.cpp
+      threads/TestThreads_SubView_c12.cpp
+      threads/TestThreads_SubView_c13.cpp
+      threads/TestThreads_Team.cpp
+      threads/TestThreads_TeamReductionScan.cpp
+      threads/TestThreads_TeamScratch.cpp
+      threads/TestThreads_ViewAPI_b.cpp
+      threads/TestThreads_ViewMapping_a.cpp
+      threads/TestThreads_ViewMapping_b.cpp
+      threads/TestThreads_ViewMapping_subview.cpp
+      threads/TestThreads_ViewOfClass.cpp
+      threads/TestThreads_Crs.cpp
+      threads/TestThreads_WorkGraph.cpp
+    COMM serial mpi
+    NUM_MPI_PROCS 1
+    FAIL_REGULAR_EXPRESSION "  FAILED  "
+    TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+  )
+ENDIF()
+
+IF(Kokkos_ENABLE_OpenMP)
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_OpenMP
+    SOURCES
+      UnitTestMainInit.cpp
+      openmp/TestOpenMP_AtomicOperations.cpp
+      openmp/TestOpenMP_AtomicViews.cpp
+      openmp/TestOpenMP_Atomics.cpp
+      openmp/TestOpenMP_Complex.cpp
+      openmp/TestOpenMP_Init.cpp
+      openmp/TestOpenMP_MDRange.cpp
+      openmp/TestOpenMP_Other.cpp
+      openmp/TestOpenMP_RangePolicy.cpp
+      openmp/TestOpenMP_Reductions.cpp
+      openmp/TestOpenMP_Scan.cpp
+      openmp/TestOpenMP_SharedAlloc.cpp
+      openmp/TestOpenMP_SubView_a.cpp
+      openmp/TestOpenMP_SubView_b.cpp
+      openmp/TestOpenMP_SubView_c01.cpp
+      openmp/TestOpenMP_SubView_c02.cpp
+      openmp/TestOpenMP_SubView_c03.cpp
+      openmp/TestOpenMP_SubView_c04.cpp
+      openmp/TestOpenMP_SubView_c05.cpp
+      openmp/TestOpenMP_SubView_c06.cpp
+      openmp/TestOpenMP_SubView_c07.cpp
+      openmp/TestOpenMP_SubView_c08.cpp
+      openmp/TestOpenMP_SubView_c09.cpp
+      openmp/TestOpenMP_SubView_c10.cpp
+      openmp/TestOpenMP_SubView_c11.cpp
+      openmp/TestOpenMP_SubView_c12.cpp
+      openmp/TestOpenMP_SubView_c13.cpp
+      openmp/TestOpenMP_Task.cpp
+      openmp/TestOpenMP_Team.cpp
+      openmp/TestOpenMP_TeamReductionScan.cpp
+      openmp/TestOpenMP_ViewAPI_b.cpp
+      openmp/TestOpenMP_ViewMapping_a.cpp
+      openmp/TestOpenMP_ViewMapping_b.cpp
+      openmp/TestOpenMP_ViewMapping_subview.cpp
+      openmp/TestOpenMP_ViewOfClass.cpp
+      openmp/TestOpenMP_Crs.cpp
+      openmp/TestOpenMP_WorkGraph.cpp
+      openmp/TestOpenMP_UniqueToken.cpp
+    COMM serial mpi
+    NUM_MPI_PROCS 1
+    FAIL_REGULAR_EXPRESSION "  FAILED  "
+    TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+  )
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_OpenMPInterOp
+    SOURCES
+      UnitTestMain.cpp
+      openmp/TestOpenMP_InterOp.cpp
+    COMM serial mpi
+    NUM_MPI_PROCS 1
+    FAIL_REGULAR_EXPRESSION "  FAILED  "
+    TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+  )
+ENDIF()
+
+IF(Kokkos_ENABLE_Qthreads)
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_Qthreads
+    SOURCES
+      UnitTestMainInit.cpp
+      qthreads/TestQthreads_Atomics.cpp
+      qthreads/TestQthreads_Complex.cpp
+      qthreads/TestQthreads_Other.cpp
+      qthreads/TestQthreads_Reductions.cpp
+      qthreads/TestQthreads_SubView_a.cpp
+      qthreads/TestQthreads_SubView_b.cpp
+      qthreads/TestQthreads_SubView_c01.cpp
+      qthreads/TestQthreads_SubView_c02.cpp
+      qthreads/TestQthreads_SubView_c03.cpp
+      qthreads/TestQthreads_SubView_c04.cpp
+      qthreads/TestQthreads_SubView_c05.cpp
+      qthreads/TestQthreads_SubView_c06.cpp
+      qthreads/TestQthreads_SubView_c07.cpp
+      qthreads/TestQthreads_SubView_c08.cpp
+      qthreads/TestQthreads_SubView_c09.cpp
+      qthreads/TestQthreads_SubView_c10.cpp
+      qthreads/TestQthreads_SubView_c11.cpp
+      qthreads/TestQthreads_SubView_c12.cpp
+      qthreads/TestQthreads_SubView_c13.cpp
+      qthreads/TestQthreads_Team.cpp
+      qthreads/TestQthreads_ViewAPI_a.cpp
+      qthreads/TestQthreads_ViewAPI_b.cpp
+    COMM serial mpi
+    NUM_MPI_PROCS 1
+    FAIL_REGULAR_EXPRESSION "  FAILED  "
+    TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+  )
+ENDIF()
+
+IF(Kokkos_ENABLE_Cuda)
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_Cuda
+    SOURCES
+      UnitTestMainInit.cpp
+      cuda/TestCudaHostPinned_SharedAlloc.cpp
+      cuda/TestCudaHostPinned_ViewAPI.cpp
+      cuda/TestCudaHostPinned_ViewMapping_a.cpp
+      cuda/TestCudaHostPinned_ViewMapping_b.cpp
+      cuda/TestCudaHostPinned_ViewMapping_subview.cpp
+      cuda/TestCudaUVM_SharedAlloc.cpp
+      cuda/TestCudaUVM_ViewAPI.cpp
+      cuda/TestCudaUVM_ViewMapping_a.cpp
+      cuda/TestCudaUVM_ViewMapping_b.cpp
+      cuda/TestCudaUVM_ViewMapping_subview.cpp
+      cuda/TestCuda_AtomicOperations.cpp
+      cuda/TestCuda_AtomicViews.cpp
+      cuda/TestCuda_Atomics.cpp
+      cuda/TestCuda_Complex.cpp
+      cuda/TestCuda_Init.cpp
+      cuda/TestCuda_MDRange.cpp
+      cuda/TestCuda_Other.cpp
+      cuda/TestCuda_RangePolicy.cpp
+      cuda/TestCuda_Reductions.cpp
+      cuda/TestCuda_Scan.cpp
+      cuda/TestCuda_SharedAlloc.cpp
+      cuda/TestCuda_Spaces.cpp
+      cuda/TestCuda_SubView_a.cpp
+      cuda/TestCuda_SubView_b.cpp
+      cuda/TestCuda_SubView_c01.cpp
+      cuda/TestCuda_SubView_c02.cpp
+      cuda/TestCuda_SubView_c03.cpp
+      cuda/TestCuda_SubView_c04.cpp
+      cuda/TestCuda_SubView_c05.cpp
+      cuda/TestCuda_SubView_c06.cpp
+      cuda/TestCuda_SubView_c07.cpp
+      cuda/TestCuda_SubView_c08.cpp
+      cuda/TestCuda_SubView_c09.cpp
+      cuda/TestCuda_SubView_c10.cpp
+      cuda/TestCuda_SubView_c11.cpp
+      cuda/TestCuda_SubView_c12.cpp
+      cuda/TestCuda_SubView_c13.cpp
+      cuda/TestCuda_Task.cpp
+      cuda/TestCuda_Team.cpp
+      cuda/TestCuda_TeamReductionScan.cpp
+      cuda/TestCuda_TeamScratch.cpp
+      cuda/TestCuda_ViewAPI_b.cpp
+      cuda/TestCuda_ViewMapping_a.cpp
+      cuda/TestCuda_ViewMapping_b.cpp
+      cuda/TestCuda_ViewMapping_subview.cpp
+      cuda/TestCuda_ViewOfClass.cpp
+      cuda/TestCuda_Crs.cpp
+      cuda/TestCuda_WorkGraph.cpp
+      cuda/TestCuda_UniqueToken.cpp
+    COMM serial mpi
+    NUM_MPI_PROCS 1
+    FAIL_REGULAR_EXPRESSION "  FAILED  "
+    TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+  )
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_CudaInterOp
+    SOURCES
+      UnitTestMain.cpp
+      cuda/TestCuda_InterOp.cpp
+    COMM serial mpi
+    NUM_MPI_PROCS 1
+    FAIL_REGULAR_EXPRESSION "  FAILED  "
+    TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+  )
+ENDIF()
+
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_Default
+  SOURCES
+    UnitTestMainInit.cpp
+    default/TestDefaultDeviceType.cpp
+    default/TestDefaultDeviceType_a.cpp
+    default/TestDefaultDeviceType_b.cpp
+    default/TestDefaultDeviceType_c.cpp
+    default/TestDefaultDeviceType_d.cpp
+    default/TestDefaultDeviceTypeResize.cpp
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+)
+
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_PushFinalizeHook
+  SOURCES
+    UnitTest_PushFinalizeHook.cpp
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "FAILED"
+  TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+)
+
+# This test is special, because it passes exactly when it prints the
+# message "PASSED: I am the custom std::terminate handler.", AND calls
+# std::terminate.  This means that we can't use
+# TRIBITS_ADD_EXECUTABLE_AND_TEST.  See GitHub issue #2147.
+
+TRIBITS_ADD_EXECUTABLE( push_finalize_hook_terminate
+  SOURCES UnitTest_PushFinalizeHook_terminate.cpp
+  TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+)
+
+TRIBITS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
+  TEST_0
+    EXEC push_finalize_hook_terminate
+    NUM_MPI_PROCS 1
+    PASS_REGULAR_EXPRESSION
+      "PASSED: I am the custom std::terminate handler."
+    ALWAYS_FAIL_ON_ZERO_RETURN
+)
+
+foreach(INITTESTS_NUM RANGE 1 16)
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_DefaultInit_${INITTESTS_NUM}
+  SOURCES UnitTestMain.cpp default/TestDefaultDeviceTypeInit_${INITTESTS_NUM}.cpp
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+)
+endforeach(INITTESTS_NUM)
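+# The loop above defines sixteen executables/tests, UnitTest_DefaultInit_1
+# through UnitTest_DefaultInit_16, each built from its matching
+# default/TestDefaultDeviceTypeInit_<N>.cpp source file.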
+
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_HWLOC
+  SOURCES UnitTestMain.cpp  TestHWLOC.cpp
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS}
+)
diff --git a/packages/kokkos/core/unit_test/Makefile b/packages/kokkos/core/unit_test/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..42d604548e2f06de561a22f99ac6fc11ab7dcf95
--- /dev/null
+++ b/packages/kokkos/core/unit_test/Makefile
@@ -0,0 +1,398 @@
+KOKKOS_PATH = ../..
+
+GTEST_PATH = ../../tpls/gtest
+
+vpath %.cpp ${KOKKOS_PATH}/core/unit_test
+vpath %.cpp ${KOKKOS_PATH}/core/unit_test/default
+vpath %.cpp ${KOKKOS_PATH}/core/unit_test/serial
+vpath %.cpp ${KOKKOS_PATH}/core/unit_test/threads
+vpath %.cpp ${KOKKOS_PATH}/core/unit_test/openmp
+vpath %.cpp ${KOKKOS_PATH}/core/unit_test/openmptarget
+vpath %.cpp ${KOKKOS_PATH}/core/unit_test/qthreads
+vpath %.cpp ${KOKKOS_PATH}/core/unit_test/cuda
+vpath %.cpp ${KOKKOS_PATH}/core/unit_test/rocm
+
+
+TEST_HEADERS = $(wildcard $(KOKKOS_PATH)/core/unit_test/*.hpp)
+TEST_HEADERS += $(wildcard $(KOKKOS_PATH)/core/unit_test/*/*.hpp)
+
+default: build_all
+	echo "End Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
+else
+  CXX = g++
+endif
+
+CXXFLAGS = -O3
+LINK ?= $(CXX)
+LDFLAGS ?=
+override LDFLAGS += -lpthread
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/unit_test
+
+TEST_TARGETS =
+TARGETS =
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	OBJ_CUDA = UnitTestMainInit.o gtest-all.o
+	OBJ_CUDA += TestCuda_Init.o
+	OBJ_CUDA += TestCuda_SharedAlloc.o TestCudaUVM_SharedAlloc.o TestCudaHostPinned_SharedAlloc.o
+	OBJ_CUDA += TestCuda_RangePolicy.o
+	OBJ_CUDA += TestCuda_ViewAPI_b.o
+	OBJ_CUDA += TestCuda_ViewMapping_a.o TestCuda_ViewMapping_b.o TestCuda_ViewMapping_subview.o
+	OBJ_CUDA += TestCudaUVM_ViewAPI.o
+	OBJ_CUDA += TestCudaUVM_ViewMapping_a.o TestCudaUVM_ViewMapping_b.o TestCudaUVM_ViewMapping_subview.o
+	OBJ_CUDA += TestCudaHostPinned_ViewAPI.o
+	OBJ_CUDA += TestCudaHostPinned_ViewMapping_a.o TestCudaHostPinned_ViewMapping_b.o TestCudaHostPinned_ViewMapping_subview.o
+	OBJ_CUDA += TestCuda_ViewOfClass.o
+	OBJ_CUDA += TestCuda_SubView_a.o TestCuda_SubView_b.o
+	OBJ_CUDA += TestCuda_SubView_c01.o TestCuda_SubView_c02.o TestCuda_SubView_c03.o
+	OBJ_CUDA += TestCuda_SubView_c04.o TestCuda_SubView_c05.o TestCuda_SubView_c06.o
+	OBJ_CUDA += TestCuda_SubView_c07.o TestCuda_SubView_c08.o TestCuda_SubView_c09.o
+	OBJ_CUDA += TestCuda_SubView_c10.o TestCuda_SubView_c11.o TestCuda_SubView_c12.o
+	OBJ_CUDA += TestCuda_SubView_c13.o
+	OBJ_CUDA += TestCuda_Reductions.o TestCuda_Scan.o
+	OBJ_CUDA += TestCuda_Complex.o
+	OBJ_CUDA += TestCuda_AtomicOperations.o TestCuda_AtomicViews.o TestCuda_Atomics.o
+	OBJ_CUDA += TestCuda_Team.o TestCuda_TeamScratch.o
+	OBJ_CUDA += TestCuda_TeamReductionScan.o
+	OBJ_CUDA += TestCuda_Other.o
+	OBJ_CUDA += TestCuda_MDRange.o
+	OBJ_CUDA += TestCuda_Crs.o
+	OBJ_CUDA += TestCuda_Task.o TestCuda_WorkGraph.o
+	OBJ_CUDA += TestCuda_Spaces.o
+	OBJ_CUDA += TestCuda_UniqueToken.o
+	
+	TARGETS += KokkosCore_UnitTest_Cuda
+    TARGETS += KokkosCore_UnitTest_CudaInterOp
+	TEST_TARGETS += test-cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+        OBJ_ROCM = UnitTestMainInit.o gtest-all.o
+        OBJ_ROCM += TestROCm_Init.o
+        OBJ_ROCM += TestROCm_Complex.o
+        OBJ_ROCM += TestROCm_RangePolicy.o
+        OBJ_ROCM += TestROCm_AtomicOperations.o
+        OBJ_ROCM += TestROCm_Atomics.o
+# complex failing
+        OBJ_ROCM += TestROCm_AtomicViews.o
+        OBJ_ROCM += TestROCm_Other.o
+#        OBJ_ROCM += TestROCm_MDRange.o
+# rocm.memory_pool
+        OBJ_ROCM += TestROCm_Scan.o
+        OBJ_ROCM += TestROCm_SharedAlloc.o
+        OBJ_ROCM += TestROCm_SubView_a.o
+        OBJ_ROCM += TestROCm_SubView_b.o
+# relies on host accessible device memory
+#        OBJ_ROCM += TestROCm_SubView_c01.o
+#        OBJ_ROCM += TestROCm_SubView_c02.o
+#        OBJ_ROCM += TestROCm_SubView_c03.o
+#        OBJ_ROCM += TestROCm_SubView_c04.o
+#        OBJ_ROCM += TestROCm_SubView_c05.o
+#        OBJ_ROCM += TestROCm_SubView_c06.o
+#        OBJ_ROCM += TestROCm_SubView_c07.o
+#        OBJ_ROCM += TestROCm_SubView_c08.o
+#        OBJ_ROCM += TestROCm_SubView_c09.o
+#        OBJ_ROCM += TestROCm_SubView_c10.o
+#        OBJ_ROCM += TestROCm_SubView_c11.o
+#        OBJ_ROCM += TestROCm_SubView_c12.o
+# all of the above use UVM or Host accessible memory
+#        OBJ_ROCM += TestROCm_Team.o
+# compile fails
+#        OBJ_ROCM += TestROCm_TeamReductionScan.o
+# compile fails
+#        OBJ_ROCM += TestROCm_TeamScratch.o
+        OBJ_ROCM += TestROCm_ViewAPI_b.o
+        OBJ_ROCM += TestROCm_ViewMapping_a.o
+        OBJ_ROCM += TestROCm_ViewMapping_b.o
+        OBJ_ROCM += TestROCm_ViewMapping_subview.o
+	OBJ_ROCM += TestROCmHostPinned_ViewAPI.o
+	OBJ_ROCM += TestROCmHostPinned_ViewMapping_a.o 
+	OBJ_ROCM += TestROCmHostPinned_ViewMapping_b.o 
+	OBJ_ROCM += TestROCmHostPinned_ViewMapping_subview.o
+        OBJ_ROCM += TestROCm_ViewOfClass.o
+	OBJ_ROCM += TestROCm_Spaces.o
+     
+        TARGETS += KokkosCore_UnitTest_ROCm
+        TEST_TARGETS += test-rocm
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+
+	OBJ_THREADS = UnitTestMainInit.o gtest-all.o
+	OBJ_THREADS += TestThreads_Init.o
+	OBJ_THREADS += TestThreads_SharedAlloc.o
+	OBJ_THREADS += TestThreads_RangePolicy.o
+	OBJ_THREADS += TestThreads_ViewAPI_b.o
+	OBJ_THREADS += TestThreads_ViewMapping_a.o TestThreads_ViewMapping_b.o TestThreads_ViewMapping_subview.o
+	OBJ_THREADS += TestThreads_ViewOfClass.o
+	OBJ_THREADS += TestThreads_SubView_a.o TestThreads_SubView_b.o
+	OBJ_THREADS += TestThreads_SubView_c01.o TestThreads_SubView_c02.o TestThreads_SubView_c03.o
+	OBJ_THREADS += TestThreads_SubView_c04.o TestThreads_SubView_c05.o TestThreads_SubView_c06.o
+	OBJ_THREADS += TestThreads_SubView_c07.o TestThreads_SubView_c08.o TestThreads_SubView_c09.o
+	OBJ_THREADS += TestThreads_SubView_c10.o TestThreads_SubView_c11.o TestThreads_SubView_c12.o
+	OBJ_THREADS += TestThreads_Reductions.o TestThreads_Scan.o
+	OBJ_THREADS += TestThreads_Complex.o
+	OBJ_THREADS += TestThreads_AtomicOperations.o TestThreads_AtomicViews.o TestThreads_Atomics.o
+	OBJ_THREADS += TestThreads_Team.o TestThreads_TeamScratch.o
+	OBJ_THREADS += TestThreads_TeamReductionScan.o
+	OBJ_THREADS += TestThreads_Other.o
+	OBJ_THREADS += TestThreads_MDRange.o
+
+	TARGETS += KokkosCore_UnitTest_Threads
+
+	TEST_TARGETS += test-threads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	OBJ_OPENMP = UnitTestMainInit.o gtest-all.o
+	OBJ_OPENMP += TestOpenMP_Init.o
+	OBJ_OPENMP += TestOpenMP_SharedAlloc.o
+	OBJ_OPENMP += TestOpenMP_RangePolicy.o
+	OBJ_OPENMP += TestOpenMP_ViewAPI_b.o
+	OBJ_OPENMP += TestOpenMP_ViewMapping_a.o TestOpenMP_ViewMapping_b.o TestOpenMP_ViewMapping_subview.o
+	OBJ_OPENMP += TestOpenMP_ViewOfClass.o
+	OBJ_OPENMP += TestOpenMP_SubView_a.o TestOpenMP_SubView_b.o
+	OBJ_OPENMP += TestOpenMP_SubView_c01.o TestOpenMP_SubView_c02.o TestOpenMP_SubView_c03.o
+	OBJ_OPENMP += TestOpenMP_SubView_c04.o TestOpenMP_SubView_c05.o TestOpenMP_SubView_c06.o
+	OBJ_OPENMP += TestOpenMP_SubView_c07.o TestOpenMP_SubView_c08.o TestOpenMP_SubView_c09.o
+	OBJ_OPENMP += TestOpenMP_SubView_c10.o TestOpenMP_SubView_c11.o TestOpenMP_SubView_c12.o
+	OBJ_OPENMP += TestOpenMP_SubView_c13.o
+	OBJ_OPENMP += TestOpenMP_Reductions.o TestOpenMP_Scan.o
+	OBJ_OPENMP += TestOpenMP_Complex.o
+	OBJ_OPENMP += TestOpenMP_AtomicOperations.o TestOpenMP_AtomicViews.o TestOpenMP_Atomics.o
+	OBJ_OPENMP += TestOpenMP_Team.o TestOpenMP_TeamScratch.o
+	OBJ_OPENMP += TestOpenMP_TeamReductionScan.o
+	OBJ_OPENMP += TestOpenMP_Other.o
+	OBJ_OPENMP += TestOpenMP_MDRange.o
+	OBJ_OPENMP += TestOpenMP_Crs.o
+	OBJ_OPENMP += TestOpenMP_Task.o TestOpenMP_WorkGraph.o
+	OBJ_OPENMP += TestOpenMP_UniqueToken.o
+	
+	TARGETS += KokkosCore_UnitTest_OpenMP
+    TARGETS += KokkosCore_UnitTest_OpenMPInterOp
+
+	TEST_TARGETS += test-openmp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+	OBJ_OPENMPTARGET = UnitTestMainInit.o gtest-all.o
+	OBJ_OPENMPTARGET += TestOpenMPTarget_Init.o
+	#OBJ_OPENMPTARGET += TestOpenMPTarget_SharedAlloc.o
+	OBJ_OPENMPTARGET += TestOpenMPTarget_RangePolicy.o
+	OBJ_OPENMPTARGET += TestOpenMPTarget_ViewAPI_b.o #Some commented out code
+	OBJ_OPENMPTARGET += TestOpenMPTarget_ViewMapping_a.o 
+        OBJ_OPENMPTARGET += TestOpenMPTarget_ViewMapping_b.o 
+        OBJ_OPENMPTARGET += TestOpenMPTarget_ViewMapping_subview.o
+	#OBJ_OPENMPTARGET += TestOpenMPTarget_ViewOfClass.o
+	OBJ_OPENMPTARGET += TestOpenMPTarget_SubView_a.o TestOpenMPTarget_SubView_b.o
+	#The following subview tests need something like UVM:
+        #OBJ_OPENMPTARGET += TestOpenMPTarget_SubView_c01.o TestOpenMPTarget_SubView_c02.o TestOpenMPTarget_SubView_c03.o
+	#OBJ_OPENMPTARGET += TestOpenMPTarget_SubView_c04.o TestOpenMPTarget_SubView_c05.o TestOpenMPTarget_SubView_c06.o
+	#OBJ_OPENMPTARGET += TestOpenMPTarget_SubView_c07.o TestOpenMPTarget_SubView_c08.o TestOpenMPTarget_SubView_c09.o
+	#OBJ_OPENMPTARGET += TestOpenMPTarget_SubView_c10.o TestOpenMPTarget_SubView_c11.o TestOpenMPTarget_SubView_c12.o
+	#OBJ_OPENMPTARGET += TestOpenMPTarget_Reductions.o # Need custom reductions
+        #OBJ_OPENMPTARGET += TestOpenMPTarget_Scan.o
+	OBJ_OPENMPTARGET += TestOpenMPTarget_Complex.o
+	OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicOperations.o 
+        OBJ_OPENMPTARGET += TestOpenMPTarget_AtomicViews.o 
+        OBJ_OPENMPTARGET += TestOpenMPTarget_Atomics.o # Commented Out Arbitrary Type Atomics
+	#OBJ_OPENMPTARGET += TestOpenMPTarget_Team.o # There is still a static function in this
+        #OBJ_OPENMPTARGET += TestOpenMPTarget_TeamScratch.o
+	#OBJ_OPENMPTARGET += TestOpenMPTarget_TeamReductionScan.o
+	#OBJ_OPENMPTARGET += TestOpenMPTarget_Other.o
+	#OBJ_OPENMPTARGET += TestOpenMPTarget_MDRange.o
+	#OBJ_OPENMPTARGET += TestOpenMPTarget_Task.o
+	
+	TARGETS += KokkosCore_UnitTest_OpenMPTarget
+
+	TEST_TARGETS += test-openmptarget
+
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+	OBJ_QTHREADS = TestQthreads_Other.o TestQthreads_Reductions.o TestQthreads_Atomics.o TestQthreads_Team.o
+	OBJ_QTHREADS += TestQthreads_SubView_a.o TestQthreads_SubView_b.o
+	OBJ_QTHREADS += TestQthreads_SubView_c01.o TestQthreads_SubView_c02.o TestQthreads_SubView_c03.o
+	OBJ_QTHREADS += TestQthreads_SubView_c04.o TestQthreads_SubView_c05.o TestQthreads_SubView_c06.o
+	OBJ_QTHREADS += TestQthreads_SubView_c07.o TestQthreads_SubView_c08.o TestQthreads_SubView_c09.o
+	OBJ_QTHREADS += TestQthreads_SubView_c10.o TestQthreads_SubView_c11.o TestQthreads_SubView_c12.o
+	OBJ_QTHREADS += TestQthreads_ViewAPI_a.o TestQthreads_ViewAPI_b.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosCore_UnitTest_Qthreads
+
+	OBJ_QTHREADS2 = UnitTestMainInit.o gtest-all.o
+	OBJ_QTHREADS2 += TestQthreads_Complex.o
+	TARGETS += KokkosCore_UnitTest_Qthreads2
+
+	TEST_TARGETS += test-qthreads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+        OBJ_SERIAL = UnitTestMainInit.o gtest-all.o
+        OBJ_SERIAL += TestSerial_Init.o
+        OBJ_SERIAL += TestSerial_SharedAlloc.o
+        OBJ_SERIAL += TestSerial_RangePolicy.o
+        OBJ_SERIAL += TestSerial_ViewAPI_b.o
+        OBJ_SERIAL += TestSerial_ViewMapping_a.o TestSerial_ViewMapping_b.o TestSerial_ViewMapping_subview.o
+        OBJ_SERIAL += TestSerial_ViewOfClass.o
+        OBJ_SERIAL += TestSerial_SubView_a.o TestSerial_SubView_b.o
+        OBJ_SERIAL += TestSerial_SubView_c01.o TestSerial_SubView_c02.o TestSerial_SubView_c03.o
+        OBJ_SERIAL += TestSerial_SubView_c04.o TestSerial_SubView_c05.o TestSerial_SubView_c06.o
+        OBJ_SERIAL += TestSerial_SubView_c07.o TestSerial_SubView_c08.o TestSerial_SubView_c09.o
+        OBJ_SERIAL += TestSerial_SubView_c10.o TestSerial_SubView_c11.o TestSerial_SubView_c12.o
+        OBJ_SERIAL += TestSerial_SubView_c13.o
+        OBJ_SERIAL += TestSerial_Reductions.o TestSerial_Scan.o
+        OBJ_SERIAL += TestSerial_Complex.o
+        OBJ_SERIAL += TestSerial_AtomicOperations.o TestSerial_AtomicViews.o TestSerial_Atomics.o
+        OBJ_SERIAL += TestSerial_Team.o TestSerial_TeamScratch.o
+        OBJ_SERIAL += TestSerial_TeamReductionScan.o
+        OBJ_SERIAL += TestSerial_Other.o
+        #HCC_WORKAROUND
+        ifneq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1)
+        OBJ_SERIAL += TestSerial_MDRange.o
+        endif
+        OBJ_SERIAL += TestSerial_Crs.o
+        OBJ_SERIAL += TestSerial_Task.o TestSerial_WorkGraph.o
+	
+	TARGETS += KokkosCore_UnitTest_Serial
+
+	TEST_TARGETS += test-serial
+endif
+
+OBJ_HWLOC = TestHWLOC.o UnitTestMain.o gtest-all.o
+TARGETS += KokkosCore_UnitTest_HWLOC
+TEST_TARGETS += test-hwloc
+
+OBJ_DEFAULT = UnitTestMainInit.o gtest-all.o
+ifneq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+ifneq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1)
+  OBJ_DEFAULT += TestDefaultDeviceType.o TestDefaultDeviceType_a.o TestDefaultDeviceType_b.o TestDefaultDeviceType_c.o TestDefaultDeviceType_d.o
+endif
+endif
+
+TARGETS += KokkosCore_UnitTest_Default
+TEST_TARGETS += test-default
+
+TARGETS += KokkosCore_UnitTest_PushFinalizeHook
+TEST_TARGETS += test-push-finalize-hook
+
+TARGETS += KokkosCore_UnitTest_PushFinalizeHook_terminate
+TEST_TARGETS += test-push-finalize-hook-terminate
+
+NUM_INITTESTS = 16
+INITTESTS_NUMBERS := $(shell seq 1 ${NUM_INITTESTS})
+INITTESTS_TARGETS := $(addprefix KokkosCore_UnitTest_DefaultDeviceTypeInit_,${INITTESTS_NUMBERS})
+TARGETS += ${INITTESTS_TARGETS}
+INITTESTS_TEST_TARGETS := $(addprefix test-default-init-,${INITTESTS_NUMBERS})
+TEST_TARGETS += ${INITTESTS_TEST_TARGETS}
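+# With NUM_INITTESTS = 16 the two lists above expand to the build targets
+# KokkosCore_UnitTest_DefaultDeviceTypeInit_1 .. _16 and the test targets
+# test-default-init-1 .. test-default-init-16.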
+
+KokkosCore_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_Cuda
+	
+KokkosCore_UnitTest_CudaInterOp: UnitTestMain.o gtest-all.o TestCuda_InterOp.o
+	$(LINK) $(EXTRA_PATH) UnitTestMain.o gtest-all.o TestCuda_InterOp.o $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_CudaInterOp
+	
+KokkosCore_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_ROCm
+
+KokkosCore_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_Threads
+
+KokkosCore_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_OpenMP
+
+KokkosCore_UnitTest_OpenMPInterOp: UnitTestMain.o gtest-all.o TestOpenMP_InterOp.o
+	$(LINK) $(EXTRA_PATH) UnitTestMain.o gtest-all.o TestOpenMP_InterOp.o $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_OpenMPInterOp
+
+KokkosCore_UnitTest_OpenMPTarget: $(OBJ_OPENMPTARGET) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMPTARGET) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_OpenMPTarget
+
+KokkosCore_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_Serial
+
+KokkosCore_UnitTest_Qthreads: $(OBJ_QTHREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_QTHREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_Qthreads
+
+KokkosCore_UnitTest_Qthreads2: $(OBJ_QTHREADS2) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_QTHREADS2) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_Qthreads2
+
+KokkosCore_UnitTest_HWLOC: $(OBJ_HWLOC) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_HWLOC) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_HWLOC
+
+KokkosCore_UnitTest_AllocationTracker: $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LIBS) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(LIB) -o KokkosCore_UnitTest_AllocationTracker
+
+KokkosCore_UnitTest_Default: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_DEFAULT) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_Default
+
+KokkosCore_UnitTest_PushFinalizeHook: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_DEFAULT) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_PushFinalizeHook
+
+KokkosCore_UnitTest_PushFinalizeHook_terminate: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_DEFAULT) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_PushFinalizeHook_terminate
+
+
+${INITTESTS_TARGETS}: KokkosCore_UnitTest_DefaultDeviceTypeInit_%: TestDefaultDeviceTypeInit_%.o UnitTestMain.o gtest-all.o $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) TestDefaultDeviceTypeInit_$*.o UnitTestMain.o gtest-all.o $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_DefaultDeviceTypeInit_$*
+
+test-cuda: KokkosCore_UnitTest_Cuda
+	./KokkosCore_UnitTest_Cuda
+	./KokkosCore_UnitTest_CudaInterOp
+
+test-rocm: KokkosCore_UnitTest_ROCm
+	./KokkosCore_UnitTest_ROCm
+
+test-threads: KokkosCore_UnitTest_Threads
+	./KokkosCore_UnitTest_Threads
+
+test-openmp: KokkosCore_UnitTest_OpenMP
+	./KokkosCore_UnitTest_OpenMP
+	./KokkosCore_UnitTest_OpenMPInterOp
+
+test-openmptarget: KokkosCore_UnitTest_OpenMPTarget
+	./KokkosCore_UnitTest_OpenMPTarget
+
+test-serial: KokkosCore_UnitTest_Serial
+	./KokkosCore_UnitTest_Serial
+
+test-qthreads: KokkosCore_UnitTest_Qthreads KokkosCore_UnitTest_Qthreads2
+	./KokkosCore_UnitTest_Qthreads
+	./KokkosCore_UnitTest_Qthreads2
+
+test-hwloc: KokkosCore_UnitTest_HWLOC
+	./KokkosCore_UnitTest_HWLOC
+
+test-allocationtracker: KokkosCore_UnitTest_AllocationTracker
+	./KokkosCore_UnitTest_AllocationTracker
+
+test-default: KokkosCore_UnitTest_Default
+	./KokkosCore_UnitTest_Default
+
+test-push-finalize-hook: KokkosCore_UnitTest_PushFinalizeHook
+	./KokkosCore_UnitTest_PushFinalizeHook
+
+test-push-finalize-hook-terminate: KokkosCore_UnitTest_PushFinalizeHook_terminate
+	./KokkosCore_UnitTest_PushFinalizeHook_terminate
+
+${INITTESTS_TEST_TARGETS}: test-default-init-%: KokkosCore_UnitTest_DefaultDeviceTypeInit_%
+	./KokkosCore_UnitTest_DefaultDeviceTypeInit_$*
+
+build_all: $(TARGETS)
+
+test: $(TEST_TARGETS)
+
+clean: kokkos-clean
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(TEST_HEADERS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
+gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
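+
+# Illustrative invocation (an assumed example; actual device and architecture
+# settings come from Makefile.kokkos and may differ per machine):
+#   make KOKKOS_DEVICES=OpenMP build_all
+#   make KOKKOS_DEVICES=OpenMP test-openmp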
diff --git a/packages/kokkos/core/unit_test/TestAggregate.hpp b/packages/kokkos/core/unit_test/TestAggregate.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..83813b5b1b47667a0648fc0726bfd5f6edbd0634
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestAggregate.hpp
@@ -0,0 +1,129 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef TEST_AGGREGATE_HPP
+#define TEST_AGGREGATE_HPP
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <impl/Kokkos_ViewArray.hpp>
+
+namespace Test {
+
+template< class DeviceType >
+void TestViewAggregate()
+{
+  typedef Kokkos::Array< double, 32 >  value_type;
+  typedef Kokkos::Impl::ViewDataAnalysis< value_type *, Kokkos::LayoutLeft, value_type > analysis_1d;
+
+  static_assert( std::is_same< typename analysis_1d::specialize, Kokkos::Array<> >::value, "" );
+
+  typedef Kokkos::ViewTraits< value_type **, DeviceType > a32_traits;
+  typedef Kokkos::ViewTraits< typename a32_traits::scalar_array_type, DeviceType > flat_traits;
+
+  static_assert( std::is_same< typename a32_traits::specialize, Kokkos::Array<> >::value, "" );
+  static_assert( std::is_same< typename a32_traits::value_type, value_type >::value, "" );
+  static_assert( a32_traits::rank == 2, "" );
+  static_assert( a32_traits::rank_dynamic == 2, "" );
+
+  static_assert( std::is_same< typename flat_traits::specialize, void >::value, "" );
+  static_assert( flat_traits::rank == 3, "" );
+  static_assert( flat_traits::rank_dynamic == 2, "" );
+  static_assert( flat_traits::dimension::N2 == 32, "" );
+
+  typedef Kokkos::View< Kokkos::Array< double, 32 > **, DeviceType > a32_type;
+  typedef typename a32_type::array_type  a32_flat_type;
+
+  static_assert( std::is_same< typename a32_type::value_type, value_type >::value, "" );
+  static_assert( std::is_same< typename a32_type::pointer_type, double * >::value, "" );
+  static_assert( a32_type::Rank == 2, "" );
+  static_assert( a32_flat_type::Rank == 3, "" );
+
+  a32_type x( "test", 4, 5 );
+  a32_flat_type y( x );
+
+  ASSERT_EQ( x.extent( 0 ), 4 );
+  ASSERT_EQ( x.extent( 1 ), 5 );
+  ASSERT_EQ( y.extent( 0 ), 4 );
+  ASSERT_EQ( y.extent( 1 ), 5 );
+  ASSERT_EQ( y.extent( 2 ), 32 );
+
+  // Initialize arrays from brace-init-list as for std::array.
+  //
+  // Comment: Clang will issue the following warning if we don't use double
+  //          braces here (one for initializing the Kokkos::Array and one for
+  //          initializing the sub-aggregate C-array data member),
+  //
+  //            warning: suggest braces around initialization of subobject
+  //
+  //          but single brace syntax would be valid as well.
+  Kokkos::Array< float, 2 > aggregate_initialization_syntax_1 = { { 1.41, 3.14 } };
+  ASSERT_FLOAT_EQ( aggregate_initialization_syntax_1[0], 1.41 );
+  ASSERT_FLOAT_EQ( aggregate_initialization_syntax_1[1], 3.14 );
+
+  Kokkos::Array< int, 3 > aggregate_initialization_syntax_2{ { 0, 1, 2 } }; // since C++11
+  for ( int i = 0; i < 3; ++i ) {
+    ASSERT_EQ( aggregate_initialization_syntax_2[i], i );
+  }
+
+  // Note that this is a valid initialization.
+  Kokkos::Array< double, 3 > initialized_with_one_argument_missing = { { 255, 255 } };
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_DOUBLE_EQ( initialized_with_one_argument_missing[i], 255 );
+  }
+  // But the following line would not compile
+//  Kokkos::Array< double, 3 > initialized_with_too_many{ { 1, 2, 3, 4 } };
+}
+
+TEST_F( TEST_CATEGORY, view_aggregate )
+{
+  TestViewAggregate< TEST_EXECSPACE >();
+}
+
+} // namespace Test
+
+#endif /* #ifndef TEST_AGGREGATE_HPP */
diff --git a/packages/kokkos/core/unit_test/TestAtomic.hpp b/packages/kokkos/core/unit_test/TestAtomic.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..915e1e8d979dcb3948d3346ec6770837163ea213
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestAtomic.hpp
@@ -0,0 +1,484 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+namespace TestAtomic {
+
+// Struct for testing arbitrary size atomics.
+
+template< int N >
+struct SuperScalar {
+  double val[N];
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar() {
+    for ( int i = 0; i < N; i++ ) {
+      val[i] = 0.0;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar( const SuperScalar & src ) {
+    for ( int i = 0; i < N; i++ ) {
+      val[i] = src.val[i];
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar( const volatile SuperScalar & src ) {
+    for ( int i = 0; i < N; i++ ) {
+      val[i] = src.val[i];
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar& operator=( const SuperScalar & src ) {
+    for ( int i = 0; i < N; i++ ) {
+      val[i] = src.val[i];
+    }
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar& operator=( const volatile SuperScalar & src ) {
+    for ( int i = 0; i < N; i++ ) {
+      val[i] = src.val[i];
+    }
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator=( const SuperScalar & src ) volatile  {
+    for ( int i = 0; i < N; i++ ) {
+      val[i] = src.val[i];
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar operator+( const SuperScalar & src ) {
+    SuperScalar tmp = *this;
+    for ( int i = 0; i < N; i++ ) {
+      tmp.val[i] += src.val[i];
+    }
+    return tmp;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar& operator+=( const double & src ) {
+    for ( int i = 0; i < N; i++ ) {
+      val[i] += 1.0 * ( i + 1 ) * src;
+    }
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar& operator+=( const SuperScalar & src ) {
+    for ( int i = 0; i < N; i++ ) {
+      val[i] += src.val[i];
+    }
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator==( const SuperScalar & src ) {
+    bool compare = true;
+    for( int i = 0; i < N; i++ ) {
+      compare = compare && ( val[i] == src.val[i] );
+    }
+    return compare;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator!=( const SuperScalar & src ) {
+    bool compare = true;
+    for ( int i = 0; i < N; i++ ) {
+      compare = compare && ( val[i] == src.val[i] );
+    }
+    return !compare;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar( const double & src ) {
+    for ( int i = 0; i < N; i++ ) {
+      val[i] = 1.0 * ( i + 1 ) * src;
+    }
+  }
+};
+
+template< int N >
+std::ostream & operator<<( std::ostream & os, const SuperScalar< N > & dt )
+{
+  os << "{ ";
+  for ( int  i = 0; i < N - 1; i++ ) {
+     os << dt.val[i] << ", ";
+  }
+  os << dt.val[N-1] << "}";
+
+  return os;
+}
+
+template< class T, class DEVICE_TYPE >
+struct ZeroFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef typename Kokkos::View< T, execution_space > type;
+  typedef typename Kokkos::View< T, execution_space >::HostMirror h_type;
+
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    data() = 0;
+  }
+};
+
+//---------------------------------------------------
+//--------------atomic_fetch_add---------------------
+//---------------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct AddFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View< T, execution_space > type;
+
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_add( &data(), (T) 1 );
+  }
+};
+
+template< class T, class execution_space >
+T AddLoop( int loop ) {
+  struct ZeroFunctor< T, execution_space > f_zero;
+  typename ZeroFunctor< T, execution_space >::type data( "Data" );
+  typename ZeroFunctor< T, execution_space >::h_type h_data( "HData" );
+
+  f_zero.data = data;
+  Kokkos::parallel_for( 1, f_zero );
+  execution_space::fence();
+
+  struct AddFunctor< T, execution_space > f_add;
+
+  f_add.data = data;
+  Kokkos::parallel_for( loop, f_add );
+  execution_space::fence();
+
+  Kokkos::deep_copy( h_data, data );
+  T val = h_data();
+
+  return val;
+}
+
+template< class T >
+T AddLoopSerial( int loop ) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  for ( int i = 0; i < loop; i++ ) {
+    *data += (T) 1;
+  }
+
+  T val = *data;
+  delete [] data;
+
+  return val;
+}
+
+//------------------------------------------------------
+//--------------atomic_compare_exchange-----------------
+//------------------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct CASFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View< T, execution_space > type;
+
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    T old = data();
+    T newval, assumed;
+
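+    // Classic CAS retry loop: keep retrying until atomic_compare_exchange
+    // reports that it observed the value we assumed, at which point the +1
+    // update has been applied atomically.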
+    do {
+      assumed = old;
+      newval = assumed + (T) 1;
+      old = Kokkos::atomic_compare_exchange( &data(), assumed, newval );
+    } while( old != assumed );
+  }
+};
+
+template< class T, class execution_space >
+T CASLoop( int loop ) {
+  struct ZeroFunctor< T, execution_space > f_zero;
+  typename ZeroFunctor< T, execution_space >::type data( "Data" );
+  typename ZeroFunctor< T, execution_space >::h_type h_data( "HData" );
+
+  f_zero.data = data;
+  Kokkos::parallel_for( 1, f_zero );
+  execution_space::fence();
+
+  struct CASFunctor< T, execution_space > f_cas;
+
+  f_cas.data = data;
+  Kokkos::parallel_for( loop, f_cas );
+  execution_space::fence();
+
+  Kokkos::deep_copy( h_data, data );
+  T val = h_data();
+
+  return val;
+}
+
+template< class T >
+T CASLoopSerial( int loop ) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  for ( int i = 0; i < loop; i++ ) {
+    T assumed;
+    T newval;
+    T old;
+
+    do {
+      assumed = *data;
+      newval = assumed + (T) 1;
+      old = *data;
+      *data = newval;
+    } while( !( assumed == old ) );
+  }
+
+  T val = *data;
+  delete [] data;
+
+  return val;
+}
+
+//----------------------------------------------
+//--------------atomic_exchange-----------------
+//----------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct ExchFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View< T, execution_space > type;
+
+  type data, data2;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i ) const {
+    T old = Kokkos::atomic_exchange( &data(), (T) i );
+    Kokkos::atomic_fetch_add( &data2(), old );
+  }
+};
+
+template< class T, class execution_space >
+T ExchLoop( int loop ) {
+  struct ZeroFunctor< T, execution_space > f_zero;
+  typename ZeroFunctor< T, execution_space >::type data( "Data" );
+  typename ZeroFunctor< T, execution_space >::h_type h_data( "HData" );
+
+  f_zero.data = data;
+  Kokkos::parallel_for( 1, f_zero );
+  execution_space::fence();
+
+  typename ZeroFunctor< T, execution_space >::type data2( "Data" );
+  typename ZeroFunctor< T, execution_space >::h_type h_data2( "HData" );
+
+  f_zero.data = data2;
+  Kokkos::parallel_for( 1, f_zero );
+  execution_space::fence();
+
+  struct ExchFunctor< T, execution_space > f_exch;
+
+  f_exch.data = data;
+  f_exch.data2 = data2;
+  Kokkos::parallel_for( loop, f_exch );
+  execution_space::fence();
+
+  Kokkos::deep_copy( h_data, data );
+  Kokkos::deep_copy( h_data2, data2 );
+  T val = h_data() + h_data2();
+
+  return val;
+}
+
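+// The std::conditional parameter type below acts like enable_if: when T is
+// Kokkos::complex<double>, the first overload's parameter substitutes to void
+// and is removed by SFINAE, so the complex-specific overload that follows is
+// selected instead.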
+template< class T >
+T ExchLoopSerial( typename std::conditional< !std::is_same< T, Kokkos::complex<double> >::value, int, void >::type loop ) {
+  T* data = new T[1];
+  T* data2 = new T[1];
+  data[0] = 0;
+  data2[0] = 0;
+
+  for ( int i = 0; i < loop; i++ ) {
+    T old = *data;
+    *data = (T) i;
+    *data2 += old;
+  }
+
+  T val = *data2 + *data;
+  delete [] data;
+  delete [] data2;
+
+  return val;
+}
+
+template< class T >
+T ExchLoopSerial( typename std::conditional< std::is_same< T, Kokkos::complex<double> >::value, int, void >::type loop ) {
+  T* data = new T[1];
+  T* data2 = new T[1];
+  data[0] = 0;
+  data2[0] = 0;
+
+  for ( int i = 0; i < loop; i++ ) {
+    T old = *data;
+    data->real() = ( static_cast<double>( i ) );
+    data->imag() = 0;
+    *data2 += old;
+  }
+
+  T val = *data2 + *data;
+  delete [] data;
+  delete [] data2;
+
+  return val;
+}
+
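+// Dispatch on the test index: 1 = atomic_fetch_add, 2 = compare-and-swap
+// increment, 3 = atomic_exchange plus atomic_fetch_add accumulation.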
+template< class T, class DeviceType >
+T LoopVariant( int loop, int test ) {
+  switch ( test ) {
+    case 1: return AddLoop< T, DeviceType >( loop );
+    case 2: return CASLoop< T, DeviceType >( loop );
+    case 3: return ExchLoop< T, DeviceType >( loop );
+  }
+
+  return 0;
+}
+
+template< class T >
+T LoopVariantSerial( int loop, int test ) {
+  switch ( test ) {
+    case 1: return AddLoopSerial< T >( loop );
+    case 2: return CASLoopSerial< T >( loop );
+    case 3: return ExchLoopSerial< T >( loop );
+  }
+
+  return 0;
+}
+
+template< class T, class DeviceType >
+bool Loop( int loop, int test )
+{
+  T res       = LoopVariant< T, DeviceType >( loop, test );
+  T resSerial = LoopVariantSerial< T >( loop, test );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = "
+              << test << " ) FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+} // namespace TestAtomic
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, atomics )
+{
+  const int loop_count = 1e4;
+
+  ASSERT_TRUE( ( TestAtomic::Loop< int, TEST_EXECSPACE >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, TEST_EXECSPACE >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, TEST_EXECSPACE >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, TEST_EXECSPACE >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, TEST_EXECSPACE >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, TEST_EXECSPACE >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, TEST_EXECSPACE >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, TEST_EXECSPACE >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, TEST_EXECSPACE >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, TEST_EXECSPACE >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, TEST_EXECSPACE >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, TEST_EXECSPACE >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, TEST_EXECSPACE >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, TEST_EXECSPACE >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, TEST_EXECSPACE >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< double, TEST_EXECSPACE >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, TEST_EXECSPACE >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, TEST_EXECSPACE >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< float, TEST_EXECSPACE >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, TEST_EXECSPACE >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, TEST_EXECSPACE >( 100, 3 ) ) );
+
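+  // complex<double> and SuperScalar<4> have no native atomic support and rely
+  // on the lock-based fallback, which is presumably why they are skipped on
+  // the OpenMPTarget and ROCm backends.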
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
+#ifndef KOKKOS_ENABLE_ROCM
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, TEST_EXECSPACE >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, TEST_EXECSPACE >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, TEST_EXECSPACE >( 100, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, TEST_EXECSPACE >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, TEST_EXECSPACE >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, TEST_EXECSPACE >( 100, 3 ) ) );
+#endif
+#endif
+}
+
+
+} // namespace Test
+
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2b90ac3cd135e32ad1b6477aab16d08faeef4094
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp
@@ -0,0 +1,1140 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+namespace TestAtomicOperations {
+
+//-----------------------------------------------
+//--------------zero_functor---------------------
+//-----------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct ZeroFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef typename Kokkos::View< T, execution_space > type;
+  typedef typename Kokkos::View< T, execution_space >::HostMirror h_type;
+
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    data() = 0;
+  }
+};
+
+//-----------------------------------------------
+//--------------init_functor---------------------
+//-----------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct InitFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef typename Kokkos::View< T, execution_space > type;
+  typedef typename Kokkos::View< T, execution_space >::HostMirror h_type;
+
+  type data;
+  T init_value;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    data() = init_value;
+  }
+
+  InitFunctor( T _init_value ) : init_value( _init_value ) {}
+};
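+
+// Each operation below is checked with the same three-step recipe: *Atomic()
+// initializes a device-resident scalar View and applies the operation once in
+// a single-iteration parallel_for, *AtomicCheck() computes the expected value
+// serially on the host, and *AtomicTest() compares the two results.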
+
+//---------------------------------------------------
+//--------------atomic_fetch_max---------------------
+//---------------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct MaxFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View< T, execution_space > type;
+
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_max( &data(), (T) i1 );
+  }
+  MaxFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
+};
+
+template< class T, class execution_space >
+T MaxAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
+  f_init.data = data;
+  Kokkos::parallel_for( 1, f_init );
+  execution_space::fence();
+
+  struct MaxFunctor< T, execution_space > f( i0, i1 );
+
+  f.data = data;
+  Kokkos::parallel_for( 1, f );
+  execution_space::fence();
+
+  Kokkos::deep_copy( h_data, data );
+  T val = h_data();
+
+  return val;
+}
+
+template< class T >
+T MaxAtomicCheck( T i0, T i1 ) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = ( i0 > i1 ? i0 : i1 );
+
+  T val = *data;
+  delete [] data;
+
+  return val;
+}
+
+template< class T, class DeviceType >
+bool MaxAtomicTest( T i0, T i1 )
+{
+  T res       = MaxAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = MaxAtomicCheck<T>( i0, i1 );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = MaxAtomicTest )"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_min---------------------
+//---------------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct MinFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View< T, execution_space > type;
+
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_min( &data(), (T) i1 );
+  }
+
+  MinFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
+};
+
+template< class T, class execution_space >
+T MinAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
+  f_init.data = data;
+  Kokkos::parallel_for( 1, f_init );
+  execution_space::fence();
+
+  struct MinFunctor< T, execution_space > f( i0, i1 );
+
+  f.data = data;
+  Kokkos::parallel_for( 1, f );
+  execution_space::fence();
+
+  Kokkos::deep_copy( h_data, data );
+  T val = h_data();
+
+  return val;
+}
+
+template< class T >
+T MinAtomicCheck( T i0, T i1 ) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = ( i0 < i1 ? i0 : i1 );
+
+  T val = *data;
+  delete [] data;
+
+  return val;
+}
+
+template< class T, class DeviceType >
+bool MinAtomicTest( T i0, T i1 )
+{
+  T res       = MinAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = MinAtomicCheck< T >( i0, i1 );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = MinAtomicTest )"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//--------------atomic_increment---------------------
+//---------------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct IncFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View< T, execution_space > type;
+
+  type data;
+  T i0;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    Kokkos::atomic_increment( &data() );
+  }
+
+  IncFunctor( T _i0 ) : i0( _i0 ) {}
+};
+
+template< class T, class execution_space >
+T IncAtomic( T i0 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
+  f_init.data = data;
+  Kokkos::parallel_for( 1, f_init );
+  execution_space::fence();
+
+  struct IncFunctor< T, execution_space > f( i0 );
+
+  f.data = data;
+  Kokkos::parallel_for( 1, f );
+  execution_space::fence();
+
+  Kokkos::deep_copy( h_data, data );
+  T val = h_data();
+
+  return val;
+}
+
+template< class T >
+T IncAtomicCheck( T i0 ) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0 + 1;
+
+  T val = *data;
+  delete [] data;
+
+  return val;
+}
+
+template< class T, class DeviceType >
+bool IncAtomicTest( T i0 )
+{
+  T res       = IncAtomic< T, DeviceType >( i0 );
+  T resSerial = IncAtomicCheck< T >( i0 );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = IncAtomicTest )"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//--------------atomic_decrement---------------------
+//---------------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct DecFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View< T, execution_space > type;
+
+  type data;
+  T i0;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    Kokkos::atomic_decrement( &data() );
+  }
+
+  DecFunctor( T _i0 ) : i0( _i0 ) {}
+};
+
+template< class T, class execution_space >
+T DecAtomic( T i0 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
+  f_init.data = data;
+  Kokkos::parallel_for( 1, f_init );
+  execution_space::fence();
+
+  struct DecFunctor< T, execution_space > f( i0 );
+
+  f.data = data;
+  Kokkos::parallel_for( 1, f );
+  execution_space::fence();
+
+  Kokkos::deep_copy( h_data, data );
+  T val = h_data();
+
+  return val;
+}
+
+template< class T >
+T DecAtomicCheck( T i0 ) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0 - 1;
+
+  T val = *data;
+  delete [] data;
+
+  return val;
+}
+
+template< class T, class DeviceType >
+bool DecAtomicTest( T i0 )
+{
+  T res       = DecAtomic< T, DeviceType >( i0 );
+  T resSerial = DecAtomicCheck< T >( i0 );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = DecAtomicTest )"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_mul---------------------
+//---------------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct MulFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View< T, execution_space > type;
+
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_mul( &data(), (T) i1 );
+  }
+
+  MulFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
+};
+
+template< class T, class execution_space >
+T MulAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
+  f_init.data = data;
+  Kokkos::parallel_for( 1, f_init );
+  execution_space::fence();
+
+  struct MulFunctor< T, execution_space > f( i0, i1 );
+
+  f.data = data;
+  Kokkos::parallel_for( 1, f );
+  execution_space::fence();
+
+  Kokkos::deep_copy( h_data, data );
+  T val = h_data();
+
+  return val;
+}
+
+template< class T >
+T MulAtomicCheck( T i0, T i1 ) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0*i1;
+
+  T val = *data;
+  delete [] data;
+
+  return val;
+}
+
+template< class T, class DeviceType >
+bool MulAtomicTest( T i0, T i1 )
+{
+  T res       = MulAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = MulAtomicCheck< T >( i0, i1 );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = MulAtomicTest )"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_div---------------------
+//---------------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct DivFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View< T, execution_space > type;
+
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_div( &data(), (T) i1 );
+  }
+
+  DivFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
+};
+
+template< class T, class execution_space >
+T DivAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
+  f_init.data = data;
+  Kokkos::parallel_for( 1, f_init );
+  execution_space::fence();
+
+  struct DivFunctor< T, execution_space > f( i0, i1 );
+
+  f.data = data;
+  Kokkos::parallel_for( 1, f );
+  execution_space::fence();
+
+  Kokkos::deep_copy( h_data, data );
+  T val = h_data();
+
+  return val;
+}
+
+template< class T >
+T DivAtomicCheck( T i0, T i1 ) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0 / i1;
+
+  T val = *data;
+  delete [] data;
+
+  return val;
+}
+
+template< class T, class DeviceType >
+bool DivAtomicTest( T i0, T i1 )
+{
+  T res       = DivAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = DivAtomicCheck< T >( i0, i1 );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = DivAtomicTest )"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_mod---------------------
+//---------------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct ModFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View< T, execution_space > type;
+
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_mod( &data(), (T) i1 );
+  }
+
+  ModFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
+};
+
+template< class T, class execution_space >
+T ModAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
+  f_init.data = data;
+  Kokkos::parallel_for( 1, f_init );
+  execution_space::fence();
+
+  struct ModFunctor< T, execution_space > f( i0, i1 );
+
+  f.data = data;
+  Kokkos::parallel_for( 1, f );
+  execution_space::fence();
+
+  Kokkos::deep_copy( h_data, data );
+  T val = h_data();
+
+  return val;
+}
+
+template< class T >
+T ModAtomicCheck( T i0, T i1 ) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0 % i1;
+
+  T val = *data;
+  delete [] data;
+
+  return val;
+}
+
+template< class T, class DeviceType >
+bool ModAtomicTest( T i0, T i1 )
+{
+  T res       = ModAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = ModAtomicCheck< T >( i0, i1 );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = ModAtomicTest )"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_and---------------------
+//---------------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct AndFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View< T, execution_space > type;
+
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_and( &data(), (T) i1 );
+  }
+
+  AndFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
+};
+
+template< class T, class execution_space >
+T AndAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
+  f_init.data = data;
+  Kokkos::parallel_for( 1, f_init );
+  execution_space::fence();
+
+  struct AndFunctor< T, execution_space > f( i0, i1 );
+
+  f.data = data;
+  Kokkos::parallel_for( 1, f );
+  execution_space::fence();
+
+  Kokkos::deep_copy( h_data, data );
+  T val = h_data();
+
+  return val;
+}
+
+template< class T >
+T AndAtomicCheck( T i0, T i1 ) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0 & i1;
+
+  T val = *data;
+  delete [] data;
+
+  return val;
+}
+
+template< class T, class DeviceType >
+bool AndAtomicTest( T i0, T i1 )
+{
+  T res       = AndAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = AndAtomicCheck< T >( i0, i1 );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = AndAtomicTest )"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_or----------------------
+//---------------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct OrFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View< T, execution_space > type;
+
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_or( &data(), (T) i1 );
+  }
+
+  OrFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
+};
+
+template< class T, class execution_space >
+T OrAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
+  f_init.data = data;
+  Kokkos::parallel_for( 1, f_init );
+  execution_space::fence();
+
+  struct OrFunctor< T, execution_space > f( i0, i1 );
+
+  f.data = data;
+  Kokkos::parallel_for( 1, f );
+  execution_space::fence();
+
+  Kokkos::deep_copy( h_data, data );
+  T val = h_data();
+
+  return val;
+}
+
+template< class T >
+T OrAtomicCheck( T i0, T i1 ) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0 | i1;
+
+  T val = *data;
+  delete [] data;
+
+  return val;
+}
+
+template< class T, class DeviceType >
+bool OrAtomicTest( T i0, T i1 )
+{
+  T res       = OrAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = OrAtomicCheck< T >( i0, i1 );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = OrAtomicTest )"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_xor---------------------
+//---------------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct XorFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View< T, execution_space > type;
+
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_xor( &data(), (T) i1 );
+  }
+
+  XorFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
+};
+
+template< class T, class execution_space >
+T XorAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
+  f_init.data = data;
+  Kokkos::parallel_for( 1, f_init );
+  execution_space::fence();
+
+  struct XorFunctor< T, execution_space > f( i0, i1 );
+
+  f.data = data;
+  Kokkos::parallel_for( 1, f );
+  execution_space::fence();
+
+  Kokkos::deep_copy( h_data, data );
+  T val = h_data();
+
+  return val;
+}
+
+template< class T >
+T XorAtomicCheck( T i0, T i1 ) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0 ^ i1;
+
+  T val = *data;
+  delete [] data;
+
+  return val;
+}
+
+template< class T, class DeviceType >
+bool XorAtomicTest( T i0, T i1 )
+{
+  T res       = XorAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = XorAtomicCheck< T >( i0, i1 );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = XorAtomicTest )"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_lshift---------------------
+//---------------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct LShiftFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View< T, execution_space > type;
+
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_lshift( &data(), (T) i1 );
+  }
+
+  LShiftFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
+};
+
+template< class T, class execution_space >
+T LShiftAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
+  f_init.data = data;
+  Kokkos::parallel_for( 1, f_init );
+  execution_space::fence();
+
+  struct LShiftFunctor< T, execution_space > f( i0, i1 );
+
+  f.data = data;
+  Kokkos::parallel_for( 1, f );
+  execution_space::fence();
+
+  Kokkos::deep_copy( h_data, data );
+  T val = h_data();
+
+  return val;
+}
+
+template< class T >
+T LShiftAtomicCheck( T i0, T i1 ) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0 << i1;
+
+  T val = *data;
+  delete [] data;
+
+  return val;
+}
+
+template< class T, class DeviceType >
+bool LShiftAtomicTest( T i0, T i1 )
+{
+  T res       = LShiftAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = LShiftAtomicCheck< T >( i0, i1 );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = LShiftAtomicTest )"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_rshift---------------------
+//---------------------------------------------------
+
+template< class T, class DEVICE_TYPE >
+struct RShiftFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View< T, execution_space > type;
+
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_rshift( &data(), (T) i1 );
+  }
+
+  RShiftFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
+};
+
+template< class T, class execution_space >
+T RShiftAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
+  f_init.data = data;
+  Kokkos::parallel_for( 1, f_init );
+  execution_space::fence();
+
+  struct RShiftFunctor< T, execution_space > f( i0, i1 );
+
+  f.data = data;
+  Kokkos::parallel_for( 1, f );
+  execution_space::fence();
+
+  Kokkos::deep_copy( h_data, data );
+  T val = h_data();
+
+  return val;
+}
+
+template< class T >
+T RShiftAtomicCheck( T i0, T i1 ) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0 >> i1;
+
+  T val = *data;
+  delete [] data;
+
+  return val;
+}
+
+template< class T, class DeviceType >
+bool RShiftAtomicTest( T i0, T i1 )
+{
+  T res       = RShiftAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = RShiftAtomicCheck< T >( i0, i1 );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = RShiftAtomicTest )"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//--------------atomic_test_control------------------
+//---------------------------------------------------
+
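+// test index -> operation: 1 max, 2 min, 3 mul, 4 div, 5 mod, 6 and, 7 or,
+// 8 xor, 9 lshift, 10 rshift, 11 increment, 12 decrement.  The non-integral
+// variant exercises only max, min, mul, and div; the TEST_F driver below runs
+// indices 1-9, 11, and 12.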
+template< class T, class DeviceType >
+bool AtomicOperationsTestIntegralType( int i0, int i1, int test )
+{
+  switch ( test ) {
+    case 1: return MaxAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 2: return MinAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 3: return MulAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 4: return DivAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 5: return ModAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 6: return AndAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 7: return OrAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 8: return XorAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 9: return LShiftAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 10: return RShiftAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 11: return IncAtomicTest< T, DeviceType >( (T) i0 );
+    case 12: return DecAtomicTest< T, DeviceType >( (T) i0 );
+  }
+
+  return false;
+}
+
+template< class T, class DeviceType >
+bool AtomicOperationsTestNonIntegralType( int i0, int i1, int test )
+{
+  switch ( test ) {
+    case 1: return MaxAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 2: return MinAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 3: return MulAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 4: return DivAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+  }
+
+  return false;
+}
+
+}
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY , atomic_operations )
+{
+  const int start = 1; // Avoid zero for division.
+  const int end = 11;
+  for ( int i = start; i < end; ++i )
+  {
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, TEST_EXECSPACE >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, TEST_EXECSPACE >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, TEST_EXECSPACE >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, TEST_EXECSPACE >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, TEST_EXECSPACE >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, TEST_EXECSPACE >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, TEST_EXECSPACE >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, TEST_EXECSPACE >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, TEST_EXECSPACE >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, TEST_EXECSPACE >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, TEST_EXECSPACE >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, TEST_EXECSPACE >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, TEST_EXECSPACE >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, TEST_EXECSPACE >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, TEST_EXECSPACE >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, TEST_EXECSPACE >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, TEST_EXECSPACE >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, TEST_EXECSPACE >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, TEST_EXECSPACE >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, TEST_EXECSPACE >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, TEST_EXECSPACE >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, TEST_EXECSPACE >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, TEST_EXECSPACE >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, TEST_EXECSPACE >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, TEST_EXECSPACE >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, TEST_EXECSPACE >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, TEST_EXECSPACE >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, TEST_EXECSPACE >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, TEST_EXECSPACE >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, TEST_EXECSPACE >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, TEST_EXECSPACE >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, TEST_EXECSPACE >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, TEST_EXECSPACE >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, TEST_EXECSPACE >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, TEST_EXECSPACE >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, TEST_EXECSPACE >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, TEST_EXECSPACE >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, TEST_EXECSPACE >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, TEST_EXECSPACE >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, TEST_EXECSPACE >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, TEST_EXECSPACE >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, TEST_EXECSPACE >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, TEST_EXECSPACE >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, TEST_EXECSPACE >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, TEST_EXECSPACE >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, TEST_EXECSPACE >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, TEST_EXECSPACE >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, TEST_EXECSPACE >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, TEST_EXECSPACE >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, TEST_EXECSPACE >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, TEST_EXECSPACE >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, TEST_EXECSPACE >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, TEST_EXECSPACE >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, TEST_EXECSPACE >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, TEST_EXECSPACE >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, TEST_EXECSPACE >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, TEST_EXECSPACE >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, TEST_EXECSPACE >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, TEST_EXECSPACE >( start, end - i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, TEST_EXECSPACE >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, TEST_EXECSPACE >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, TEST_EXECSPACE >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, TEST_EXECSPACE >( start, end - i, 4 ) ) );
+  }
+}
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicViews.hpp b/packages/kokkos/core/unit_test/TestAtomicViews.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..38b49730d41164811f1b9ece85ee6e573f597fa8
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestAtomicViews.hpp
@@ -0,0 +1,1480 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+namespace TestAtomicViews {
+
+//-------------------------------------------------
+//-----------atomic view api tests-----------------
+//-------------------------------------------------
+
+template< class T, class ... P >
+size_t allocation_count( const Kokkos::View< T, P... > & view )
+{
+  const size_t card  = view.size();
+  const size_t alloc = view.span();
+
+  const int memory_span = Kokkos::View< int* >::required_allocation_size( 100 );
+
+  return ( card <= alloc && memory_span == 400 ) ? alloc : 0;
+}
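+
+// allocation_count() returns the view's span, guarded by a sanity check that
+// View<int*>::required_allocation_size( 100 ) reports 400 bytes (i.e.
+// sizeof(int) == 4) and that size() does not exceed span(); otherwise it
+// returns 0.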
+
+template< class DataType,
+          class DeviceType,
+          unsigned Rank = Kokkos::ViewTraits< DataType >::rank >
+struct TestViewOperator_LeftAndRight;
+
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 1 >
+{
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
+
+  typedef int value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update,
+                    const volatile value_type & input )
+    { update |= input; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0; }
+
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > left_view;
+
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > right_view;
+
+  typedef Kokkos::View< DataType, Kokkos::LayoutStride, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > stride_view;
+
+  left_view    left;
+  right_view   right;
+  stride_view  left_stride;
+  stride_view  right_stride;
+  long         left_alloc;
+  long         right_alloc;
+
+  TestViewOperator_LeftAndRight()
+    : left(  "left" )
+    , right( "right" )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit()
+  {
+    TestViewOperator_LeftAndRight driver;
+
+    int error_flag = 0;
+
+    Kokkos::parallel_reduce( 1, driver, error_flag );
+
+    ASSERT_EQ( error_flag, 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type, value_type & update ) const
+  {
+    for ( unsigned i0 = 0; i0 < unsigned( left.extent(0) ); ++i0 )
+    {
+      // Below checks that values match; the corresponding address checks
+      // (see the commented-out block) do not appear to be possible for atomic
+      // views, since operator() returns a proxy rather than a plain reference.
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+        if ( left( i0 )  != left( i0, 0, 0, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+        if ( right( i0 ) != right( i0, 0, 0, 0, 0, 0, 0, 0 ) ) { update |= 3; }
+#else
+        if ( left( i0 )  != left.access( i0, 0, 0, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+        if ( right( i0 ) != right.access( i0, 0, 0, 0, 0, 0, 0, 0 ) ) { update |= 3; }
+#endif
+      if ( left( i0 )  != left_stride( i0 ) ) { update |= 4; }
+      if ( right( i0 ) != right_stride( i0 ) ) { update |= 8; }
+/*
+      if ( &left( i0 )  != &left( i0, 0, 0, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+      if ( &right( i0 ) != &right( i0, 0, 0, 0, 0, 0, 0, 0 ) ) { update |= 3; }
+      if ( &left( i0 )  != &left_stride( i0 ) ) { update |= 4; }
+      if ( &right( i0 ) != &right_stride( i0 ) ) { update |= 8; }
+*/
+    }
+  }
+};
+
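+// TestAtomicViewAPI checks that views carrying the Kokkos::Atomic memory trait
+// behave like ordinary views with respect to construction, assignment,
+// reference counting, unmanaged aliasing, const conversion, and extents.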
+template< typename T, class DeviceType >
+class TestAtomicViewAPI
+{
+public:
+  typedef DeviceType device;
+
+  enum { N0 = 1000,
+         N1 = 3,
+         N2 = 5,
+         N3 = 7 };
+
+  typedef Kokkos::View< T, device > dView0;
+  typedef Kokkos::View< T*, device > dView1;
+  typedef Kokkos::View< T*[N1], device > dView2;
+  typedef Kokkos::View< T*[N1][N2], device > dView3;
+  typedef Kokkos::View< T*[N1][N2][N3], device > dView4;
+  typedef Kokkos::View< const T*[N1][N2][N3], device > const_dView4;
+  typedef Kokkos::View< T****, device, Kokkos::MemoryUnmanaged > dView4_unmanaged;
+  typedef typename dView0::host_mirror_space host;
+
+  typedef Kokkos::View< T, device, Kokkos::MemoryTraits< Kokkos::Atomic > > aView0;
+  typedef Kokkos::View< T*, device, Kokkos::MemoryTraits< Kokkos::Atomic > > aView1;
+  typedef Kokkos::View< T*[N1], device, Kokkos::MemoryTraits< Kokkos::Atomic > > aView2;
+  typedef Kokkos::View< T*[N1][N2], device, Kokkos::MemoryTraits< Kokkos::Atomic > > aView3;
+  typedef Kokkos::View< T*[N1][N2][N3], device, Kokkos::MemoryTraits< Kokkos::Atomic > > aView4;
+  typedef Kokkos::View< const T*[N1][N2][N3], device, Kokkos::MemoryTraits< Kokkos::Atomic > > const_aView4;
+
+  typedef Kokkos::View< T****, device, Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::Atomic > > aView4_unmanaged;
+
+  typedef typename aView0::host_mirror_space host_atomic;
+
+  TestAtomicViewAPI()
+  {
+    TestViewOperator_LeftAndRight< int[2], device >::testit();
+    run_test_rank0();
+    run_test_rank4();
+    run_test_const();
+  }
+
+  static void run_test_rank0()
+  {
+    dView0 dx, dy;
+    aView0 ax, ay, az;
+
+    dx = dView0( "dx" );
+    dy = dView0( "dy" );
+    ASSERT_EQ( dx.use_count(), size_t( 1 ) );
+    ASSERT_EQ( dy.use_count(), size_t( 1 ) );
+
+    ax = dx;
+    ay = dy;
+    ASSERT_EQ( dx.use_count(), size_t( 2 ) );
+    ASSERT_EQ( dy.use_count(), size_t( 2 ) );
+    ASSERT_EQ( dx.use_count(), ax.use_count() );
+
+    az = ax;
+    ASSERT_EQ( dx.use_count(), size_t( 3 ) );
+    ASSERT_EQ( ax.use_count(), size_t( 3 ) );
+    ASSERT_EQ( az.use_count(), size_t( 3 ) );
+    ASSERT_EQ( az.use_count(), ax.use_count() );
+  }
+
+  static void run_test_rank4()
+  {
+    dView4 dx, dy;
+    aView4 ax, ay, az;
+
+    dx = dView4( "dx", N0 );
+    dy = dView4( "dy", N0 );
+    ASSERT_EQ( dx.use_count(), size_t( 1 ) );
+    ASSERT_EQ( dy.use_count(), size_t( 1 ) );
+
+    ax = dx;
+    ay = dy;
+    ASSERT_EQ( dx.use_count(), size_t( 2 ) );
+    ASSERT_EQ( dy.use_count(), size_t( 2 ) );
+    ASSERT_EQ( dx.use_count(), ax.use_count() );
+
+    dView4_unmanaged unmanaged_dx = dx;
+    ASSERT_EQ( dx.use_count(), size_t( 2 ) );
+
+    az = ax;
+    ASSERT_EQ( dx.use_count(), size_t( 3 ) );
+    ASSERT_EQ( ax.use_count(), size_t( 3 ) );
+    ASSERT_EQ( az.use_count(), size_t( 3 ) );
+    ASSERT_EQ( az.use_count(), ax.use_count() );
+
+    aView4_unmanaged unmanaged_ax = ax;
+    ASSERT_EQ( ax.use_count(), size_t( 3 ) );
+
+    aView4_unmanaged unmanaged_ax_from_ptr_dx =
+      aView4_unmanaged( dx.data(), dx.extent(0), dx.extent(1), dx.extent(2), dx.extent(3) );
+    ASSERT_EQ( ax.use_count(), size_t( 3 ) );
+
+    const_aView4 const_ax = ax;
+    ASSERT_EQ( ax.use_count(), size_t( 4 ) );
+    ASSERT_EQ( const_ax.use_count(), ax.use_count() );
+
+    ASSERT_FALSE( ax.data() == 0 );
+    ASSERT_FALSE( const_ax.data() == 0 ); // referenceable ptr
+    ASSERT_FALSE( unmanaged_ax.data() == 0 );
+    ASSERT_FALSE( unmanaged_ax_from_ptr_dx.data() == 0 );
+    ASSERT_FALSE( ay.data() == 0 );
+//    ASSERT_NE( ax, ay );
+//    Above test results in following runtime error from gtest:
+//    Expected: (ax) != (ay), actual: 32-byte object <30-01 D0-A0 D8-7F 00-00 00-31 44-0C 01-00 00-00 E8-03 00-00 00-00 00-00 69-00 00-00 00-00 00-00> vs 32-byte object <80-01 D0-A0 D8-7F 00-00 00-A1 4A-0C 01-00 00-00 E8-03 00-00 00-00 00-00 69-00 00-00 00-00 00-00>
+
+    ASSERT_EQ( ax.extent(0), unsigned( N0 ) );
+    ASSERT_EQ( ax.extent(1), unsigned( N1 ) );
+    ASSERT_EQ( ax.extent(2), unsigned( N2 ) );
+    ASSERT_EQ( ax.extent(3), unsigned( N3 ) );
+
+    ASSERT_EQ( ay.extent(0), unsigned( N0 ) );
+    ASSERT_EQ( ay.extent(1), unsigned( N1 ) );
+    ASSERT_EQ( ay.extent(2), unsigned( N2 ) );
+    ASSERT_EQ( ay.extent(3), unsigned( N3 ) );
+
+    ASSERT_EQ( unmanaged_ax_from_ptr_dx.span(), unsigned( N0 ) * unsigned( N1 ) * unsigned( N2 ) * unsigned( N3 ) );
+  }
+
+  typedef T DataType[2];
+
+  static void
+  check_auto_conversion_to_const(
+     const Kokkos::View< const DataType, device, Kokkos::MemoryTraits<Kokkos::Atomic> > & arg_const,
+     const Kokkos::View< const DataType, device, Kokkos::MemoryTraits<Kokkos::Atomic> > & arg )
+  {
+    ASSERT_TRUE( arg_const == arg );
+  }
+
+  static void run_test_const()
+  {
+    typedef Kokkos::View< DataType, device, Kokkos::MemoryTraits<Kokkos::Atomic> > typeX;
+    typedef Kokkos::View< const DataType, device, Kokkos::MemoryTraits<Kokkos::Atomic> > const_typeX;
+
+    typeX x( "X" );
+    const_typeX xc = x;
+
+    //ASSERT_TRUE( xc == x ); // const xc is referenceable, non-const x is not
+    //ASSERT_TRUE( x == xc );
+
+    check_auto_conversion_to_const( x, xc );
+  }
+};
+
+//---------------------------------------------------
+//-----------initialization functors-----------------
+//---------------------------------------------------
+
+template<class T, class execution_space >
+struct InitFunctor_Seq {
+  typedef Kokkos::View< T*, execution_space > view_type;
+
+  view_type input;
+  const long length;
+
+  InitFunctor_Seq( view_type & input_, const long length_ )
+    : input( input_ )
+    , length( length_ )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const long i ) const {
+    if ( i < length ) {
+      input( i ) = (T) i;
+    }
+  }
+};
+
+template<class T, class execution_space >
+struct InitFunctor_ModTimes {
+  typedef Kokkos::View< T*, execution_space > view_type;
+
+  view_type input;
+  const long length;
+  const long remainder;
+
+  InitFunctor_ModTimes( view_type & input_, const long length_, const long remainder_ )
+    : input( input_ )
+    , length( length_ )
+    , remainder( remainder_ )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const long i ) const {
+    if ( i < length ) {
+      if ( i % ( remainder + 1 ) == remainder ) {
+        input( i ) = (T) 2;
+      }
+      else {
+        input( i ) = (T) 1;
+      }
+    }
+  }
+};
+
+template<class T, class execution_space >
+struct InitFunctor_ModShift {
+  typedef Kokkos::View< T*, execution_space > view_type;
+
+  view_type input;
+  const long length;
+  const long remainder;
+
+  InitFunctor_ModShift( view_type & input_, const long length_, const long remainder_ )
+    : input( input_ )
+    , length( length_ )
+    , remainder( remainder_ )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const long i ) const {
+    if ( i < length ) {
+      if ( i % ( remainder + 1 ) == remainder ) {
+        input( i ) = 1;
+      }
+    }
+  }
+};
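+
+// The operator tests below follow a common pattern: a plain View is filled by
+// one of the init functors above, a functor then updates a small result View
+// through a MemoryTraits<Kokkos::Atomic> alias of it, and the outcome is
+// compared against a closed-form serial check.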
+
+//---------------------------------------------------
+//-----------atomic view plus-equal------------------
+//---------------------------------------------------
+
+template<class T, class execution_space >
+struct PlusEqualAtomicViewFunctor {
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+
+  view_type input;
+  atomic_view_type even_odd_result;
+  const long length;
+
+  // Wrap the result view in an atomic view, use this for operator
+  PlusEqualAtomicViewFunctor( const view_type & input_, view_type & even_odd_result_, const long length_ )
+    : input( input_ )
+    , even_odd_result( even_odd_result_ )
+    , length( length_ )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const long i ) const {
+    if ( i < length ) {
+      if ( i % 2 == 0 ) {
+        even_odd_result( 0 ) += input( i );
+      }
+      else {
+        even_odd_result( 1 ) += input( i );
+      }
+    }
+  }
+};
+
+template< class T, class execution_space >
+T PlusEqualAtomicView( const long input_length ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
+
+  const long length = input_length;
+
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 2 );
+
+  InitFunctor_Seq< T, execution_space > init_f( input, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
+
+  PlusEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>( 0, length ), functor );
+  Kokkos::fence();
+
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
+
+  return (T) ( h_result_view( 0 ) + h_result_view( 1 ) );
+}
+
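+// Serial reference: input(i) = i, so the even-index and odd-index contributions
+// are arithmetic series whose total must match the device-side atomic sums.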
+template< class T >
+T PlusEqualAtomicViewCheck( const long input_length ) {
+  const long N = input_length;
+  T result[2];
+
+  if ( N % 2 == 0 ) {
+    const long half_sum_end = ( N / 2 ) - 1;
+    const long full_sum_end = N - 1;
+    result[0] = half_sum_end * ( half_sum_end + 1 ) / 2; // Even sum.
+    result[1] = ( full_sum_end * ( full_sum_end + 1 ) / 2 ) - result[0]; // Odd sum.
+  }
+  else {
+    const long half_sum_end = (long) ( N / 2 );
+    const long full_sum_end = N - 2;
+    result[0] = half_sum_end * ( half_sum_end - 1 ) / 2; // Even sum.
+    result[1] = ( full_sum_end * ( full_sum_end - 1 ) / 2 ) - result[0]; // Odd sum.
+  }
+
+  return (T) ( result[0] + result[1] );
+}
+
+template< class T, class DeviceType >
+bool PlusEqualAtomicViewTest( long input_length )
+{
+  T res       = PlusEqualAtomicView< T, DeviceType >( input_length );
+  T resSerial = PlusEqualAtomicViewCheck< T >( input_length );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = PlusEqualAtomicViewTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//-----------atomic view minus-equal-----------------
+//---------------------------------------------------
+
+template<class T, class execution_space >
+struct MinusEqualAtomicViewFunctor {
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+
+  view_type input;
+  atomic_view_type even_odd_result;
+  const long length;
+
+  // Wrap the result view in an atomic view, use this for operator.
+  MinusEqualAtomicViewFunctor( const view_type & input_, view_type & even_odd_result_, const long length_ )
+    : input( input_ )
+    , even_odd_result( even_odd_result_ )
+    , length( length_ )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const long i ) const {
+    if ( i < length ) {
+      if ( i % 2 == 0 ) {
+        even_odd_result( 0 ) -= input( i );
+      }
+      else {
+        even_odd_result( 1 ) -= input( i );
+      }
+    }
+  }
+};
+
+template< class T, class execution_space >
+T MinusEqualAtomicView( const long input_length ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
+
+  const long length = input_length;
+
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 2 );
+
+  InitFunctor_Seq< T, execution_space > init_f( input, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
+
+  MinusEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
+  Kokkos::fence();
+
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
+
+  return (T) ( h_result_view( 0 ) + h_result_view( 1 ) );
+}
+
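+// Serial reference for the subtraction test: the same even/odd index sums as
+// above, accumulated with a negative sign.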
+template< class T >
+T MinusEqualAtomicViewCheck( const long input_length ) {
+  const long N = input_length;
+  T result[2];
+
+  if ( N % 2 == 0 ) {
+    const long half_sum_end = ( N / 2 ) - 1;
+    const long full_sum_end = N - 1;
+    result[0] = -1 * ( half_sum_end * ( half_sum_end + 1 ) / 2 ); // Even sum.
+    result[1] = -1 * ( ( full_sum_end * ( full_sum_end + 1 ) / 2 ) + result[0] ); // Odd sum.
+  }
+  else {
+    const long half_sum_end = (long) ( N / 2 );
+    const long full_sum_end = N - 2;
+    result[0] = -1 * ( half_sum_end * ( half_sum_end - 1 ) / 2 ); // Even sum.
+    result[1] = -1 * ( ( full_sum_end * ( full_sum_end - 1 ) / 2 ) + result[0] ); // Odd sum.
+  }
+
+  return ( result[0] + result[1] );
+}
+
+template< class T, class DeviceType >
+bool MinusEqualAtomicViewTest( long input_length )
+{
+  T res       = MinusEqualAtomicView< T, DeviceType >( input_length );
+  T resSerial = MinusEqualAtomicViewCheck< T >( input_length );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = MinusEqualAtomicViewTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//-----------atomic view times-equal-----------------
+//---------------------------------------------------
+
+template<class T, class execution_space >
+struct TimesEqualAtomicViewFunctor {
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+
+  view_type input;
+  atomic_view_type result;
+  const long length;
+
+  // Wrap the result view in an atomic view, use this for operator
+  TimesEqualAtomicViewFunctor( const view_type & input_, view_type & result_, const long length_ )
+    : input( input_ )
+    , result( result_ )
+    , length( length_ )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const long i ) const {
+    if ( i < length && i > 0 ) {
+      result( 0 ) *= (double) input( i );
+    }
+  }
+};
+
+template< class T, class execution_space >
+T TimesEqualAtomicView( const long input_length, const long remainder ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
+
+  const long length = input_length;
+
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 1 );
+  Kokkos::deep_copy( result_view, 1.0 );
+
+  InitFunctor_ModTimes< T, execution_space > init_f( input, length, remainder );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
+
+  TimesEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
+  Kokkos::fence();
+
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
+
+  return (T) ( h_result_view( 0 ) );
+}
+
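+// Serial reference: InitFunctor_ModTimes sets every (remainder+1)-th element to 2
+// and the rest to 1, so the running product reduces to a power of two.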
+template< class T >
+T TimesEqualAtomicViewCheck( const long input_length, const long remainder ) {
+  // Analytical result.
+  const long N = input_length;
+  T result = 1.0;
+
+  for ( long i = 2; i < N; ++i ) {
+    if ( i % ( remainder + 1 ) == remainder ) {
+      result *= 2.0;
+    }
+    else {
+      result *= 1.0;
+    }
+  }
+
+  return (T) result;
+}
+
+template< class T, class DeviceType>
+bool TimesEqualAtomicViewTest( const long input_length )
+{
+  const long remainder = 23;
+  T res       = TimesEqualAtomicView< T, DeviceType >( input_length, remainder );
+  T resSerial = TimesEqualAtomicViewCheck< T >( input_length, remainder );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = TimesEqualAtomicViewTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//------------atomic view div-equal------------------
+//---------------------------------------------------
+
+template<class T, class execution_space >
+struct DivEqualAtomicViewFunctor {
+  typedef Kokkos::View< T, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T, execution_space > scalar_view_type;
+
+  view_type input;
+  atomic_view_type result;
+  const long length;
+
+  // Wrap the result view in an atomic view, use this for operator.
+  DivEqualAtomicViewFunctor( const view_type & input_, scalar_view_type & result_, const long length_ )
+    : input( input_ )
+    , result( result_ )
+    , length( length_ )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const long i ) const {
+    if ( i < length && i > 0 ) {
+      result() /= (double) ( input( i ) );
+    }
+  }
+};
+
+template< class T, class execution_space >
+T DivEqualAtomicView( const long input_length, const long remainder ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T, execution_space > scalar_view_type;
+  typedef typename scalar_view_type::HostMirror host_scalar_view_type;
+
+  const long length = input_length;
+
+  view_type input( "input_view", length );
+  scalar_view_type result_view( "result_view" );
+  Kokkos::deep_copy( result_view, 12121212121 );
+
+  InitFunctor_ModTimes< T, execution_space > init_f( input, length, remainder );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
+
+  DivEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
+  Kokkos::fence();
+
+  host_scalar_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
+
+  return (T) ( h_result_view() );
+}
+
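+// Serial reference: mirrors the device functor by dividing by input(i), which is
+// 2 at every (remainder+1)-th index and 1 elsewhere.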
+template< class T >
+T DivEqualAtomicViewCheck( const long input_length, const long remainder ) {
+  const long N = input_length;
+  T result = 12121212121.0;
+  for ( long i = 2; i < N; ++i ) {
+    if ( i % ( remainder + 1 ) == remainder ) {
+      result /= 2.0;
+    }
+    else {
+      result /= 1.0;
+    }
+  }
+
+  return (T) result;
+}
+
+template< class T, class DeviceType >
+bool DivEqualAtomicViewTest( const long input_length )
+{
+  const long remainder = 23;
+
+  T res       = DivEqualAtomicView< T, DeviceType >( input_length, remainder );
+  T resSerial = DivEqualAtomicViewCheck< T >( input_length, remainder );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = DivEqualAtomicViewTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//------------atomic view mod-equal------------------
+//---------------------------------------------------
+
+template< class T, class execution_space >
+struct ModEqualAtomicViewFunctor {
+  typedef Kokkos::View< T, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T, execution_space > scalar_view_type;
+
+  view_type input;
+  atomic_view_type result;
+  const long length;
+
+  // Wrap the result view in an atomic view, use this for operator.
+  ModEqualAtomicViewFunctor( const view_type & input_, scalar_view_type & result_, const long length_ )
+    : input( input_ )
+    , result( result_ )
+    , length( length_ )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const long i ) const {
+    if ( i < length && i > 0 ) {
+      result() %= (double) ( input( i ) );
+    }
+  }
+};
+
+template< class T, class execution_space >
+T ModEqualAtomicView( const long input_length, const long remainder ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T, execution_space > scalar_view_type;
+  typedef typename scalar_view_type::HostMirror host_scalar_view_type;
+
+  const long length = input_length;
+
+  view_type input( "input_view", length );
+  scalar_view_type result_view( "result_view" );
+  Kokkos::deep_copy( result_view, 12121212121 );
+
+  InitFunctor_ModTimes< T, execution_space > init_f( input, length, remainder );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
+
+  ModEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
+  Kokkos::fence();
+
+  host_scalar_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
+
+  return (T) ( h_result_view() );
+}
+
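+// Serial reference: mirrors the device functor by applying result %= input(i),
+// which is 2 at every (remainder+1)-th index and 1 elsewhere.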
+template< class T >
+T ModEqualAtomicViewCheck( const long input_length, const long remainder ) {
+  const long N = input_length;
+  T result = 12121212121;
+  for ( long i = 2; i < N; ++i ) {
+    if ( i % ( remainder + 1 ) == remainder ) {
+      result %= 2;
+    }
+    else {
+      result %= 1;
+    }
+  }
+
+  return (T) result;
+}
+
+template< class T, class DeviceType >
+bool ModEqualAtomicViewTest( const long input_length )
+{
+  static_assert( std::is_integral< T >::value, "ModEqualAtomicView Error: Type must be integral type for this unit test" );
+
+  const long remainder = 23;
+
+  T res       = ModEqualAtomicView< T, DeviceType >( input_length, remainder );
+  T resSerial = ModEqualAtomicViewCheck< T >( input_length, remainder );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = ModEqualAtomicViewTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//------------atomic view rs-equal------------------
+//---------------------------------------------------
+
+template< class T, class execution_space >
+struct RSEqualAtomicViewFunctor {
+  typedef Kokkos::View< T****, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T****, execution_space > result_view_type;
+
+  const view_type input;
+  atomic_view_type result;
+  const long length;
+  const long value;
+
+  // Wrap the result view in an atomic view, use this for operator.
+  RSEqualAtomicViewFunctor( const view_type & input_, result_view_type & result_, const long & length_, const long & value_ )
+    : input( input_ )
+    , result( result_ )
+    , length( length_ )
+    , value( value_ )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const long i ) const {
+    if ( i < length ) {
+      if ( i % 4 == 0 ) {
+        result( 1, 0, 0, 0 ) >>= input( i );
+      }
+      else if ( i % 4 == 1 ) {
+        result( 0, 1, 0, 0 ) >>= input( i );
+      }
+      else if ( i % 4 == 2 ) {
+        result( 0, 0, 1, 0 ) >>= input( i );
+      }
+      else if ( i % 4 == 3 ) {
+        result( 0, 0, 0, 1 ) >>= input( i );
+      }
+    }
+  }
+};
+
+template< class T, class execution_space >
+T RSEqualAtomicView( const long input_length, const long value, const long remainder ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T****, execution_space > result_view_type;
+  typedef typename result_view_type::HostMirror host_scalar_view_type;
+
+  const long length = input_length;
+
+  view_type input( "input_view", length );
+  result_view_type result_view( "result_view", 2, 2, 2, 2 );
+  host_scalar_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  h_result_view( 1, 0, 0, 0 ) = value;
+  h_result_view( 0, 1, 0, 0 ) = value;
+  h_result_view( 0, 0, 1, 0 ) = value;
+  h_result_view( 0, 0, 0, 1 ) = value;
+  Kokkos::deep_copy( result_view, h_result_view );
+
+  InitFunctor_ModShift< T, execution_space > init_f( input, length, remainder );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
+
+  RSEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length, value );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
+  Kokkos::fence();
+
+  Kokkos::deep_copy( h_result_view, result_view );
+
+  return (T) ( h_result_view( 1, 0, 0, 0 ) );
+}
+
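+// Serial reference: InitFunctor_ModShift sets every (remainder+1)-th element to 1
+// and the rest to 0, so only those indices actually shift the stored value.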
+template< class T >
+T RSEqualAtomicViewCheck( const long input_length, const long value, const long remainder ) {
+  T result[4];
+  result[0] = value;
+  result[1] = value;
+  result[2] = value;
+  result[3] = value;
+
+  T * input = new T[input_length];
+  for ( long i = 0; i < input_length; ++i ) {
+    if ( i % ( remainder + 1 ) == remainder ) {
+      input[i] = 1;
+    }
+    else {
+      input[i] = 0;
+    }
+  }
+
+  for ( long i = 0; i < input_length; ++i ) {
+    if ( i % 4 == 0 ) {
+      result[0] >>= input[i];
+    }
+    else if ( i % 4 == 1 ) {
+      result[1] >>= input[i];
+    }
+    else if ( i % 4 == 2 ) {
+      result[2] >>= input[i];
+    }
+    else if ( i % 4 == 3 ) {
+      result[3] >>= input[i];
+    }
+  }
+
+  delete [] input;
+
+  return (T) result[0];
+}
+
+template< class T, class DeviceType >
+bool RSEqualAtomicViewTest( const long input_length )
+{
+  static_assert( std::is_integral< T >::value, "RSEqualAtomicViewTest: Must be integral type for test" );
+
+  const long remainder = 61042; //prime - 1
+  const long value = 1073741825; //  2^30+1
+  T res       = RSEqualAtomicView< T, DeviceType >( input_length, value, remainder );
+  T resSerial = RSEqualAtomicViewCheck< T >( input_length, value, remainder );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = RSEqualAtomicViewTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//------------atomic view ls-equal------------------
+//---------------------------------------------------
+
+template<class T, class execution_space >
+struct LSEqualAtomicViewFunctor {
+  typedef Kokkos::View< T****, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T****, execution_space > result_view_type;
+
+  view_type input;
+  atomic_view_type result;
+  const long length;
+  const long value;
+
+  // Wrap the result view in an atomic view, use this for operator.
+  LSEqualAtomicViewFunctor( const view_type & input_, result_view_type & result_, const long & length_, const long & value_ )
+    : input( input_ )
+    , result( result_ )
+    , length( length_ )
+    , value( value_ )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const long i ) const {
+    if ( i < length ) {
+      if ( i % 4 == 0 ) {
+        result( 1, 0, 0, 0 ) <<= input( i );
+      }
+      else if ( i % 4 == 1 ) {
+        result( 0, 1, 0, 0 ) <<= input( i );
+      }
+      else if ( i % 4 == 2 ) {
+        result( 0, 0, 1, 0 ) <<= input( i );
+      }
+      else if ( i % 4 == 3 ) {
+        result( 0, 0, 0, 1 ) <<= input( i );
+      }
+    }
+  }
+};
+
+template< class T, class execution_space >
+T LSEqualAtomicView( const long input_length, const long value, const long remainder ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T****, execution_space > result_view_type;
+  typedef typename result_view_type::HostMirror host_scalar_view_type;
+
+  const long length = input_length;
+
+  view_type input( "input_view", length );
+  result_view_type result_view( "result_view", 2, 2, 2, 2 );
+  host_scalar_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  h_result_view( 1, 0, 0, 0 ) = value;
+  h_result_view( 0, 1, 0, 0 ) = value;
+  h_result_view( 0, 0, 1, 0 ) = value;
+  h_result_view( 0, 0, 0, 1 ) = value;
+  Kokkos::deep_copy( result_view, h_result_view );
+
+  InitFunctor_ModShift< T, execution_space > init_f( input, length, remainder );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
+
+  LSEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length, value );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
+  Kokkos::fence();
+
+  Kokkos::deep_copy( h_result_view, result_view );
+
+  return (T) ( h_result_view( 1, 0, 0, 0 ) );
+}
+
+template< class T >
+T LSEqualAtomicViewCheck( const long input_length, const long value, const long remainder ) {
+  T result[4];
+  result[0] = value;
+  result[1] = value;
+  result[2] = value;
+  result[3] = value;
+
+  T * input = new T[input_length];
+  for ( long i = 0; i < input_length; ++i ) {
+    if ( i % ( remainder + 1 ) == remainder ) {
+      input[i] = 1;
+    }
+    else {
+      input[i] = 0;
+    }
+  }
+
+  for ( long i = 0; i < input_length; ++i ) {
+    if ( i % 4 == 0 ) {
+      result[0] <<= input[i];
+    }
+    else if ( i % 4 == 1 ) {
+      result[1] <<= input[i];
+    }
+    else if ( i % 4 == 2 ) {
+      result[2] <<= input[i];
+    }
+    else if ( i % 4 == 3 ) {
+      result[3] <<= input[i];
+    }
+  }
+
+  delete [] input;
+
+  return (T) result[0];
+}
+
+template< class T, class DeviceType >
+bool LSEqualAtomicViewTest( const long input_length )
+{
+  static_assert( std::is_integral< T >::value, "LSEqualAtomicViewTest: Must be integral type for test" );
+
+  const long remainder = 61042; //prime - 1
+  const long value = 1; // initial value to be left-shifted
+  T res       = LSEqualAtomicView< T, DeviceType >( input_length, value, remainder );
+  T resSerial = LSEqualAtomicViewCheck< T >( input_length, value, remainder );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = LSEqualAtomicViewTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//-----------atomic view and-equal-----------------
+//---------------------------------------------------
+
+template< class T, class execution_space >
+struct AndEqualAtomicViewFunctor {
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+
+  view_type input;
+  atomic_view_type even_odd_result;
+  const long length;
+
+  // Wrap the result view in an atomic view, use this for operator.
+  AndEqualAtomicViewFunctor( const view_type & input_, view_type & even_odd_result_, const long length_ )
+    : input( input_ )
+    , even_odd_result( even_odd_result_ )
+    , length( length_ )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const long i ) const {
+    if ( i < length ) {
+      if ( i % 2 == 0 ) {
+        even_odd_result( 0 ) &= input( i );
+      }
+      else {
+        even_odd_result( 1 ) &= input( i );
+      }
+    }
+  }
+};
+
+template< class T, class execution_space >
+T AndEqualAtomicView( const long input_length ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
+
+  const long length = input_length;
+
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 2 );
+  Kokkos::deep_copy( result_view, 1 );
+
+  InitFunctor_Seq< T, execution_space > init_f( input, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
+
+  AndEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
+  Kokkos::fence();
+
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
+
+  return (T) ( h_result_view( 0 ) );
+}
+
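+// Serial reference: AND the sequence 0, 1, 2, ... into even/odd slots that start
+// at 1, mirroring the device functor's even/odd split.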
+template< class T >
+T AndEqualAtomicViewCheck( const long input_length ) {
+  const long N = input_length;
+  T result[2] = { 1, 1 };
+  for ( long i = 0; i < N; ++i ) {
+    if ( i % 2 == 0 ) {
+      result[0] &= (T) i;
+    }
+    else {
+      result[1] &= (T) i;
+    }
+  }
+
+  return ( result[0] );
+}
+
+template< class T, class DeviceType >
+bool AndEqualAtomicViewTest( long input_length )
+{
+  static_assert( std::is_integral< T >::value, "AndEqualAtomicViewTest: Must be integral type for test" );
+
+  T res       = AndEqualAtomicView< T, DeviceType >( input_length );
+  T resSerial = AndEqualAtomicViewCheck< T >( input_length );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = AndEqualAtomicViewTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//-----------atomic view or-equal-----------------
+//---------------------------------------------------
+
+template< class T, class execution_space >
+struct OrEqualAtomicViewFunctor {
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+
+  view_type input;
+  atomic_view_type even_odd_result;
+  const long length;
+
+  // Wrap the result view in an atomic view, use this for operator.
+  OrEqualAtomicViewFunctor( const view_type & input_, view_type & even_odd_result_, const long length_ )
+    : input( input_ )
+    , even_odd_result( even_odd_result_ )
+    , length( length_ )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const long i ) const {
+    if ( i < length ) {
+      if ( i % 2 == 0 ) {
+        even_odd_result( 0 ) |= input( i );
+      }
+      else {
+        even_odd_result( 1 ) |= input( i );
+      }
+    }
+  }
+};
+
+template< class T, class execution_space >
+T OrEqualAtomicView( const long input_length ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
+
+  const long length = input_length;
+
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 2 );
+
+  InitFunctor_Seq< T, execution_space > init_f( input, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
+
+  OrEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
+  Kokkos::fence();
+
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
+
+  return (T) ( h_result_view( 0 ) );
+}
+
+template< class T >
+T OrEqualAtomicViewCheck( const long input_length ) {
+
+  const long N = input_length;
+  T result[2] = { 0 };
+  for ( long i = 0; i < N; ++i ) {
+    if ( i % 2 == 0 ) {
+      result[0] |= (T) i;
+    }
+    else {
+      result[1] |= (T) i;
+    }
+  }
+
+  return (T) ( result[0] );
+}
+
+template< class T, class DeviceType >
+bool OrEqualAtomicViewTest( long input_length )
+{
+  static_assert( std::is_integral< T >::value, "OrEqualAtomicViewTest: Must be integral type for test" );
+
+  T res       = OrEqualAtomicView< T, DeviceType >( input_length );
+  T resSerial = OrEqualAtomicViewCheck< T >( input_length );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = OrEqualAtomicViewTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+//---------------------------------------------------
+//-----------atomic view xor-equal-----------------
+//---------------------------------------------------
+
+template< class T, class execution_space >
+struct XOrEqualAtomicViewFunctor {
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+
+  view_type input;
+  atomic_view_type even_odd_result;
+  const long length;
+
+  // Wrap the result view in an atomic view, use this for operator.
+  XOrEqualAtomicViewFunctor( const view_type & input_, view_type & even_odd_result_, const long length_ )
+    : input( input_ )
+    , even_odd_result( even_odd_result_ )
+    , length( length_ )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const long i ) const {
+    if ( i < length ) {
+      if ( i % 2 == 0 ) {
+        even_odd_result( 0 ) ^= input( i );
+      }
+      else {
+        even_odd_result( 1 ) ^= input( i );
+      }
+    }
+  }
+};
+
+template< class T, class execution_space >
+T XOrEqualAtomicView( const long input_length ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
+
+  const long length = input_length;
+
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 2 );
+
+  InitFunctor_Seq< T, execution_space > init_f( input, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
+
+  XOrEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
+  Kokkos::fence();
+
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
+
+  return (T) ( h_result_view( 0 ) );
+}
+
+template< class T >
+T XOrEqualAtomicViewCheck( const long input_length ) {
+  const long N = input_length;
+  T result[2] = { 0 };
+  for ( long i = 0; i < N; ++i ) {
+    if ( i % 2 == 0 ) {
+      result[0] ^= (T) i;
+    }
+    else {
+      result[1] ^= (T) i;
+    }
+  }
+
+  return (T) ( result[0] );
+}
+
+template< class T, class DeviceType >
+bool XOrEqualAtomicViewTest( long input_length )
+{
+  static_assert( std::is_integral< T >::value, "XOrEqualAtomicViewTest: Must be integral type for test" );
+
+  T res       = XOrEqualAtomicView< T, DeviceType >( input_length );
+  T resSerial = XOrEqualAtomicViewCheck< T >( input_length );
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid( T ).name()
+              << ">( test = XOrEqualAtomicViewTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl;
+  }
+
+  return passed;
+}
+
+// inc/dec?
+
+//---------------------------------------------------
+//--------------atomic_test_control------------------
+//---------------------------------------------------
+
+template< class T, class DeviceType >
+bool AtomicViewsTestIntegralType( const int length, int test )
+{
+  static_assert( std::is_integral< T >::value, "TestAtomicViews Error: Non-integral type passed into IntegralType tests" );
+
+  switch ( test ) {
+    case 1: return PlusEqualAtomicViewTest< T, DeviceType >( length );
+    case 2: return MinusEqualAtomicViewTest< T, DeviceType >( length );
+    case 3: return RSEqualAtomicViewTest< T, DeviceType >( length );
+    case 4: return LSEqualAtomicViewTest< T, DeviceType >( length );
+    case 5: return ModEqualAtomicViewTest< T, DeviceType >( length );
+    case 6: return AndEqualAtomicViewTest< T, DeviceType >( length );
+    case 7: return OrEqualAtomicViewTest< T, DeviceType >( length );
+    case 8: return XOrEqualAtomicViewTest< T, DeviceType >( length );
+  }
+
+  return false;
+}
+
+template< class T, class DeviceType >
+bool AtomicViewsTestNonIntegralType( const int length, int test )
+{
+  switch ( test ) {
+    case 1: return PlusEqualAtomicViewTest< T, DeviceType >( length );
+    case 2: return MinusEqualAtomicViewTest< T, DeviceType >( length );
+    case 3: return TimesEqualAtomicViewTest< T, DeviceType >( length );
+    case 4: return DivEqualAtomicViewTest< T, DeviceType >( length );
+  }
+
+  return false;
+}
+
+} // namespace TestAtomicViews
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, atomic_views_integral )
+{
+  const long length = 1000000;
+  {
+    // Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, TEST_EXECSPACE >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, TEST_EXECSPACE >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, TEST_EXECSPACE >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, TEST_EXECSPACE >( length, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, TEST_EXECSPACE >( length, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, TEST_EXECSPACE >( length, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, TEST_EXECSPACE >( length, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, TEST_EXECSPACE >( length, 8 ) ) );
+  }
+}
+
+TEST_F( TEST_CATEGORY, atomic_views_nonintegral )
+{
+  const long length = 1000000;
+  {
+    // Non-Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, TEST_EXECSPACE >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, TEST_EXECSPACE >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, TEST_EXECSPACE >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, TEST_EXECSPACE >( length, 4 ) ) );
+  }
+}
+
+TEST_F( TEST_CATEGORY, atomic_view_api )
+{
+  TestAtomicViews::TestAtomicViewAPI< int, TEST_EXECSPACE >();
+}
+}
diff --git a/packages/kokkos/core/unit_test/TestCXX11.hpp b/packages/kokkos/core/unit_test/TestCXX11.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b6c34d2d4b7e6e68128c3ff81a5782614d2db28c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestCXX11.hpp
@@ -0,0 +1,359 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+namespace TestCXX11 {
+
+template< class DeviceType >
+struct FunctorAddTest {
+  typedef Kokkos::View< double**, DeviceType > view_type;
+  typedef DeviceType execution_space;
+  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member;
+
+  view_type a_, b_;
+
+  FunctorAddTest( view_type & a, view_type & b ) : a_( a ), b_( b ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const int& i ) const {
+    b_( i, 0 ) = a_( i, 1 ) + a_( i, 2 );
+    b_( i, 1 ) = a_( i, 0 ) - a_( i, 3 );
+    b_( i, 2 ) = a_( i, 4 ) + a_( i, 0 );
+    b_( i, 3 ) = a_( i, 2 ) - a_( i, 1 );
+    b_( i, 4 ) = a_( i, 3 ) + a_( i, 4 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const team_member & dev ) const {
+    const int begin = dev.league_rank() * 4;
+    const int end   = begin + 4;
+    for ( int i = begin + dev.team_rank(); i < end; i += dev.team_size() ) {
+      b_( i, 0 ) = a_( i, 1 ) + a_( i, 2 );
+      b_( i, 1 ) = a_( i, 0 ) - a_( i, 3 );
+      b_( i, 2 ) = a_( i, 4 ) + a_( i, 0 );
+      b_( i, 3 ) = a_( i, 2 ) - a_( i, 1 );
+      b_( i, 4 ) = a_( i, 3 ) + a_( i, 4 );
+    }
+  }
+};
+
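+// Runs the add kernel either over a flat index range (PWRTest == false) or over a
+// TeamPolicy of 25 teams that each cover 4 of the 100 rows, then sums b on the host.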
+template< class DeviceType, bool PWRTest >
+double AddTestFunctor() {
+  typedef Kokkos::TeamPolicy< DeviceType > policy_type;
+
+  Kokkos::View< double**, DeviceType > a( "A", 100, 5 );
+  Kokkos::View< double**, DeviceType > b( "B", 100, 5 );
+  typename Kokkos::View< double**, DeviceType >::HostMirror h_a = Kokkos::create_mirror_view( a );
+  typename Kokkos::View< double**, DeviceType >::HostMirror h_b = Kokkos::create_mirror_view( b );
+
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+      h_a( i, j ) = 0.1 * i / ( 1.1 * j + 1.0 ) + 0.5 * j;
+    }
+  }
+  Kokkos::deep_copy( a, h_a );
+
+  if ( PWRTest == false ) {
+    Kokkos::parallel_for( 100, FunctorAddTest< DeviceType >( a, b ) );
+  }
+  else {
+    Kokkos::parallel_for( policy_type( 25, Kokkos::AUTO ), FunctorAddTest< DeviceType >( a, b ) );
+  }
+  Kokkos::deep_copy( h_b, b );
+
+  double result = 0;
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+      result += h_b( i, j );
+    }
+  }
+
+  return result;
+}
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+template< class DeviceType, bool PWRTest >
+double AddTestLambda() {
+  Kokkos::View< double**, DeviceType > a( "A", 100, 5 );
+  Kokkos::View< double**, DeviceType > b( "B", 100, 5 );
+  typename Kokkos::View< double**, DeviceType >::HostMirror h_a = Kokkos::create_mirror_view( a );
+  typename Kokkos::View< double**, DeviceType >::HostMirror h_b = Kokkos::create_mirror_view( b );
+
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+       h_a( i, j ) = 0.1 * i / ( 1.1 * j + 1.0 ) + 0.5 * j;
+    }
+  }
+  Kokkos::deep_copy( a, h_a );
+
+  if ( PWRTest == false ) {
+    Kokkos::parallel_for( 100, KOKKOS_LAMBDA( const int & i ) {
+      b( i, 0 ) = a( i, 1 ) + a( i, 2 );
+      b( i, 1 ) = a( i, 0 ) - a( i, 3 );
+      b( i, 2 ) = a( i, 4 ) + a( i, 0 );
+      b( i, 3 ) = a( i, 2 ) - a( i, 1 );
+      b( i, 4 ) = a( i, 3 ) + a( i, 4 );
+    });
+  }
+  else {
+    typedef Kokkos::TeamPolicy< DeviceType > policy_type;
+    typedef typename policy_type::member_type team_member;
+
+    policy_type policy( 25, Kokkos::AUTO );
+
+    Kokkos::parallel_for( policy, KOKKOS_LAMBDA( const team_member & dev ) {
+      const int begin = dev.league_rank() * 4;
+      const int end   = begin + 4;
+      for ( int i = begin + dev.team_rank(); i < end; i += dev.team_size() ) {
+        b( i, 0 ) = a( i, 1 ) + a( i, 2 );
+        b( i, 1 ) = a( i, 0 ) - a( i, 3 );
+        b( i, 2 ) = a( i, 4 ) + a( i, 0 );
+        b( i, 3 ) = a( i, 2 ) - a( i, 1 );
+        b( i, 4 ) = a( i, 3 ) + a( i, 4 );
+      }
+    });
+  }
+  Kokkos::deep_copy( h_b, b );
+
+  double result = 0;
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+      result += h_b( i, j );
+    }
+  }
+
+  return result;
+}
+#else
+template< class DeviceType, bool PWRTest >
+double AddTestLambda() {
+  return AddTestFunctor< DeviceType, PWRTest >();
+}
+#endif
+
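+// Reduction functor: init() provides the identity and join() combines partial
+// results, so the same functor serves both RangePolicy and TeamPolicy reductions.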
+template< class DeviceType >
+struct FunctorReduceTest {
+  typedef Kokkos::View< double**, DeviceType > view_type;
+  typedef DeviceType execution_space;
+  typedef double value_type;
+  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member;
+
+  view_type a_;
+
+  FunctorReduceTest( view_type & a ) : a_( a ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const int & i, value_type & sum ) const {
+    sum += a_( i, 1 ) + a_( i, 2 );
+    sum += a_( i, 0 ) - a_( i, 3 );
+    sum += a_( i, 4 ) + a_( i, 0 );
+    sum += a_( i, 2 ) - a_( i, 1 );
+    sum += a_( i, 3 ) + a_( i, 4 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const team_member & dev, value_type & sum ) const {
+    const int begin = dev.league_rank() * 4;
+    const int end   = begin + 4;
+    for ( int i = begin + dev.team_rank(); i < end; i += dev.team_size() ) {
+      sum += a_( i, 1 ) + a_( i, 2 );
+      sum += a_( i, 0 ) - a_( i, 3 );
+      sum += a_( i, 4 ) + a_( i, 0 );
+      sum += a_( i, 2 ) - a_( i, 1 );
+      sum += a_( i, 3 ) + a_( i, 4 );
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & update ) const { update = 0.0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & update, volatile value_type const & input ) const { update += input; }
+};
+
+template< class DeviceType, bool PWRTest >
+double ReduceTestFunctor() {
+  typedef Kokkos::TeamPolicy< DeviceType > policy_type;
+  typedef Kokkos::View< double**, DeviceType > view_type;
+  typedef Kokkos::View< double, typename view_type::host_mirror_space, Kokkos::MemoryUnmanaged > unmanaged_result;
+
+  view_type a( "A", 100, 5 );
+  typename view_type::HostMirror h_a = Kokkos::create_mirror_view( a );
+
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+       h_a( i, j ) = 0.1 * i / ( 1.1 * j + 1.0 ) + 0.5 * j;
+    }
+  }
+  Kokkos::deep_copy( a, h_a );
+
+  double result = 0.0;
+  if ( PWRTest == false ) {
+    Kokkos::parallel_reduce( 100, FunctorReduceTest< DeviceType >( a ), unmanaged_result( & result ) );
+  }
+  else {
+    Kokkos::parallel_reduce( policy_type( 25, Kokkos::AUTO ), FunctorReduceTest< DeviceType >( a ), unmanaged_result( & result ) );
+  }
+
+  return result;
+}
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+template< class DeviceType, bool PWRTest >
+double ReduceTestLambda() {
+  typedef Kokkos::TeamPolicy< DeviceType > policy_type;
+  typedef Kokkos::View< double**, DeviceType > view_type;
+  typedef Kokkos::View< double, typename view_type::host_mirror_space, Kokkos::MemoryUnmanaged > unmanaged_result;
+
+  view_type a( "A", 100, 5 );
+  typename view_type::HostMirror h_a = Kokkos::create_mirror_view( a );
+
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+       h_a( i, j ) = 0.1 * i / ( 1.1 * j + 1.0 ) + 0.5 * j;
+    }
+  }
+  Kokkos::deep_copy( a, h_a );
+
+  double result = 0.0;
+
+  if ( PWRTest == false ) {
+    Kokkos::parallel_reduce( 100, KOKKOS_LAMBDA( const int & i, double & sum ) {
+      sum += a( i, 1 ) + a( i, 2 );
+      sum += a( i, 0 ) - a( i, 3 );
+      sum += a( i, 4 ) + a( i, 0 );
+      sum += a( i, 2 ) - a( i, 1 );
+      sum += a( i, 3 ) + a( i, 4 );
+    }, unmanaged_result( & result ) );
+  }
+  else {
+    typedef typename policy_type::member_type team_member;
+    Kokkos::parallel_reduce( policy_type( 25, Kokkos::AUTO ), KOKKOS_LAMBDA( const team_member & dev, double & sum ) {
+      const int begin = dev.league_rank() * 4;
+      const int end   = begin + 4;
+      for ( int i = begin + dev.team_rank(); i < end; i += dev.team_size() ) {
+        sum += a( i, 1 ) + a( i, 2 );
+        sum += a( i, 0 ) - a( i, 3 );
+        sum += a( i, 4 ) + a( i, 0 );
+        sum += a( i, 2 ) - a( i, 1 );
+        sum += a( i, 3 ) + a( i, 4 );
+      }
+    }, unmanaged_result( & result ) );
+  }
+
+  return result;
+}
+#else
+template< class DeviceType, bool PWRTest >
+double ReduceTestLambda() {
+  return ReduceTestFunctor< DeviceType, PWRTest >();
+}
+#endif
+
+template< class DeviceType >
+double TestVariantLambda( int test ) {
+  switch ( test ) {
+    case 1: return AddTestLambda< DeviceType, false >();
+    case 2: return AddTestLambda< DeviceType, true >();
+    case 3: return ReduceTestLambda< DeviceType, false >();
+    case 4: return ReduceTestLambda< DeviceType, true >();
+  }
+
+  return 0;
+}
+
+template< class DeviceType >
+double TestVariantFunctor( int test ) {
+  switch ( test ) {
+    case 1: return AddTestFunctor< DeviceType, false >();
+    case 2: return AddTestFunctor< DeviceType, true >();
+    case 3: return ReduceTestFunctor< DeviceType, false >();
+    case 4: return ReduceTestFunctor< DeviceType, true >();
+  }
+
+  return 0;
+}
+
+template< class DeviceType >
+bool Test( int test ) {
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+  double res_functor = TestVariantFunctor< DeviceType >( test );
+  double res_lambda = TestVariantLambda< DeviceType >( test );
+
+  char testnames[5][256] = { " "
+                           , "AddTest", "AddTest TeamPolicy"
+                           , "ReduceTest", "ReduceTest TeamPolicy"
+                           };
+  bool passed = true;
+
+  if ( res_functor != res_lambda ) {
+    passed = false;
+
+    std::cout << "CXX11 ( test = '"
+              << testnames[test] << "' FAILED : "
+              << res_functor << " != " << res_lambda
+              << std::endl;
+  }
+
+  return passed;
+#else
+  return true;
+#endif
+}
+
+} // namespace TestCXX11
+
+namespace Test {
+TEST_F( TEST_CATEGORY, cxx11 )
+{
+  if ( std::is_same< Kokkos::DefaultExecutionSpace, TEST_EXECSPACE >::value ) {
+    ASSERT_TRUE( ( TestCXX11::Test< TEST_EXECSPACE >( 1 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< TEST_EXECSPACE >( 2 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< TEST_EXECSPACE >( 3 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< TEST_EXECSPACE >( 4 ) ) );
+  }
+}
+
+}
+
diff --git a/packages/kokkos/core/unit_test/TestCXX11Deduction.hpp b/packages/kokkos/core/unit_test/TestCXX11Deduction.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..220732918d1b1fc0fd6c0ebf19b920179e27980a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestCXX11Deduction.hpp
@@ -0,0 +1,99 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#ifndef TESTCXX11DEDUCTION_HPP
+#define TESTCXX11DEDUCTION_HPP
+
+namespace TestCXX11 {
+
+struct TestReductionDeductionTagA {};
+struct TestReductionDeductionTagB {};
+
+template < class ExecSpace >
+struct TestReductionDeductionFunctor {
+  // KOKKOS_INLINE_FUNCTION
+  // void operator()( long i, long & value ) const
+  // { value += i + 1; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( TestReductionDeductionTagA, long i, long & value ) const
+  { value += ( 2 * i + 1 ) + ( 2 * i + 2 ); }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TestReductionDeductionTagB &, const long i, long & value ) const
+  { value += ( 3 * i + 1 ) + ( 3 * i + 2 ) + ( 3 * i + 3 ); }
+};
+
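+// The tag type in the RangePolicy selects which tagged operator() overload is
+// dispatched; the reduction value type (long) is deduced from that overload.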
+template< class ExecSpace >
+void test_reduction_deduction()
+{
+  typedef TestReductionDeductionFunctor< ExecSpace > Functor;
+
+  const long N = 50;
+  // const long answer  = N % 2 ? ( N * ( ( N + 1 ) / 2 ) ) : ( ( N / 2 ) * ( N + 1 ) );
+  const long answerA = N % 2 ? ( ( 2 * N ) * ( ( ( 2 * N ) + 1 ) / 2 ) ) : ( ( ( 2 * N ) / 2 ) * ( ( 2 * N ) + 1 ) );
+  const long answerB = N % 2 ? ( ( 3 * N ) * ( ( ( 3 * N ) + 1 ) / 2 ) ) : ( ( ( 3 * N ) / 2 ) * ( ( 3 * N ) + 1 ) );
+  long result = 0;
+
+  // Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), Functor(), result );
+  // ASSERT_EQ( answer, result );
+
+  Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, TestReductionDeductionTagA >( 0, N ), Functor(), result );
+  ASSERT_EQ( answerA, result );
+
+  Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, TestReductionDeductionTagB >( 0, N ), Functor(), result );
+  ASSERT_EQ( answerB, result );
+}
+
+} // namespace TestCXX11
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, reduction_deduction )
+{
+  TestCXX11::test_reduction_deduction< TEST_EXECSPACE >();
+}
+}
+#endif
diff --git a/packages/kokkos/core/unit_test/TestCompilerMacros.hpp b/packages/kokkos/core/unit_test/TestCompilerMacros.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e6b5c48d3d790084227a6abd66e9b3a7022d114c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestCompilerMacros.hpp
@@ -0,0 +1,117 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined(KOKKOS_ENABLE_CUDA) && \
+    ( !defined(KOKKOS_ENABLE_CUDA_LAMBDA) || \
+      (  ( defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_OPENMP) ) && \
+         (  (CUDA_VERSION < 8000) && defined( __NVCC__ ))))
+  #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+    #error "Macro bug: KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA shouldn't be defined"
+  #endif
+#else
+  #if !defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+    #error "Macro bug: KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA should be defined"
+  #endif
+#endif
+
+#define KOKKOS_PRAGMA_UNROLL(a)
+
+namespace TestCompilerMacros {
+
+template< class DEVICE_TYPE >
+struct AddFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef typename Kokkos::View< int**, execution_space > type;
+  type a, b;
+  int length;
+
+  AddFunctor( type a_, type b_ ) : a( a_ ), b( b_ ), length( a.extent(1) ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i ) const {
+#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL
+    #pragma unroll
+#endif
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+    #pragma ivdep
+#endif
+#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR
+    #pragma vector always
+#endif
+#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
+    #pragma loop count(128)
+#endif
+#ifndef KOKKOS_DEBUG
+#ifdef KOKKOS_ENABLE_PRAGMA_SIMD
+    #pragma simd
+#endif
+#endif
+    for ( int j = 0; j < length; j++ ) {
+      a( i, j ) += b( i, j );
+    }
+  }
+};
+
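+// The real check is that the pragma-decorated kernel above compiles with the
+// macros enabled; the single parallel_for launch simply confirms it runs.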
+template< class DeviceType >
+bool Test() {
+  typedef typename Kokkos::View< int**, DeviceType > type;
+  type a( "A", 1024, 128 );
+  type b( "B", 1024, 128 );
+
+  AddFunctor< DeviceType > f( a, b );
+  Kokkos::parallel_for( 1024, f );
+  DeviceType::fence();
+
+  return true;
+}
+
+} // namespace TestCompilerMacros
+
+namespace Test {
+TEST_F( TEST_CATEGORY, compiler_macros )
+{
+  ASSERT_TRUE( ( TestCompilerMacros::Test< TEST_EXECSPACE >() ) );
+}
+}
diff --git a/packages/kokkos/core/unit_test/TestComplex.hpp b/packages/kokkos/core/unit_test/TestComplex.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..240e16d467d171ec148ccbd1a8841a6a7201a724
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestComplex.hpp
@@ -0,0 +1,260 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+namespace Test {
+
+// Test construction and assignment
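+// Pattern used by the tests in this file: a functor runs on the device and
+// writes Kokkos::complex values into d_results; the host copies them back via
+// a mirror view and checks them against values computed on the host.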
+
+template<class ExecSpace>
+struct TestComplexConstruction {
+  Kokkos::View<Kokkos::complex<double>*,ExecSpace> d_results;
+  typename Kokkos::View<Kokkos::complex<double>*,ExecSpace>::HostMirror h_results;
+  
+  void testit () {
+    d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexConstruction",10);
+    h_results = Kokkos::create_mirror_view(d_results);
+   
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0,1), *this);
+    Kokkos::fence();
+    Kokkos::deep_copy(h_results,d_results);
+
+    ASSERT_FLOAT_EQ(h_results(0).real(),1.5);  ASSERT_FLOAT_EQ(h_results(0).imag(),2.5);
+    ASSERT_FLOAT_EQ(h_results(1).real(),1.5);  ASSERT_FLOAT_EQ(h_results(1).imag(),2.5);
+    ASSERT_FLOAT_EQ(h_results(2).real(),0.0);  ASSERT_FLOAT_EQ(h_results(2).imag(),0.0);
+    ASSERT_FLOAT_EQ(h_results(3).real(),3.5);  ASSERT_FLOAT_EQ(h_results(3).imag(),0.0);
+    ASSERT_FLOAT_EQ(h_results(4).real(),4.5);  ASSERT_FLOAT_EQ(h_results(4).imag(),5.5);
+    ASSERT_FLOAT_EQ(h_results(5).real(),1.5);  ASSERT_FLOAT_EQ(h_results(5).imag(),2.5);
+    ASSERT_FLOAT_EQ(h_results(6).real(),4.5);  ASSERT_FLOAT_EQ(h_results(6).imag(),5.5);
+    ASSERT_FLOAT_EQ(h_results(7).real(),7.5);  ASSERT_FLOAT_EQ(h_results(7).imag(),0.0);
+    ASSERT_FLOAT_EQ(h_results(8).real(),double(8));  ASSERT_FLOAT_EQ(h_results(8).imag(),0.0);
+
+#ifndef KOKKOS_ENABLE_ROCM
+    Kokkos::complex<double> a(1.5,2.5),b(3.25,5.25),r_kk;
+    std::complex<double> sa(a),sb(3.25,5.25),r;
+    r = a; r_kk = a;         ASSERT_FLOAT_EQ(r.real(),r_kk.real()); ASSERT_FLOAT_EQ(r.imag(),r_kk.imag());
+    r = sb*a; r_kk = b*a;    ASSERT_FLOAT_EQ(r.real(),r_kk.real()); ASSERT_FLOAT_EQ(r.imag(),r_kk.imag());
+    r = sa; r_kk = a;        ASSERT_FLOAT_EQ(r.real(),r_kk.real()); ASSERT_FLOAT_EQ(r.imag(),r_kk.imag());
+#endif
+
+  }
+
+  KOKKOS_INLINE_FUNCTION 
+  void operator() (const int &i ) const {
+    Kokkos::complex<double> a(1.5,2.5);
+    d_results(0) = a;
+    Kokkos::complex<double> b(a);
+    d_results(1) = b;
+    Kokkos::complex<double> c = Kokkos::complex<double>();
+    d_results(2) = c;
+    Kokkos::complex<double> d(3.5);
+    d_results(3) = d; 
+    volatile Kokkos::complex<double> a_v(4.5,5.5);
+    d_results(4) = a_v;
+    volatile Kokkos::complex<double> b_v(a);
+    d_results(5) = b_v;
+    Kokkos::complex<double> e(a_v);
+    d_results(6) = e;
+
+    d_results(7) = double(7.5);
+    d_results(8) = int(8);
+  } 
+};
+
+TEST_F(TEST_CATEGORY, complex_construction) {
+  TestComplexConstruction<TEST_EXECSPACE> test;
+  test.testit();
+} 
+
+// Test basic math functions
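+// Covers operator+,-,*,/ for complex/complex, complex/scalar, and
+// scalar/complex operands, plus the compound assignment operators, comparing
+// each device result against the matching std::complex expression on the host.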
+
+template<class ExecSpace>
+struct TestComplexBasicMath {
+  Kokkos::View<Kokkos::complex<double>*,ExecSpace> d_results;
+  typename Kokkos::View<Kokkos::complex<double>*,ExecSpace>::HostMirror h_results;
+
+  void testit () {
+    d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",24);
+    h_results = Kokkos::create_mirror_view(d_results);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0,1), *this);
+    Kokkos::fence();
+    Kokkos::deep_copy(h_results,d_results);
+
+    std::complex<double> a(1.5,2.5);
+    std::complex<double> b(3.25,5.75);
+    std::complex<double> d(1.0,2.0);
+    double c = 9.3;
+    int e = 2;
+
+    std::complex<double> r;
+    r = a+b; ASSERT_FLOAT_EQ(h_results(0).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(0).imag(),  r.imag());
+    r = a-b; ASSERT_FLOAT_EQ(h_results(1).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(1).imag(),  r.imag());
+    r = a*b; ASSERT_FLOAT_EQ(h_results(2).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(2).imag(),  r.imag());
+    r = a/b; ASSERT_FLOAT_EQ(h_results(3).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(3).imag(),  r.imag());
+    r = d+a; ASSERT_FLOAT_EQ(h_results(4).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(4).imag(),  r.imag());
+    r = d-a; ASSERT_FLOAT_EQ(h_results(5).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(5).imag(),  r.imag());
+    r = d*a; ASSERT_FLOAT_EQ(h_results(6).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(6).imag(),  r.imag());
+    r = d/a; ASSERT_FLOAT_EQ(h_results(7).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(7).imag(),  r.imag());
+    r = a+c; ASSERT_FLOAT_EQ(h_results(8).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(8).imag(),  r.imag());
+    r = a-c; ASSERT_FLOAT_EQ(h_results(9).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(9).imag(),  r.imag());
+    r = a*c; ASSERT_FLOAT_EQ(h_results(10).real(), r.real()); ASSERT_FLOAT_EQ(h_results(10).imag(), r.imag());
+    r = a/c; ASSERT_FLOAT_EQ(h_results(11).real(), r.real()); ASSERT_FLOAT_EQ(h_results(11).imag(), r.imag());
+    r = d+c; ASSERT_FLOAT_EQ(h_results(12).real(), r.real()); ASSERT_FLOAT_EQ(h_results(12).imag(), r.imag());
+    r = d-c; ASSERT_FLOAT_EQ(h_results(13).real(), r.real()); ASSERT_FLOAT_EQ(h_results(13).imag(), r.imag());
+    r = d*c; ASSERT_FLOAT_EQ(h_results(14).real(), r.real()); ASSERT_FLOAT_EQ(h_results(14).imag(), r.imag());
+    r = d/c; ASSERT_FLOAT_EQ(h_results(15).real(), r.real()); ASSERT_FLOAT_EQ(h_results(15).imag(), r.imag());
+    r = c+a; ASSERT_FLOAT_EQ(h_results(16).real(), r.real()); ASSERT_FLOAT_EQ(h_results(16).imag(), r.imag());
+    r = c-a; ASSERT_FLOAT_EQ(h_results(17).real(), r.real()); ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag());
+    r = c*a; ASSERT_FLOAT_EQ(h_results(18).real(), r.real()); ASSERT_FLOAT_EQ(h_results(18).imag(), r.imag());
+    r = c/a; ASSERT_FLOAT_EQ(h_results(19).real(), r.real()); ASSERT_FLOAT_EQ(h_results(19).imag(), r.imag());
+
+    r = a; 
+    /* r = a+e; */ ASSERT_FLOAT_EQ(h_results(20).real(),  r.real()+e); ASSERT_FLOAT_EQ(h_results(20).imag(),  r.imag());
+    /* r = a-e; */ ASSERT_FLOAT_EQ(h_results(21).real(),  r.real()-e); ASSERT_FLOAT_EQ(h_results(21).imag(),  r.imag());
+    /* r = a*e; */ ASSERT_FLOAT_EQ(h_results(22).real(),  r.real()*e); ASSERT_FLOAT_EQ(h_results(22).imag(),  r.imag()*e);
+    /* r = a/e; */ ASSERT_FLOAT_EQ(h_results(23).real(),  r.real()/2); ASSERT_FLOAT_EQ(h_results(23).imag(),  r.imag()/e);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &i ) const {
+    Kokkos::complex<double> a(1.5,2.5);
+    Kokkos::complex<double> b(3.25,5.75);
+    // Basic math complex / complex
+    d_results(0) = a+b;
+    d_results(1) = a-b;
+    d_results(2) = a*b;
+    d_results(3) = a/b;
+    d_results(4).real(1.0);
+    d_results(4).imag(2.0);
+    d_results(4) += a;
+    d_results(5) = Kokkos::complex<double>(1.0,2.0);
+    d_results(5) -= a;
+    d_results(6) = Kokkos::complex<double>(1.0,2.0);
+    d_results(6) *= a;
+    d_results(7) = Kokkos::complex<double>(1.0,2.0);
+    d_results(7) /= a;
+
+    // Basic math complex / scalar
+    double c = 9.3;
+    d_results(8) = a+c;
+    d_results(9) = a-c;
+    d_results(10) = a*c;
+    d_results(11) = a/c;
+    d_results(12).real(1.0);
+    d_results(12).imag(2.0);
+    d_results(12) += c;
+    d_results(13) = Kokkos::complex<double>(1.0,2.0);
+    d_results(13) -= c;
+    d_results(14) = Kokkos::complex<double>(1.0,2.0);
+    d_results(14) *= c;
+    d_results(15) = Kokkos::complex<double>(1.0,2.0);
+    d_results(15) /= c;
+
+
+    // Basic math scalar / complex
+    d_results(16) = c+a;
+    d_results(17) = c-a;
+    d_results(18) = c*a;
+    d_results(19) = c/a;
+
+    int e = 2;
+    d_results(20) = a+e;
+    d_results(21) = a-e;
+    d_results(22) = a*e;
+    d_results(23) = a/e;
+  }
+};
+
+TEST_F(TEST_CATEGORY, complex_basic_math) {
+  TestComplexBasicMath<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+
+template<class ExecSpace>
+struct TestComplexSpecialFunctions {
+  Kokkos::View<Kokkos::complex<double>*,ExecSpace> d_results;
+  typename Kokkos::View<Kokkos::complex<double>*,ExecSpace>::HostMirror h_results;
+
+  void testit () {
+    d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexSpecialFunctions",20);
+    h_results = Kokkos::create_mirror_view(d_results);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0,1), *this);
+    Kokkos::fence();
+    Kokkos::deep_copy(h_results,d_results);
+
+    std::complex<double> a(1.5,2.5);
+    double c = 9.3;
+
+    std::complex<double> r;
+    r = a;             ASSERT_FLOAT_EQ(h_results(0).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(0).imag(),  r.imag());
+    r = std::sqrt(a);  ASSERT_FLOAT_EQ(h_results(1).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(1).imag(),  r.imag());
+    r = std::pow(a,c); ASSERT_FLOAT_EQ(h_results(2).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(2).imag(),  r.imag());
+    r = std::abs(a);   ASSERT_FLOAT_EQ(h_results(3).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(3).imag(),  r.imag());
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &i ) const {
+    Kokkos::complex<double> a(1.5,2.5);
+    Kokkos::complex<double> b(3.25,5.75);
+    double c = 9.3;
+
+    d_results(0) = Kokkos::complex<double>(Kokkos::real(a),Kokkos::imag(a));
+    d_results(1) = Kokkos::sqrt(a);
+    d_results(2) = Kokkos::pow(a,c);
+    d_results(3) = Kokkos::abs(a);
+
+  }
+};
+
+TEST_F(TEST_CATEGORY, complex_special_functions) {
+  TestComplexSpecialFunctions<TEST_EXECSPACE> test;
+  test.testit();
+}
+} // namespace Test
+
+
diff --git a/packages/kokkos/core/unit_test/TestConcurrentBitset.hpp b/packages/kokkos/core/unit_test/TestConcurrentBitset.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f479998ede4a6b0b054da947f1cfabd356b60ed6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestConcurrentBitset.hpp
@@ -0,0 +1,177 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef TEST_CONCURRENTBITSET_HPP
+#define TEST_CONCURRENTBITSET_HPP
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <impl/Kokkos_ConcurrentBitset.hpp>
+
+namespace Test {
+
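+// The functor below is dispatched three times with different tags:
+//   TagAcquire   - every index tries to acquire a bit (demand is 1.5x the supply),
+//   TagRelease   - every third index holding a bit releases it,
+//   TagReacquire - indices without a bit try again; the reacquired count must
+//                  match the released count.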
+template< class DeviceType >
+struct ConcurrentBitset {
+
+  typedef Kokkos::View<uint32_t*,DeviceType> view_unsigned_type ;
+  typedef Kokkos::View<int*,DeviceType>      view_int_type ;
+
+  view_unsigned_type  bitset ;
+  view_int_type       acquired ;
+  uint32_t            bitset_count_lg2 ;
+  uint32_t            bitset_count_mask ;
+
+  ConcurrentBitset( const uint32_t arg_bitset_count_lg2
+                  , const view_unsigned_type & arg_bitset
+                  , const view_int_type & arg_acquired )
+    : bitset( arg_bitset ), acquired( arg_acquired )
+    , bitset_count_lg2( arg_bitset_count_lg2 )
+    , bitset_count_mask( uint32_t( 1u << arg_bitset_count_lg2 ) - 1 )
+    {}
+
+  struct TagAcquire {};
+  struct TagRelease {};
+  struct TagReacquire {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( TagAcquire , int i , long & update ) const
+    {
+      unsigned hint = Kokkos::Impl::clock_tic() & bitset_count_mask ;
+
+      Kokkos::pair<int,int> result =
+        Kokkos::Impl::concurrent_bitset::acquire_bounded_lg2
+          ( bitset.data() , bitset_count_lg2 , hint );
+
+      acquired(i) = result.first ;
+
+      if ( 0 <= result.first ) ++update ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( TagRelease , int i , long & update ) const
+    {
+      if ( 0 == ( i % 3 ) && 0 <= acquired(i) ) {
+        Kokkos::Impl::concurrent_bitset::release( bitset.data() , acquired(i) );
+        acquired(i) = -1 ;
+        ++update ;
+      }
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( TagReacquire , int i , long & update ) const
+    {
+      if ( acquired(i) < 0 ) {
+
+        unsigned hint = Kokkos::Impl::clock_tic() & bitset_count_mask ;
+
+        Kokkos::pair<int,int> result  = Kokkos::Impl::concurrent_bitset::acquire_bounded_lg2
+            ( bitset.data() , bitset_count_lg2 , hint );
+
+        acquired(i) = result.first ;
+
+        if ( 0 <= result.first ) ++update ;
+      }
+    }
+};
+
+template< class DeviceType >
+void test_concurrent_bitset( int bit_count )
+{
+  typedef ConcurrentBitset< DeviceType > Functor ;
+  typedef typename Functor::view_unsigned_type view_unsigned_type ;
+  typedef typename Functor::view_int_type      view_int_type ;
+
+  int bit_count_lg2 = 1 ;
+
+  while ( ( 1 << bit_count_lg2 ) < bit_count ) ++bit_count_lg2 ;
+
+  bit_count = 1 << bit_count_lg2 ;
+
+  const int buffer_length =
+    Kokkos::Impl::concurrent_bitset::buffer_bound_lg2(bit_count_lg2);
+
+  view_unsigned_type bitset("bitset",buffer_length);
+
+  // Try to acquire more than available:
+
+  const size_t n = ( bit_count * 3 ) / 2 ;
+
+  view_int_type acquired("acquired", n );
+
+  typename view_unsigned_type::HostMirror bitset_host =
+    Kokkos::create_mirror_view( bitset );
+
+  Kokkos::deep_copy( bitset , 0u );
+
+  long total = 0 ;
+  long total_release = 0 ;
+  long total_reacquire = 0 ;
+
+  Kokkos::parallel_reduce
+    ( Kokkos::RangePolicy< DeviceType , typename Functor::TagAcquire >(0,n)
+    , Functor( bit_count_lg2 , bitset , acquired )
+    , total );
+
+  ASSERT_EQ( bit_count , total );
+
+  Kokkos::parallel_reduce
+    ( Kokkos::RangePolicy< DeviceType , typename Functor::TagRelease >(0,n)
+    , Functor( bit_count_lg2 , bitset , acquired )
+    , total_release );
+
+  Kokkos::parallel_reduce
+    ( Kokkos::RangePolicy< DeviceType , typename Functor::TagReacquire >(0,n)
+    , Functor( bit_count_lg2 , bitset , acquired )
+    , total_reacquire );
+
+  ASSERT_EQ( total_release , total_reacquire );
+
+}
+
+} // namespace Test
+
+#endif /* #ifndef TEST_CONCURRENTBITSET_HPP */
diff --git a/packages/kokkos/core/unit_test/TestCrs.hpp b/packages/kokkos/core/unit_test/TestCrs.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..77ea508b894c250bc3e1991e74cf72e79b83b355
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestCrs.hpp
@@ -0,0 +1,98 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <vector>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+namespace {
+
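+// The functor follows the two-pass protocol expected by count_and_fill_crs:
+// when fill is nullptr it only returns the row's entry count, otherwise it also
+// writes the entries; row `row` gets (row % 4) + 1 entries with values 1..n.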
+template< class ExecSpace >
+struct CountFillFunctor {
+  KOKKOS_INLINE_FUNCTION
+  std::int32_t operator()(std::int32_t row, std::int32_t* fill) const {
+    auto n = (row % 4) + 1;
+    if (fill) {
+      for (std::int32_t j = 0; j < n; ++j) {
+        fill[j] = j + 1;
+      }
+    }
+    return n;
+  }
+};
+
+template< class ExecSpace >
+void test_count_fill(std::int32_t nrows) {
+  Kokkos::Crs<std::int32_t, ExecSpace, void, std::int32_t> graph;
+  Kokkos::count_and_fill_crs(graph, nrows, CountFillFunctor<ExecSpace>());
+  ASSERT_EQ(graph.numRows(), nrows);
+  auto row_map = Kokkos::create_mirror_view(graph.row_map);
+  Kokkos::deep_copy(row_map, graph.row_map);
+  auto entries = Kokkos::create_mirror_view(graph.entries);
+  Kokkos::deep_copy(entries, graph.entries);
+  for (std::int32_t row = 0; row < nrows; ++row) {
+    auto n = (row % 4) + 1;
+    ASSERT_EQ(row_map(row + 1) - row_map(row), n);
+    for (std::int32_t j = 0; j < n; ++j) {
+      ASSERT_EQ(entries(row_map(row) + j), j + 1);
+    }
+  }
+}
+
+} // anonymous namespace
+
+TEST_F( TEST_CATEGORY, crs_count_fill )
+{
+  test_count_fill<TEST_EXECSPACE>(0);
+  test_count_fill<TEST_EXECSPACE>(1);
+  test_count_fill<TEST_EXECSPACE>(2);
+  test_count_fill<TEST_EXECSPACE>(3);
+  test_count_fill<TEST_EXECSPACE>(13);
+  test_count_fill<TEST_EXECSPACE>(100);
+  test_count_fill<TEST_EXECSPACE>(1000);
+  test_count_fill<TEST_EXECSPACE>(10000);
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7fc879fbf4e44baa77b716a4d5ba567d2782e335
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
@@ -0,0 +1,477 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#ifdef KOKKOS_ENABLE_OPENMP
+#include <omp.h>
+#endif
+
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
+
+namespace Test {
+
+namespace Impl {
+
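+// Builds a synthetic argv for Kokkos::initialize(argc, argv); the values it
+// encodes are also stored in init_args so check_correct_initialization() can
+// verify that command-line parsing reached the same configuration.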
+char** init_kokkos_args( bool do_threads, bool do_numa, bool do_device, bool do_other, int & nargs, Kokkos::InitArguments & init_args ) {
+  nargs = ( do_threads ? 1 : 0 ) +
+          ( do_numa ? 1 : 0 ) +
+          ( do_device ? 1 : 0 ) +
+          ( do_other ? 4 : 0 );
+
+  char** args_kokkos = new char*[nargs];
+  for ( int i = 0; i < nargs; i++ ) {
+    args_kokkos[i] = new char[20];
+  }
+
+  int threads_idx = do_other ? 1 : 0;
+  int numa_idx = ( do_other ? 3 : 0 ) + ( do_threads ? 1 : 0 );
+  int device_idx = ( do_other ? 3 : 0 ) + ( do_threads ? 1 : 0 ) + ( do_numa ? 1 : 0 );
+
+  if ( do_threads ) {
+    int nthreads = 3;
+
+#ifdef KOKKOS_ENABLE_OPENMP
+    if ( omp_get_max_threads() < 3 )
+      nthreads = omp_get_max_threads();
+#endif
+
+    if ( Kokkos::hwloc::available() ) {
+      if ( Kokkos::hwloc::get_available_threads_per_core() < 3 )
+        nthreads =   Kokkos::hwloc::get_available_threads_per_core()
+                   * Kokkos::hwloc::get_available_numa_count();
+    }
+
+#ifdef KOKKOS_ENABLE_SERIAL
+    if ( std::is_same< Kokkos::Serial, Kokkos::DefaultExecutionSpace >::value ||
+         std::is_same< Kokkos::Serial, Kokkos::DefaultHostExecutionSpace >::value ) {
+      nthreads = 1;
+    }
+#endif
+
+    init_args.num_threads = nthreads;
+    sprintf( args_kokkos[threads_idx], "--threads=%i", nthreads );
+  }
+
+  if ( do_numa ) {
+    int numa = 1;
+    if ( Kokkos::hwloc::available() ) {
+      numa = Kokkos::hwloc::get_available_numa_count();
+    }
+
+#ifdef KOKKOS_ENABLE_SERIAL
+    if ( std::is_same< Kokkos::Serial, Kokkos::DefaultExecutionSpace >::value ||
+         std::is_same< Kokkos::Serial, Kokkos::DefaultHostExecutionSpace >::value ) {
+      numa = 1;
+    }
+#endif
+
+    init_args.num_numa = numa;
+    sprintf( args_kokkos[numa_idx], "--numa=%i", numa );
+  }
+
+  if ( do_device ) {
+    init_args.device_id = 0;
+    sprintf( args_kokkos[device_idx], "--device=%i", 0 );
+  }
+
+  if ( do_other ) {
+    sprintf( args_kokkos[0], "--dummyarg=1" );
+    sprintf( args_kokkos[ threads_idx + ( do_threads ? 1 : 0 ) ], "--dummy2arg" );
+    sprintf( args_kokkos[ threads_idx + ( do_threads ? 1 : 0 ) + 1 ], "dummy3arg" );
+    sprintf( args_kokkos[ device_idx + ( do_device ? 1 : 0 ) ], "dummy4arg=1" );
+  }
+
+  return args_kokkos;
+}
+
+Kokkos::InitArguments init_initstruct( bool do_threads, bool do_numa, bool do_device ) {
+  Kokkos::InitArguments args;
+
+  if ( do_threads ) {
+    int nthreads = 3;
+
+#ifdef KOKKOS_ENABLE_OPENMP
+    if ( omp_get_max_threads() < 3 ) {
+      nthreads = omp_get_max_threads();
+    }
+#endif
+
+    if ( Kokkos::hwloc::available() ) {
+      if ( Kokkos::hwloc::get_available_threads_per_core() < 3 ) {
+        nthreads =   Kokkos::hwloc::get_available_threads_per_core()
+                   * Kokkos::hwloc::get_available_numa_count();
+      }
+    }
+
+#ifdef KOKKOS_ENABLE_SERIAL
+    if ( std::is_same< Kokkos::Serial, Kokkos::DefaultExecutionSpace >::value ||
+         std::is_same< Kokkos::Serial, Kokkos::DefaultHostExecutionSpace >::value ) {
+      nthreads = 1;
+    }
+#endif
+
+    args.num_threads = nthreads;
+  }
+
+  if ( do_numa ) {
+    int numa = 1;
+    if ( Kokkos::hwloc::available() ) {
+      numa = Kokkos::hwloc::get_available_numa_count();
+    }
+
+#ifdef KOKKOS_ENABLE_SERIAL
+    if ( std::is_same< Kokkos::Serial, Kokkos::DefaultExecutionSpace >::value ||
+         std::is_same< Kokkos::Serial, Kokkos::DefaultHostExecutionSpace >::value ) {
+      numa = 1;
+    }
+#endif
+
+    args.num_numa = numa;
+  }
+
+  if ( do_device ) {
+    args.device_id = 0;
+  }
+
+  return args;
+}
+
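+// Derives the expected thread count and device id for the given InitArguments
+// (falling back to OpenMP and hwloc queries when fields are unset) and checks
+// them against the initialized runtime.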
+void check_correct_initialization( const Kokkos::InitArguments & argstruct ) {
+  ASSERT_EQ( Kokkos::DefaultExecutionSpace::is_initialized(), 1 );
+  ASSERT_EQ( Kokkos::HostSpace::execution_space::is_initialized(), 1 );
+
+  // Figure out the number of threads the HostSpace ExecutionSpace should have initialized to.
+  int expected_nthreads = argstruct.num_threads;
+
+#ifdef KOKKOS_ENABLE_OPENMP
+  if ( std::is_same< Kokkos::HostSpace::execution_space, Kokkos::OpenMP >::value ) {
+    // use openmp default num threads
+    if ( expected_nthreads < 0 || ( expected_nthreads == 0 && !Kokkos::hwloc::available() ) ) {
+      expected_nthreads = omp_get_max_threads();
+    }
+    // use hwloc if available
+    else if ( expected_nthreads == 0 && Kokkos::hwloc::available() ) {
+      expected_nthreads = Kokkos::hwloc::get_available_numa_count()
+                        * Kokkos::hwloc::get_available_cores_per_numa()
+                        * Kokkos::hwloc::get_available_threads_per_core();
+    }
+  }
+#endif
+
+  if ( expected_nthreads < 1 ) {
+    if ( Kokkos::hwloc::available() ) {
+      expected_nthreads = Kokkos::hwloc::get_available_numa_count()
+                        * Kokkos::hwloc::get_available_cores_per_numa()
+                        * Kokkos::hwloc::get_available_threads_per_core();
+    }
+    else {
+      expected_nthreads = 1;
+    }
+
+#ifdef KOKKOS_ENABLE_SERIAL
+    if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Serial >::value ||
+         std::is_same< Kokkos::DefaultHostExecutionSpace, Kokkos::Serial >::value ) {
+      expected_nthreads = 1;
+    }
+#endif
+  }
+
+  int expected_numa = argstruct.num_numa;
+
+  if ( expected_numa < 1 ) {
+    if ( Kokkos::hwloc::available() ) {
+      expected_numa = Kokkos::hwloc::get_available_numa_count();
+    }
+    else {
+      expected_numa = 1;
+    }
+
+#ifdef KOKKOS_ENABLE_SERIAL
+    if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Serial >::value ||
+         std::is_same< Kokkos::DefaultHostExecutionSpace, Kokkos::Serial >::value )
+      expected_numa = 1;
+#endif
+  }
+
+  ASSERT_EQ( Kokkos::HostSpace::execution_space::thread_pool_size(), expected_nthreads );
+
+
+#ifdef KOKKOS_ENABLE_CUDA
+  if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Cuda >::value ) {
+    int device;
+    cudaGetDevice( &device );
+
+    int expected_device = argstruct.device_id;
+    if ( argstruct.device_id < 0 ) {
+      expected_device = 0;
+    }
+
+    ASSERT_EQ( expected_device, device );
+  }
+#endif
+}
+
+// TODO: Add check whether correct number of threads are actually started.
+void test_no_arguments() {
+  Kokkos::initialize();
+  check_correct_initialization( Kokkos::InitArguments() );
+  Kokkos::finalize();
+}
+
+void test_commandline_args( int nargs, char** args, const Kokkos::InitArguments & argstruct ) {
+  Kokkos::initialize( nargs, args );
+  check_correct_initialization( argstruct );
+  Kokkos::finalize();
+}
+
+void test_initstruct_args( const Kokkos::InitArguments & args ) {
+  Kokkos::initialize( args );
+  check_correct_initialization( args );
+  Kokkos::finalize();
+}
+
+} // namespace Impl
+
+class defaultdevicetypeinit : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {}
+
+  static void TearDownTestCase() {}
+};
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01
+TEST_F( defaultdevicetypeinit, no_args )
+{
+  Impl::test_no_arguments();
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02
+TEST_F( defaultdevicetypeinit, commandline_args_empty )
+{
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args( false, false, false, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
+    delete [] args[i];
+  }
+  delete [] args;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03
+TEST_F( defaultdevicetypeinit, commandline_args_other )
+{
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args( false, false, false, true, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
+    delete [] args[i];
+  }
+  delete [] args;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads )
+{
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args( true, false, false, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
+    delete [] args[i];
+  }
+  delete [] args;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa )
+{
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args( true, true, false, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
+    delete [] args[i];
+  }
+  delete [] args;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device )
+{
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args( true, true, true, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
+    delete [] args[i];
+  }
+  delete [] args;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_device )
+{
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args( true, false, true, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
+    delete [] args[i];
+  }
+  delete [] args;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08
+TEST_F( defaultdevicetypeinit, commandline_args_numa_device )
+{
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args( false, true, true, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
+    delete [] args[i];
+  }
+  delete [] args;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09
+TEST_F( defaultdevicetypeinit, commandline_args_device )
+{
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args( false, false, true, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
+    delete [] args[i];
+  }
+  delete [] args;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device_other )
+{
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args( true, true, true, true, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
+    delete [] args[i];
+  }
+  delete [] args;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11
+TEST_F( defaultdevicetypeinit, initstruct_default )
+{
+  Kokkos::InitArguments args;
+  Impl::test_initstruct_args( args );
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12
+TEST_F( defaultdevicetypeinit, initstruct_nthreads )
+{
+  Kokkos::InitArguments args = Impl::init_initstruct( true, false, false );
+  Impl::test_initstruct_args( args );
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13
+TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa )
+{
+  Kokkos::InitArguments args = Impl::init_initstruct( true, true, false );
+  Impl::test_initstruct_args( args );
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14
+TEST_F( defaultdevicetypeinit, initstruct_device )
+{
+  Kokkos::InitArguments args = Impl::init_initstruct( false, false, true );
+  Impl::test_initstruct_args( args );
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15
+TEST_F( defaultdevicetypeinit, initstruct_nthreads_device )
+{
+  Kokkos::InitArguments args = Impl::init_initstruct( true, false, true );
+  Impl::test_initstruct_args( args );
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16
+TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa_device )
+{
+  Kokkos::InitArguments args = Impl::init_initstruct( true, true, true );
+  Impl::test_initstruct_args( args );
+}
+#endif
+
+} // namespace Test
+
+#endif
diff --git a/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp b/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e1dcbbdb4d0394bf4004ad713182164fa6fd2534
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp
@@ -0,0 +1,153 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef TEST_FUNCTOR_ANALYSIS_HPP
+#define TEST_FUNCTOR_ANALYSIS_HPP
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
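+// FunctorAnalysis is exercised on three functor kinds: a parallel_for lambda
+// with no reduction argument (void value_type), a parallel_reduce lambda
+// reducing into a double, and a functor with an aggregate value_type plus
+// explicit join/init member functions.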
+struct TestFunctorAnalysis_03 {
+
+  struct value_type { double x[2]; };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int , value_type & ) const {}
+
+  KOKKOS_INLINE_FUNCTION
+  void join( value_type volatile & , value_type const volatile & ) const {}
+
+  KOKKOS_INLINE_FUNCTION static
+  void init( value_type & ) {}
+};
+
+
+template< class ExecSpace >
+void test_functor_analysis()
+{
+  //------------------------------
+  auto c01 = KOKKOS_LAMBDA(int){} ;
+  typedef Kokkos::Impl::FunctorAnalysis
+            < Kokkos::Impl::FunctorPatternInterface::FOR
+            , Kokkos::RangePolicy< ExecSpace >
+            , decltype( c01 ) >
+    A01 ;
+
+  typedef typename A01::template Reducer< typename ExecSpace::memory_space >
+    R01 ;
+
+  static_assert( std::is_same< typename A01::value_type , void >::value , "" );
+  static_assert( std::is_same< typename A01::pointer_type , void >::value , "" );
+  static_assert( std::is_same< typename A01::reference_type , void >::value , "" );
+  static_assert( std::is_same< typename R01::functor_type , decltype(c01) >::value , "" );
+
+  static_assert( ! A01::has_join_member_function , "" );
+  static_assert( ! A01::has_init_member_function , "" );
+  static_assert( ! A01::has_final_member_function , "" );
+  static_assert( A01::StaticValueSize == 0 , "" );
+  ASSERT_EQ( R01( & c01 ).length() , 0 );
+
+  //------------------------------
+  auto c02 = KOKKOS_LAMBDA(int,double&){} ;
+  typedef Kokkos::Impl::FunctorAnalysis
+    < Kokkos::Impl::FunctorPatternInterface::REDUCE
+    , Kokkos::RangePolicy< ExecSpace >
+    , decltype( c02 ) >
+    A02 ;
+  typedef typename A02::template Reducer< typename ExecSpace::memory_space >
+    R02 ;
+
+  static_assert( std::is_same< typename A02::value_type , double >::value , "" );
+  static_assert( std::is_same< typename A02::pointer_type , double * >::value , "" );
+  static_assert( std::is_same< typename A02::reference_type , double & >::value , "" );
+  static_assert( std::is_same< typename R02::functor_type , decltype(c02) >::value , "" );
+
+  static_assert( ! A02::has_join_member_function , "" );
+  static_assert( ! A02::has_init_member_function , "" );
+  static_assert( ! A02::has_final_member_function , "" );
+  static_assert( A02::StaticValueSize == sizeof(double) , "" );
+  ASSERT_EQ( R02( & c02 ).length() , 1 );
+
+  //------------------------------
+  
+  TestFunctorAnalysis_03 c03 ;
+  typedef Kokkos::Impl::FunctorAnalysis
+    < Kokkos::Impl::FunctorPatternInterface::REDUCE
+    , Kokkos::RangePolicy< ExecSpace >
+    , TestFunctorAnalysis_03 > 
+    A03 ;
+  typedef typename A03::template Reducer< typename ExecSpace::memory_space >
+    R03 ;
+
+  static_assert( std::is_same< typename A03::value_type , TestFunctorAnalysis_03::value_type >::value , "" );
+  static_assert( std::is_same< typename A03::pointer_type , TestFunctorAnalysis_03::value_type * >::value , "" );
+  static_assert( std::is_same< typename A03::reference_type , TestFunctorAnalysis_03::value_type & >::value , "" );
+  static_assert( std::is_same< typename R03::functor_type , TestFunctorAnalysis_03 >::value , "" );
+
+  static_assert( A03::has_join_member_function , "" );
+  static_assert( A03::has_init_member_function , "" );
+  static_assert( ! A03::has_final_member_function , "" );
+  static_assert( A03::StaticValueSize == sizeof(TestFunctorAnalysis_03::value_type) , "" );
+  ASSERT_EQ( R03( & c03 ).length() , 1 );
+
+  //------------------------------
+
+}
+
+TEST_F( TEST_CATEGORY , functor_analysis )
+{
+  test_functor_analysis< TEST_EXECSPACE >();
+}
+
+} // namespace Test
+
+/*--------------------------------------------------------------------------*/
+
+#endif /* #ifndef TEST_FUNCTOR_ANALYSIS_HPP */
+
diff --git a/packages/kokkos/core/unit_test/TestHWLOC.cpp b/packages/kokkos/core/unit_test/TestHWLOC.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..40a4a441eb4be9034463a75ce1532fac7617239e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestHWLOC.cpp
@@ -0,0 +1,67 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+
+#include <Kokkos_hwloc.hpp>
+
+namespace Test {
+
+class hwloc : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {}
+
+  static void TearDownTestCase() {}
+};
+
+TEST_F( hwloc, query )
+{
+  std::cout << " NUMA[" << Kokkos::hwloc::get_available_numa_count() << "]"
+            << " CORE[" << Kokkos::hwloc::get_available_cores_per_numa() << "]"
+            << " PU[" << Kokkos::hwloc::get_available_threads_per_core()  << "]"
+            << std::endl;
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestInit.hpp b/packages/kokkos/core/unit_test/TestInit.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1f3aef7141f47cc2c4afc0fc4efc4ae2c6014016
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestInit.hpp
@@ -0,0 +1,76 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+TEST_F( TEST_CATEGORY, init )
+{
+   ;
+}
+
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+
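+// Repeatedly launches empty kernels over ranges of length 0..99 to exercise
+// dispatch overhead and zero-length ranges.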
+template<class ExecSpace>
+void test_dispatch () {
+  const int repeat = 100;
+  for ( int i = 0; i < repeat; ++i ) {
+    for ( int j = 0; j < repeat; ++j ) {
+      Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >( 0, j )
+                          , KOKKOS_LAMBDA( int ) {} );
+    }
+  }
+}
+
+TEST_F( TEST_CATEGORY, dispatch )
+{
+  test_dispatch<TEST_EXECSPACE>();
+}
+#endif
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestMDRange.hpp b/packages/kokkos/core/unit_test/TestMDRange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..48d8c290b5a53e72cdbe388fe5fe0e6c3ebde1ba
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestMDRange.hpp
@@ -0,0 +1,3118 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+namespace {
+
+using namespace Kokkos;
+
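+// Array-valued MDRange reductions: value_type is scalar_type[] and the runtime
+// length is taken from the functor's value_count member, so init()/join()
+// operate element-wise over the whole array.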
+template <typename ExecSpace >
+struct TestMDRange_ReduceArray_2D {
+
+  using DataType       = int;
+  using ViewType_2     = typename Kokkos::View< DataType**, ExecSpace >;
+  using HostViewType_2 = typename ViewType_2::HostMirror;
+
+  ViewType_2 input_view;
+
+  using scalar_type = double;
+  using value_type = scalar_type[];
+  const unsigned value_count;
+
+  TestMDRange_ReduceArray_2D( const int N0, const int N1, const unsigned array_size ) 
+    : input_view( "input_view", N0, N1 ) 
+    , value_count( array_size )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init( scalar_type dst[] ) const
+  {
+    for ( unsigned i = 0; i < value_count; ++i ) {
+      dst[i] = 0.0;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile scalar_type dst[],
+             const volatile scalar_type src[] ) const
+  {
+    for ( unsigned i = 0; i < value_count; ++i ) {
+        dst[i] += src[i];
+    }
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j ) const
+  {
+    input_view( i, j ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, value_type lsum ) const
+  {
+    lsum[0] += input_view( i, j ) * 2; //+=6 each time if InitTag => N0*N1*6
+    lsum[1] += input_view( i, j ) ;    //+=3 each time if InitTag => N0*N1*3
+  }
+
+  // tagged operators
+  struct InitTag {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j ) const
+  {
+    input_view( i, j ) = 3;
+  }
+
+  static void test_arrayreduce2( const int N0, const int N1 )
+  {
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2>, Kokkos::IndexType<int>, InitTag > range_type_init;
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type_init range_init( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+
+      const unsigned array_size = 2;
+
+      TestMDRange_ReduceArray_2D functor( N0, N1, array_size );
+
+      parallel_for( range_init, functor ); // Init the view to 3's
+
+      double sums[ array_size ];
+      parallel_reduce( range, functor, sums );
+
+      // Check output
+      //printf("Array Reduce result. N0 = %d  N1 = %d  N0*N1 = %d  sums[0] = %lf  sums[1] = %lf \n", N0, N1, N0*N1, sums[0], sums[1]);
+
+      ASSERT_EQ( sums[0], 6 * N0 * N1 );
+      ASSERT_EQ( sums[1], 3 * N0 * N1 );
+    }
+  }
+};
+
+template <typename ExecSpace >
+struct TestMDRange_ReduceArray_3D {
+
+  using DataType       = int;
+  using ViewType_3     = typename Kokkos::View< DataType***, ExecSpace >;
+  using HostViewType_3 = typename ViewType_3::HostMirror;
+
+  ViewType_3 input_view;
+
+  using scalar_type = double;
+  using value_type = scalar_type[];
+  const unsigned value_count;
+
+  TestMDRange_ReduceArray_3D( const int N0, const int N1, const int N2, const unsigned array_size ) 
+    : input_view( "input_view", N0, N1, N2 ) 
+    , value_count( array_size )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init( scalar_type dst[] ) const
+  {
+    for ( unsigned i = 0; i < value_count; ++i ) {
+      dst[i] = 0.0;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile scalar_type dst[],
+             const volatile scalar_type src[] ) const
+  {
+    for ( unsigned i = 0; i < value_count; ++i ) {
+        dst[i] += src[i];
+    }
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k ) const
+  {
+    input_view( i, j, k ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, value_type lsum ) const
+  {
+    lsum[0] += input_view( i, j, k ) * 2; //+=6 each time if InitTag => N0*N1*N2*6
+    lsum[1] += input_view( i, j, k ) ;    //+=3 each time if InitTag => N0*N1*N2*3
+  }
+
+  // tagged operators
+  struct InitTag {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, const int k ) const
+  {
+    input_view( i, j, k ) = 3;
+  }
+
+  static void test_arrayreduce3( const int N0, const int N1, const int N2 )
+  {
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3>, Kokkos::IndexType<int>, InitTag > range_type_init;
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type_init range_init( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+
+      const unsigned array_size = 2;
+
+      TestMDRange_ReduceArray_3D functor( N0, N1, N2, array_size );
+
+      parallel_for( range_init, functor ); // Init the view to 3's
+
+      double sums[ array_size ];
+      parallel_reduce( range, functor, sums );
+
+      ASSERT_EQ( sums[0], 6 * N0 * N1 * N2 );
+      ASSERT_EQ( sums[1], 3 * N0 * N1 * N2 );
+    }
+  }
+};
+
+
+template <typename ExecSpace >
+struct TestMDRange_2D {
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType**, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view;
+  using value_type = double;
+
+  TestMDRange_2D( const DataType N0, const DataType N1 ) : input_view( "input_view", N0, N1 ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j ) const
+  {
+    input_view( i, j ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, value_type &lsum ) const
+  {
+    lsum += input_view( i, j ) * 2;
+  }
+
+  // tagged operators
+  struct InitTag {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j ) const
+  {
+    input_view( i, j ) = 3;
+  }
+
+  // reduction tagged operators
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, value_type &lsum ) const
+  {
+    lsum += input_view( i, j ) * 3;
+  }
+
+  static void test_reduce2( const int N0, const int N1 )
+  {
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      double sum = 0.0;
+      parallel_reduce( range
+        , KOKKOS_LAMBDA(const int i, const int j, double &lsum) {
+            lsum+=1.0;
+          }
+        , sum );
+      ASSERT_EQ( sum, N0 * N1 );
+    }
+#endif
+#endif
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    // Test with reducers - scalar
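+    // The Sum reducer wraps a host-side scalar; note the offset lower bounds
+    // (s0, s1), so the expected result is 2 * (N0 - s0) * (N1 - s1).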
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
+      int s0 = 1;
+      int s1 = 1;
+      range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+
+      value_type sum = 0.0;
+      Kokkos::Experimental::Sum< value_type > reducer_scalar( sum );
+
+      parallel_reduce( range, functor, reducer_scalar );
+
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) );
+    }
+    // Test with reducers - scalar view
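+    // Same as above, but the Sum reducer wraps a rank-0 host View and the
+    // result is read back through sum_view() after the reduce.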
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
+      range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+
+      value_type sum = 0.0;
+      Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view");
+      sum_view() = sum;
+      Kokkos::Experimental::Sum< value_type > reducer_view( sum_view );
+
+      parallel_reduce( range, functor, reducer_view);
+      sum = sum_view();
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    // Tagged operator test
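+    // Passing InitTag as the policy's work tag selects the tagged operator()
+    // overloads: the parallel_for writes 3 and the parallel_reduce adds
+    // 3 * input_view(i,j), giving the expected sum of 9*N0*N1.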
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 4 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+
+      // check parallel_for results correct with InitTag
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+      ASSERT_EQ( counter, 0 );
+
+
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 9 * N0 * N1 );
+    }
+
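+    // The remaining blocks repeat the same fill + reduce for several
+    // combinations of outer/inner iteration order (Default/Left/Right) with
+    // assorted tile sizes; the result must not depend on the traversal order.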
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 6 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 6 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 6 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 6 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 6 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+  } // end test_reduce2
+
+  static void test_for2( const int N0, const int N1 )
+  {
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      const int s0 = 1;
+      const int s1 = 1;
+
+      range_type range( point_type{ { s0, s1 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+
+      TestMDRange_2D::ViewType v( "v", N0, N1 );
+
+      parallel_for( range, 
+          KOKKOS_LAMBDA ( const int i , const int j ) {
+            v(i, j) = 3;
+          }
+        );
+
+      TestMDRange_2D::HostViewType h_view = Kokkos::create_mirror_view( v );
+      Kokkos::deep_copy( h_view, v );
+
+      int counter = 0;
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+#endif
+#endif
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2>, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      const int s0 = 1;
+      const int s1 = 1;
+      range_type range( point_type{ { s0, s1 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2>, InitTag > range_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } } );
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Default Layouts + InitTag op() + Default Tile: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "No info: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 4, 4 } } );
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "D D: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "L L: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 7, 7 } } );
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "L R: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 16, 16 } } );
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "R L: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 5, 16 } } );
+      TestMDRange_2D functor( N0, N1 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "R R: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+  } // end test_for2
+}; // end TestMDRange_2D
+
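+// TestMDRange_3D: rank-3 analogue of the 2D functor above (fill with 1, or 3
+// via InitTag; reductions accumulate 2x, or 3x for the tagged overload).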
+template <typename ExecSpace >
+struct TestMDRange_3D {
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType***, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view;
+  using value_type = double;
+
+  TestMDRange_3D( const DataType N0, const DataType N1, const DataType N2 ) : input_view( "input_view", N0, N1, N2 ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k ) const
+  {
+    input_view( i, j, k ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, double &lsum ) const
+  {
+    lsum += input_view( i, j, k ) * 2;
+  }
+
+  // tagged operators
+  struct InitTag {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, const int k ) const
+  {
+    input_view( i, j, k ) = 3;
+  }
+
+  // reduction tagged operators
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, const int k, value_type &lsum ) const
+  {
+    lsum += input_view( i, j, k ) * 3;
+  }
+
+  static void test_reduce3( const int N0, const int N1, const int N2 )
+  {
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      double sum = 0.0;
+      parallel_reduce( range
+        , KOKKOS_LAMBDA(const int i, const int j, const int k, double &lsum) {
+            lsum+=1.0;
+          }
+        , sum );
+      ASSERT_EQ( sum, N0 * N1 * N2 );
+    }
+#endif
+#endif
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) );
+    }
+
+    // Test with reducers - scalar
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3>, Kokkos::IndexType<int> > range_type;
+      range_type range( {{ 0, 0, 0 }}, {{ N0, N1, N2 }}, {{ 3, 3, 3 }} );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+
+      value_type sum = 0.0;
+      Kokkos::Experimental::Sum< value_type > reducer_scalar( sum );
+
+      parallel_reduce( range, functor, reducer_scalar );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+    // Test with reducers - scalar view
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3>, Kokkos::IndexType<int> > range_type;
+      range_type range( {{ 0, 0, 0 }}, {{ N0, N1, N2 }}, {{ 3, 3, 3 }} );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+
+      value_type sum = 0.0;
+      Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view");
+      sum_view() = sum;
+      Kokkos::Experimental::Sum< value_type > reducer_view( sum_view );
+
+      parallel_reduce( range, functor, reducer_view);
+      sum = sum_view();
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+
+    // Tagged operator test
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+
+      // check parallel_for results correct with InitTag
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+      ASSERT_EQ( counter, 0 );
+
+
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 9 * N0 * N1 * N2 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+  } // end test_reduce3
+
+  static void test_for3( const int N0, const int N1, const int N2 )
+  {
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      const int s0 = 1;
+      const int s1 = 1;
+      const int s2 = 1;
+
+      range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+
+      TestMDRange_3D::ViewType v( "v", N0, N1, N2 );
+
+      parallel_for( range, 
+          KOKKOS_LAMBDA ( const int i , const int j , const int k ) {
+            v(i, j, k) = 3;
+          }
+        );
+
+      TestMDRange_3D::HostViewType h_view = Kokkos::create_mirror_view( v );
+      Kokkos::deep_copy( h_view, v );
+
+      int counter = 0;
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+#endif
+#endif
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3> > range_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + No Tile: Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3>, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Offset Start + Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 2 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 5, 7 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 8, 8, 8 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 2 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+  } // end test_for3
+};
+
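+// TestMDRange_4D: rank-4 analogue of the functors above.  test_for4 also
+// exercises a case where a tile extent may exceed the range extent in one
+// dimension (the "m_tile > m_upper" case below).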
+template <typename ExecSpace >
+struct TestMDRange_4D {
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType****, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view;
+  using value_type = double;
+
+  TestMDRange_4D( const DataType N0, const DataType N1, const DataType N2, const DataType N3 ) : input_view( "input_view", N0, N1, N2, N3 ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l ) const
+  {
+    input_view( i, j, k, l ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, double &lsum ) const
+  {
+    lsum += input_view( i, j, k, l ) * 2;
+  }
+
+  // tagged operators
+  struct InitTag {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, const int k, const int l ) const
+  {
+    input_view( i, j, k, l ) = 3;
+  }
+
+  // reduction tagged operators
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, const int k, const int l, value_type &lsum ) const
+  {
+    lsum += input_view( i, j, k, l ) * 3;
+  }
+
+  static void test_reduce4( const int N0, const int N1, const int N2, const int N3 )
+  {
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );
+      double sum = 0.0;
+      parallel_reduce( range
+        , KOKKOS_LAMBDA(const int i, const int j, const int k, const int l, double &lsum) {
+            lsum+=1.0;
+          }
+        , sum );
+      ASSERT_EQ( sum, N0 * N1 * N2 * N3 );
+    }
+#endif
+#endif
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) );
+    }
+
+    // Test with reducers - scalar
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4>, Kokkos::IndexType<int> > range_type;
+      range_type range( {{ 0, 0, 0, 0 }}, {{ N0, N1, N2, N3 }}, {{ 3, 3, 3, 3 }} );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+
+      value_type sum = 0.0;
+      Kokkos::Experimental::Sum< value_type > reducer_scalar( sum );
+
+      parallel_reduce( range, functor, reducer_scalar );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 );
+    }
+
+    // Test with reducers - scalar view
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4>, Kokkos::IndexType<int> > range_type;
+      range_type range( {{ 0, 0, 0, 0 }}, {{ N0, N1, N2, N3 }}, {{ 3, 3, 3, 3 }} );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+
+      value_type sum = 0.0;
+      Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view");
+      sum_view() = sum;
+      Kokkos::Experimental::Sum< value_type > reducer_view( sum_view );
+
+      parallel_reduce( range, functor, reducer_view);
+      sum = sum_view();
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 );
+    }
+
+    // Tagged operator test
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+
+      // check parallel_for results correct with InitTag
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + InitTag op(): Errors in test_reduce4 parallel_for init; mismatches = %d\n\n", counter );
+      }
+      ASSERT_EQ( counter, 0 );
+
+
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 9 * N0 * N1 * N2 * N3 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 );
+    }
+  } // end test_reduce4
+
+
+
+  static void test_for4( const int N0, const int N1, const int N2, const int N3 )
+  {
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      const int s0 = 1;
+      const int s1 = 1;
+      const int s2 = 1;
+      const int s3 = 1;
+
+      range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );
+
+      TestMDRange_4D::ViewType v( "v", N0, N1, N2, N3 );
+
+      parallel_for( range, 
+          KOKKOS_LAMBDA ( const int i , const int j , const int k, const int l ) {
+            v(i, j, k, l) = 3;
+          }
+        );
+
+      TestMDRange_4D::HostViewType h_view = Kokkos::create_mirror_view( v );
+      Kokkos::deep_copy( h_view, v );
+
+      int counter = 0;
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+#endif
+#endif
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4> > range_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } } );
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + No Tile: Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4>, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf("Offset Start + Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+  } // end test_for4
+};
+
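+// TestMDRange_5D: rank-5 analogue; reductions follow the same 2x / 3x (InitTag)
+// pattern, and test_for5 covers offset-start, no-tile, and tiled variants.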
+template <typename ExecSpace >
+struct TestMDRange_5D {
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType*****, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view;
+  using value_type = double;
+
+  TestMDRange_5D( const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4 ) : input_view( "input_view", N0, N1, N2, N3, N4 ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m ) const
+  {
+    input_view( i, j, k, l, m ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m, value_type &lsum ) const
+  {
+    lsum += input_view( i, j, k, l, m ) * 2;
+  }
+
+  // tagged operators
+  struct InitTag {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, const int k, const int l, const int m ) const
+  {
+    input_view( i, j, k, l, m ) = 3;
+  }
+
+  // reduction tagged operators
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, const int k, const int l, const int m, value_type &lsum ) const
+  {
+    lsum += input_view( i, j, k, l, m ) * 3;
+  }
+
+  static void test_reduce5( const int N0, const int N1, const int N2, const int N3, const int N4 )
+  {
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<5>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 1 } } );
+      double sum = 0.0;
+      parallel_reduce( range
+        , KOKKOS_LAMBDA(const int i, const int j, const int k, const int l, const int m, double &lsum) {
+            lsum+=1.0;
+          }
+        , sum );
+      ASSERT_EQ( sum, N0 * N1 * N2 * N3 * N4 );
+    }
+#endif
+#endif
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<5>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) );
+    }
+
+    // Test with reducers - scalar
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<5>, Kokkos::IndexType<int> > range_type;
+      range_type range( {{ 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4 }}, {{ 3, 3, 3, 3, 3 }} );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      parallel_for( range, functor );
+
+      value_type sum = 0.0;
+      Kokkos::Experimental::Sum< value_type > reducer_scalar( sum );
+
+      parallel_reduce( range, functor, reducer_scalar );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 );
+    }
+
+    // Test with reducers - scalar view
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<5>, Kokkos::IndexType<int> > range_type;
+      range_type range( {{ 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4 }}, {{ 3, 3, 3, 3, 3 }} );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      parallel_for( range, functor );
+
+      value_type sum = 0.0;
+      Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view");
+      sum_view() = sum;
+      Kokkos::Experimental::Sum< value_type > reducer_view( sum_view );
+
+      parallel_reduce( range, functor, reducer_view);
+      sum = sum_view();
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 );
+    }
+
+    // Tagged operator test
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<5, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 2, 4, 6, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      parallel_for( range, functor );
+
+      // check parallel_for results correct with InitTag
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + InitTag op(): Errors in test_reduce5 parallel_for init; mismatches = %d\n\n", counter );
+      }
+      ASSERT_EQ( counter, 0 );
+
+
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 9 * N0 * N1 * N2 * N3 * N4 );
+    }
+  }
+
+  static void test_for5( const int N0, const int N1, const int N2, const int N3, const int N4 )
+  {
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<5>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      const int s0 = 1;
+      const int s1 = 1;
+      const int s2 = 1;
+      const int s3 = 1;
+      const int s4 = 1;
+
+      range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 1 } } );
+
+      TestMDRange_5D::ViewType v( "v", N0, N1, N2, N3, N4 );
+
+      parallel_for( range, 
+          KOKKOS_LAMBDA ( const int i , const int j , const int k, const int l, const int m ) {
+            v(i, j, k, l, m) = 3;
+          }
+        );
+
+      TestMDRange_5D::HostViewType h_view = Kokkos::create_mirror_view( v );
+      Kokkos::deep_copy( h_view, v );
+
+      int counter = 0;
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
+      for ( int m = s4; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+#endif
+#endif
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<5> > range_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } } );
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + No Tile: Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<5>, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } );
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
+      for ( int m = s4; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Offset Start + Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<5>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<5, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<5, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<5, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<5, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<5, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+  }
+};
+
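+// Rank-6 analogue of the functors above: plain and InitTag-tagged operators fill a
+// 6D View and accumulate a reduction; exercised by test_for6 and test_reduce6.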
+template <typename ExecSpace >
+struct TestMDRange_6D {
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType******, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view;
+  using value_type = double;
+
+  TestMDRange_6D( const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4, const DataType N5 ) : input_view( "input_view", N0, N1, N2, N3, N4, N5 ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m, const int n ) const
+  {
+    input_view( i, j, k, l, m, n ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m, const int n, value_type &lsum ) const
+  {
+    lsum += input_view( i, j, k, l, m, n ) * 2;
+  }
+
+  // tagged operators
+  struct InitTag {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, const int k, const int l, const int m, const int n ) const
+  {
+    input_view( i, j, k, l, m, n ) = 3;
+  }
+
+  // reduction tagged operators
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, const int k, const int l, const int m, const int n, value_type &lsum ) const
+  {
+    lsum += input_view( i, j, k, l, m, n ) * 3;
+  }
+
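+  // Reduction tests over the 6D range: a lambda reduction, a functor with an offset
+  // start, scalar and View-based Sum reducers, and the InitTag-tagged operator.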
+  static void test_reduce6( const int N0, const int N1, const int N2, const int N3, const int N4, const int N5 )
+  {
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<6>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 1, 1 } } );
+      double sum = 0.0;
+      parallel_reduce( range
+        , KOKKOS_LAMBDA(const int i, const int j, const int k, const int l, const int m, const int n, double &lsum) {
+            lsum+=1.0;
+          }
+        , sum );
+      ASSERT_EQ( sum, N0 * N1 * N2 * N3 * N4 * N5 );
+    }
+#endif
+#endif
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<6>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      int s5 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) * (N5 - s5) );
+    }
+
+    // Test with reducers - scalar
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<6>, Kokkos::IndexType<int> > range_type;
+      range_type range( {{ 0, 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4, N5 }}, {{ 3, 3, 3, 3, 3, 2 }} );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      parallel_for( range, functor );
+
+      value_type sum = 0.0;
+      Kokkos::Experimental::Sum< value_type > reducer_scalar( sum );
+
+      parallel_reduce( range, functor, reducer_scalar );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 );
+    }
+
+    // Test with reducers - scalar view
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<6>, Kokkos::IndexType<int> > range_type;
+      range_type range( {{ 0, 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4, N5 }}, {{ 3, 3, 3, 3, 3, 2 }} );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      parallel_for( range, functor );
+
+      value_type sum = 0.0;
+      Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view");
+      sum_view() = sum;
+      Kokkos::Experimental::Sum< value_type > reducer_view( sum_view );
+
+      parallel_reduce( range, functor, reducer_view);
+      sum = sum_view();
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 );
+    }
+
+    // Tagged operator test
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<6, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 2, 4, 6, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      parallel_for( range, functor );
+
+      // check parallel_for results correct with InitTag
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + InitTag op(): Errors in test_reduce6 parallel_for init; mismatches = %d\n\n", counter );
+      }
+      ASSERT_EQ( counter, 0 );
+
+
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 9 * N0 * N1 * N2 * N3 * N4 * N5 );
+    }
+  }
+
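+  // parallel_for tests over the 6D range: an offset start driven by a lambda, a default
+  // policy without tile dims, the tagged functor with an offset start, and every
+  // Iterate Left/Right combination with explicit tiles.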
+  static void test_for6( const int N0, const int N1, const int N2, const int N3, const int N4, const int N5 )
+  {
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<6>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      const int s0 = 1;
+      const int s1 = 1;
+      const int s2 = 1;
+      const int s3 = 1;
+      const int s4 = 1;
+      const int s5 = 1;
+
+      range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 1, 1 } } );
+
+      TestMDRange_6D::ViewType v( "v", N0, N1, N2, N3, N4, N5 );
+
+      parallel_for( range, 
+          KOKKOS_LAMBDA ( const int i , const int j , const int k, const int l, const int m, const int n ) {
+            v(i, j, k, l, m, n) = 3;
+          }
+        );
+
+      TestMDRange_6D::HostViewType h_view = Kokkos::create_mirror_view( v );
+      Kokkos::deep_copy( h_view, v );
+
+      int counter = 0;
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
+      for ( int m = s4; m < N4; ++m )
+      for ( int n = s5; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+#endif
+#endif
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<6> > range_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } } );
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + No Tile: Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<6>, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      int s5 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); // tile dims of 3,3,3,3,3,3 are more than CUDA can handle with debugging enabled
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
+      for ( int m = s4; m < N4; ++m )
+      for ( int n = s5; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Offset Start + Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<6>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<6, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<6, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<6, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<6, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<6, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+  }
+};
+
+
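+// Negative-index variants: the policy bounds start below zero, so each functor shifts
+// the incoming indices by the stored lower_offset before accessing the zero-based View.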
+template <typename ExecSpace >
+struct TestMDRange_2D_NegIdx {
+
+  using value_type = double;
+
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType**, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view;
+  DataType lower_offset[2];
+
+  TestMDRange_2D_NegIdx( const DataType L0, const DataType L1, const DataType N0, const DataType N1 ) : input_view( "input_view", N0 - L0, N1 - L1 ) 
+  {
+    lower_offset[0] = L0;
+    lower_offset[1] = L1;
+  }
+
+  // When using negative indices, the View access must be offset by the lower bound since a View cannot take a negative index
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j ) const
+  {
+    input_view( i - lower_offset[0], j - lower_offset[1] ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, value_type &lsum ) const
+  {
+    lsum += input_view( i - lower_offset[0], j - lower_offset[1] ) * 2;
+  }
+
+  static void test_2D_negidx( const int N0, const int N1 )
+  {
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      const point_type lower{{-1, -1}};
+      const point_type upper{{N0, N1}};
+      const tile_type  tile{{8,8}};
+
+      range_type range( point_type{{ lower[0], lower[1] }}, point_type{{ upper[0], upper[1] }}, tile_type{{ tile[0], tile[1] }} );
+
+      TestMDRange_2D_NegIdx functor( lower[0], lower[1], upper[0], upper[1] );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) );
+    }
+  }
+};
+
+template <typename ExecSpace >
+struct TestMDRange_3D_NegIdx {
+
+  using value_type = double;
+
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType***, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view;
+  DataType lower_offset[3];
+
+  TestMDRange_3D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType N0, const DataType N1, const DataType N2 ) : input_view( "input_view", N0 - L0, N1 - L1, N2 - L2 ) 
+  {
+    lower_offset[0] = L0;
+    lower_offset[1] = L1;
+    lower_offset[2] = L2;
+  }
+
+  // When using negative indices, the View access must be offset by the lower bound since a View cannot take a negative index
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k ) const
+  {
+    input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2] ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, value_type &lsum ) const
+  {
+    lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2] ) * 2;
+  }
+
+  static void test_3D_negidx( const int N0, const int N1, const int N2 )
+  {
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<3>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      const point_type lower{{-1, -1, -1}};
+      const point_type upper{{N0, N1, N2}};
+      const tile_type  tile{{8,8,2}};
+
+      range_type range( point_type{{ lower[0], lower[1], lower[2] }}, point_type{{ upper[0], upper[1], upper[2] }}, tile_type{{ tile[0], tile[1], tile[2] }} );
+
+      TestMDRange_3D_NegIdx functor( lower[0], lower[1], lower[2], upper[0], upper[1], upper[2] );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) );
+    }
+  }
+};
+
+template <typename ExecSpace >
+struct TestMDRange_4D_NegIdx {
+
+  using value_type = double;
+
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType****, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view;
+  DataType lower_offset[4];
+
+  TestMDRange_4D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType L3, const DataType N0, const DataType N1, const DataType N2, const DataType N3 ) : input_view( "input_view", N0 - L0, N1 - L1, N2 - L2, N3 - L3 ) 
+  {
+    lower_offset[0] = L0;
+    lower_offset[1] = L1;
+    lower_offset[2] = L2;
+    lower_offset[3] = L3;
+  }
+
+  // When using negative indices, the View access must be offset by the lower bound since a View cannot take a negative index
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l ) const
+  {
+    input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3] ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, value_type &lsum ) const
+  {
+    lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3] ) * 2;
+  }
+
+  static void test_4D_negidx( const int N0, const int N1, const int N2, const int N3 )
+  {
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<4>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      const point_type lower{{-1, -1, -1, -1}};
+      const point_type upper{{N0, N1, N2, N3}};
+      const tile_type  tile{{8,8,2,2}};
+
+      range_type range( point_type{{ lower[0], lower[1], lower[2], lower[3] }}, point_type{{ upper[0], upper[1], upper[2], upper[3] }}, tile_type{{ tile[0], tile[1], tile[2], tile[3] }} );
+
+      TestMDRange_4D_NegIdx functor( lower[0], lower[1], lower[2], lower[3], upper[0], upper[1], upper[2], upper[3] );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) * (upper[3] - lower[3]) );
+    }
+  }
+};
+
+template <typename ExecSpace >
+struct TestMDRange_5D_NegIdx {
+
+  using value_type = double;
+
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType*****, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view;
+  DataType lower_offset[5];
+
+  TestMDRange_5D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType L3, const DataType L4, const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4 ) : input_view( "input_view", N0 - L0, N1 - L1, N2 - L2, N3 - L3, N4 - L4 ) 
+  {
+    lower_offset[0] = L0;
+    lower_offset[1] = L1;
+    lower_offset[2] = L2;
+    lower_offset[3] = L3;
+    lower_offset[4] = L4;
+  }
+
+  // When using negative indices, the View access must be offset by the lower bound since a View cannot take a negative index
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m ) const
+  {
+    input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4] ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m, value_type &lsum ) const
+  {
+    lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4] ) * 2;
+  }
+
+  static void test_5D_negidx( const int N0, const int N1, const int N2, const int N3, const int N4 )
+  {
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<5>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      const point_type lower{{-1, -1, -1, -1, -1}};
+      const point_type upper{{N0, N1, N2, N3, N4}};
+      const tile_type  tile{{8,4,2,2,2}};
+
+      range_type range( point_type{{ lower[0], lower[1], lower[2], lower[3], lower[4] }}, point_type{{ upper[0], upper[1], upper[2], upper[3], upper[4] }}, tile_type{{ tile[0], tile[1], tile[2], tile[3], tile[4] }} );
+
+      TestMDRange_5D_NegIdx functor( lower[0], lower[1], lower[2], lower[3], lower[4], upper[0], upper[1], upper[2], upper[3], upper[4] );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) * (upper[3] - lower[3]) * (upper[4] - lower[4]) );
+    }
+  }
+};
+
+template <typename ExecSpace >
+struct TestMDRange_6D_NegIdx {
+
+  using value_type = double;
+
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType******, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view;
+  DataType lower_offset[6];
+
+  TestMDRange_6D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType L3, const DataType L4, const DataType L5, const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4, const DataType N5 ) : input_view( "input_view", N0 - L0, N1 - L1, N2 - L2, N3 - L3, N4 - L4, N5 - L5 ) 
+  {
+    lower_offset[0] = L0;
+    lower_offset[1] = L1;
+    lower_offset[2] = L2;
+    lower_offset[3] = L3;
+    lower_offset[4] = L4;
+    lower_offset[5] = L5;
+  }
+
+  // When using negative indices, the View access must be offset by the lower bound since a View cannot take a negative index
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m, const int n ) const
+  {
+    input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4], n - lower_offset[5] ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m, const int n, value_type &lsum ) const
+  {
+    lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4], n - lower_offset[5] ) * 2;
+  }
+
+  static void test_6D_negidx( const int N0, const int N1, const int N2, const int N3, const int N4, const int N5 )
+  {
+
+    {
+      typedef typename Kokkos::MDRangePolicy< ExecSpace, Kokkos::Rank<6>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      const point_type lower{{-1, -1, -1, -1, -1, -1}};
+      const point_type upper{{N0, N1, N2, N3, N4, N5}};
+      const tile_type  tile{{8,4,2,2,2,1}};
+
+      range_type range( point_type{{ lower[0], lower[1], lower[2], lower[3], lower[4], lower[5] }}, point_type{{ upper[0], upper[1], upper[2], upper[3], upper[4], upper[5] }}, tile_type{{ tile[0], tile[1], tile[2], tile[3], tile[4], tile[5] }} );
+
+      TestMDRange_6D_NegIdx functor( lower[0], lower[1], lower[2], lower[3], lower[4], lower[5], upper[0], upper[1], upper[2], upper[3], upper[4], upper[5] );
+
+      parallel_for( range, functor );
+      double sum = 0.0;
+      parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) * (upper[3] - lower[3]) * (upper[4] - lower[4]) * (upper[5] - lower[5]) );
+    }
+  }
+};
+
+
+} // namespace
+
+TEST_F( TEST_CATEGORY , mdrange_for ) {
+  TestMDRange_2D< TEST_EXECSPACE >::test_for2( 100, 100 );
+  TestMDRange_3D< TEST_EXECSPACE >::test_for3( 100, 10, 100 );
+  TestMDRange_4D< TEST_EXECSPACE >::test_for4( 100, 10, 10, 10 );
+  TestMDRange_5D< TEST_EXECSPACE >::test_for5( 100, 10, 10, 10, 5 );
+  TestMDRange_6D< TEST_EXECSPACE >::test_for6( 10, 10, 10, 10, 5, 5 );
+}
+
+TEST_F( TEST_CATEGORY , mdrange_reduce ) {
+  TestMDRange_2D< TEST_EXECSPACE >::test_reduce2( 100, 100 );
+  TestMDRange_3D< TEST_EXECSPACE >::test_reduce3( 100, 10, 100 );
+  TestMDRange_4D< TEST_EXECSPACE >::test_reduce4( 100, 10, 10, 10 );
+  TestMDRange_5D< TEST_EXECSPACE >::test_reduce5( 100, 10, 10, 10, 5 );
+  TestMDRange_6D< TEST_EXECSPACE >::test_reduce6( 100, 10, 10, 10, 5, 5 );
+}
+
+//#ifndef KOKKOS_ENABLE_CUDA
+TEST_F( TEST_CATEGORY , mdrange_array_reduce ) {
+  TestMDRange_ReduceArray_2D< TEST_EXECSPACE >::test_arrayreduce2( 4, 5 );
+  TestMDRange_ReduceArray_3D< TEST_EXECSPACE >::test_arrayreduce3( 4, 5, 10 );
+}
+
+TEST_F( TEST_CATEGORY , mdrange_neg_idx ) {
+  TestMDRange_2D_NegIdx< TEST_EXECSPACE >::test_2D_negidx( 128, 32 );
+  TestMDRange_3D_NegIdx< TEST_EXECSPACE >::test_3D_negidx( 128, 32, 8 );
+  TestMDRange_4D_NegIdx< TEST_EXECSPACE >::test_4D_negidx( 128, 32, 8, 8 );
+  TestMDRange_5D_NegIdx< TEST_EXECSPACE >::test_5D_negidx( 128, 32, 8, 8, 4 );
+  TestMDRange_6D_NegIdx< TEST_EXECSPACE >::test_6D_negidx( 128, 32, 8, 8, 4, 2 );
+}
+//#endif
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestMemoryPool.hpp b/packages/kokkos/core/unit_test/TestMemoryPool.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9fb1d900f7580a384ab16754a1b13316ef94cf4f
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestMemoryPool.hpp
@@ -0,0 +1,635 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+#ifndef KOKKOS_UNITTEST_MEMPOOL_HPP
+#define KOKKOS_UNITTEST_MEMPOOL_HPP
+
+#include <cstdio>
+#include <iostream>
+#include <cmath>
+#include <algorithm>
+
+#include <impl/Kokkos_Timer.hpp>
+
+namespace TestMemoryPool {
+
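+// Check pool construction: requested capacity and block sizes are satisfied (power-of-two
+// requests are matched exactly) and defaulted parameters obey
+// min_block_bytes <= max_block_bytes <= superblock_bytes <= capacity_bytes.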
+template< typename MemSpace = Kokkos::HostSpace >
+void test_host_memory_pool_defaults()
+{
+  typedef typename MemSpace::execution_space   Space ;
+  typedef typename Kokkos::MemoryPool< Space > MemPool ;
+
+  {
+    const size_t MemoryCapacity = 32000 ;
+    const size_t MinBlockSize   =    64 ;
+    const size_t MaxBlockSize   =  1024 ;
+    const size_t SuperBlockSize =  4096 ;
+
+    MemPool pool( MemSpace()
+                , MemoryCapacity
+                , MinBlockSize
+                , MaxBlockSize
+                , SuperBlockSize
+                );
+
+    typename MemPool::usage_statistics stats ;
+
+    pool.get_usage_statistics( stats );
+
+    ASSERT_LE( MemoryCapacity , stats.capacity_bytes );
+    ASSERT_LE( MinBlockSize , stats.min_block_bytes );
+    ASSERT_LE( MaxBlockSize , stats.max_block_bytes );
+    ASSERT_LE( SuperBlockSize , stats.superblock_bytes );
+  }
+
+  {
+    const size_t MemoryCapacity = 10000 ;
+
+    MemPool pool( MemSpace()
+                , MemoryCapacity
+                );
+
+    typename MemPool::usage_statistics stats ;
+
+    pool.get_usage_statistics( stats );
+
+    ASSERT_LE( MemoryCapacity , stats.capacity_bytes );
+    ASSERT_LE( 64u /* default */ , stats.min_block_bytes );
+    ASSERT_LE( stats.min_block_bytes , stats.max_block_bytes );
+    ASSERT_LE( stats.max_block_bytes , stats.superblock_bytes );
+    ASSERT_LE( stats.superblock_bytes , stats.capacity_bytes );
+  }
+
+  {
+    const size_t MemoryCapacity = 10000 ;
+    const size_t MinBlockSize   =    32 ; // power of two is exact
+
+    MemPool pool( MemSpace()
+                , MemoryCapacity
+                , MinBlockSize
+                );
+
+    typename MemPool::usage_statistics stats ;
+
+    pool.get_usage_statistics( stats );
+
+    ASSERT_LE( MemoryCapacity , stats.capacity_bytes );
+    ASSERT_EQ( MinBlockSize , stats.min_block_bytes );
+    ASSERT_LE( stats.min_block_bytes , stats.max_block_bytes );
+    ASSERT_LE( stats.max_block_bytes , stats.superblock_bytes );
+    ASSERT_LE( stats.superblock_bytes , stats.capacity_bytes );
+  }
+
+  {
+    const size_t MemoryCapacity = 32000 ;
+    const size_t MinBlockSize   =    32 ; // power of two is exact
+    const size_t MaxBlockSize   =  1024 ; // power of two is exact
+
+    MemPool pool( MemSpace()
+                , MemoryCapacity
+                , MinBlockSize
+                , MaxBlockSize
+                );
+
+    typename MemPool::usage_statistics stats ;
+
+    pool.get_usage_statistics( stats );
+
+    ASSERT_LE( MemoryCapacity , stats.capacity_bytes );
+    ASSERT_EQ( MinBlockSize , stats.min_block_bytes );
+    ASSERT_EQ( MaxBlockSize , stats.max_block_bytes );
+    ASSERT_LE( stats.max_block_bytes , stats.superblock_bytes );
+    ASSERT_LE( stats.superblock_bytes , stats.capacity_bytes );
+  }
+}
+
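+// Allocate and free a few blocks of varying size (64-1024 bytes) from a small host pool;
+// requests above MaxBlockSize would abort, so that case is left commented out below.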
+template< typename MemSpace = Kokkos::HostSpace >
+void test_host_memory_pool_stats()
+{
+  typedef typename MemSpace::execution_space   Space ;
+  typedef typename Kokkos::MemoryPool< Space > MemPool ;
+
+  const size_t MemoryCapacity = 32000 ;
+  const size_t MinBlockSize   =    64 ;
+  const size_t MaxBlockSize   =  1024 ;
+  const size_t SuperBlockSize =  4096 ;
+
+  MemPool pool( MemSpace()
+              , MemoryCapacity
+              , MinBlockSize
+              , MaxBlockSize
+              , SuperBlockSize
+              );
+
+  {
+    typename MemPool::usage_statistics stats ;
+
+    pool.get_usage_statistics( stats );
+
+    ASSERT_LE( MemoryCapacity , stats.capacity_bytes );
+    ASSERT_LE( MinBlockSize , stats.min_block_bytes );
+    ASSERT_LE( MaxBlockSize , stats.max_block_bytes );
+    ASSERT_LE( SuperBlockSize , stats.superblock_bytes );
+  }
+
+  void * p0064 = pool.allocate(64);
+  void * p0128 = pool.allocate(128);
+  void * p0256 = pool.allocate(256);
+  void * p1024 = pool.allocate(1024);
+
+  // Aborts because exceeds max block size:
+  // void * p2048 = pool.allocate(2048);
+
+  ASSERT_NE( p0064 , (void*) 0 );
+  ASSERT_NE( p0128 , (void*) 0 );
+  ASSERT_NE( p0256 , (void*) 0 );
+  ASSERT_NE( p1024 , (void*) 0 );
+
+  pool.deallocate( p0064 , 64 );
+  pool.deallocate( p0128 , 128 );
+  pool.deallocate( p0256 , 256 );
+  pool.deallocate( p1024 , 1024 );
+
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
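+// Tag-dispatched functor: the Alloc/Dealloc/Realloc/MixItUp phases allocate, free, or
+// reallocate variable-sized blocks from a shared pool and reduce a count of operations.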
+template< class DeviceType >
+struct TestMemoryPool_Functor {
+
+  typedef Kokkos::View< uintptr_t * , DeviceType >         ptrs_type ;
+  typedef Kokkos::MemoryPool< DeviceType > pool_type ;
+
+  pool_type pool ;
+  ptrs_type ptrs ;
+
+  TestMemoryPool_Functor( const pool_type & arg_pool , size_t n )
+    : pool( arg_pool )
+    , ptrs( "ptrs" , n )
+    {}
+
+  // Specify reduction argument value_type to avoid
+  // confusion with tag-dispatch.
+
+  using value_type = long ;
+
+  struct TagAlloc {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( TagAlloc , int i , long & update ) const noexcept
+    {
+      unsigned alloc_size = 32 * ( 1 + ( i % 5 ));
+      ptrs(i) = (uintptr_t)  pool.allocate( alloc_size );
+      if ( ptrs(i) ) { ++update ; }
+    }
+
+  struct TagDealloc {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( TagDealloc , int i , long & update ) const noexcept
+    {
+      if ( ptrs(i) && ( 0 == i % 3 ) ) {
+        unsigned alloc_size = 32 * ( 1 + ( i % 5 ));
+        pool.deallocate( (void*) ptrs(i) , alloc_size );
+        ptrs(i) = 0 ;
+        ++update ;
+      }
+    }
+
+  struct TagRealloc {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( TagRealloc , int i , long & update ) const noexcept
+    {
+      if ( 0 == ptrs(i) ) {
+        unsigned alloc_size = 32 * ( 1 + ( i % 5 ));
+        ptrs(i) = (uintptr_t)  pool.allocate( alloc_size );
+        if ( ptrs(i) ) { ++update ; }
+      }
+    }
+
+  struct TagMixItUp {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( TagMixItUp , int i , long & update ) const noexcept
+    {
+      if ( ptrs(i) && ( 0 == i % 3 ) ) {
+
+        unsigned alloc_size = 32 * ( 1 + ( i % 5 ));
+
+        pool.deallocate( (void*) ptrs(i) , alloc_size );
+
+        ptrs(i) = (uintptr_t)  pool.allocate( alloc_size );
+
+        if ( ptrs(i) ) { ++update ; }
+      }
+    }
+};
+
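+// Human-readable dump of a pool's usage_statistics; printed on request or when a phase fails.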
+template< class PoolType >
+void print_memory_pool_stats
+  ( typename PoolType::usage_statistics const & stats )
+{
+  std::cout << "MemoryPool {" << std::endl
+            << "  bytes capacity = " << stats.capacity_bytes << std::endl
+            << "  bytes used     = " << stats.consumed_bytes << std::endl
+            << "  bytes reserved = " << stats.reserved_bytes << std::endl
+            << "  bytes free     = " << ( stats.capacity_bytes -
+               ( stats.consumed_bytes + stats.reserved_bytes ) ) << std::endl
+            << "  block used     = " << stats.consumed_blocks << std::endl
+            << "  block reserved = " << stats.reserved_blocks << std::endl
+            << "  super used     = " << stats.consumed_superblocks << std::endl
+            << "  super reserved = " << ( stats.capacity_superblocks -
+                                    stats.consumed_superblocks ) << std::endl
+            << "}" << std::endl ;
+}
+
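+// Run the tagged functor through fill, partial free, refill, and mixed phases for three
+// superblock sizes, checking consumed_blocks against the expected count after each phase.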
+template< class DeviceType >
+void test_memory_pool_v2( const bool print_statistics
+                        , const bool print_superblocks )
+{
+  typedef typename DeviceType::memory_space     memory_space ;
+  typedef typename DeviceType::execution_space  execution_space ;
+  typedef Kokkos::MemoryPool< DeviceType > pool_type ;
+  typedef TestMemoryPool_Functor< DeviceType > functor_type ;
+
+  typedef typename functor_type::TagAlloc   TagAlloc ;
+  typedef typename functor_type::TagDealloc TagDealloc ;
+  typedef typename functor_type::TagRealloc TagRealloc ;
+  typedef typename functor_type::TagMixItUp TagMixItUp ;
+
+  const size_t    total_alloc_size = 10000000 ;
+  const unsigned  min_block_size   = 64 ;
+  const unsigned  max_block_size   = 256 ;
+  const long      nfill            = 70000 ;
+
+  for ( uint32_t k = 0 , min_superblock_size = 10000 ;
+        k < 3 ; ++k , min_superblock_size *= 10 ) {
+
+    typename pool_type::usage_statistics stats ;
+
+    pool_type pool( memory_space()
+                  , total_alloc_size
+                  , min_block_size
+                  , max_block_size
+                  , min_superblock_size );
+
+    functor_type functor(pool,nfill);
+
+    long result = 0 ;
+    long ndel  = 0 ;
+
+    Kokkos::parallel_reduce
+      ( Kokkos::RangePolicy< execution_space , TagAlloc >(0,nfill)
+      , functor
+      , result
+      );
+
+    pool.get_usage_statistics( stats );
+
+    const int fill_error = ( nfill != result ) ||
+                           ( nfill != long(stats.consumed_blocks) );
+
+    if ( fill_error || print_statistics ) print_memory_pool_stats< pool_type >( stats );
+    if ( fill_error || print_superblocks ) pool.print_state( std::cout );
+
+    ASSERT_EQ( nfill , result );
+    ASSERT_EQ( nfill , long(stats.consumed_blocks) );
+
+    Kokkos::parallel_reduce
+      ( Kokkos::RangePolicy< execution_space , TagDealloc >(0,nfill)
+      , functor
+      , ndel
+      );
+
+    pool.get_usage_statistics( stats );
+
+    const int del_error = ( nfill - ndel ) != long(stats.consumed_blocks);
+
+    if ( del_error || print_statistics ) print_memory_pool_stats< pool_type >( stats );
+    if ( del_error || print_superblocks ) pool.print_state( std::cout );
+
+    ASSERT_EQ( ( nfill - ndel ) , long(stats.consumed_blocks) );
+
+    Kokkos::parallel_reduce
+      ( Kokkos::RangePolicy< execution_space , TagRealloc >(0,nfill)
+      , functor
+      , result
+      );
+
+    pool.get_usage_statistics( stats );
+
+    const int refill_error = ( ndel != result ) ||
+                             ( nfill != long(stats.consumed_blocks) );
+
+    if ( refill_error || print_statistics ) print_memory_pool_stats< pool_type >( stats );
+    if ( refill_error || print_superblocks ) pool.print_state( std::cout );
+
+    ASSERT_EQ( ndel , result );
+    ASSERT_EQ( nfill , long(stats.consumed_blocks) );
+
+    Kokkos::parallel_reduce
+      ( Kokkos::RangePolicy< execution_space , TagMixItUp >(0,nfill)
+      , functor
+      , result
+      );
+
+    pool.get_usage_statistics( stats );
+
+    const int mix_error = ( ndel != result ) ||
+                          ( nfill != long(stats.consumed_blocks) );
+
+    if ( mix_error || print_statistics ) print_memory_pool_stats< pool_type >( stats );
+    if ( mix_error || print_superblocks ) pool.print_state( std::cout );
+
+    ASSERT_EQ( ndel , result );
+    ASSERT_EQ( nfill , long(stats.consumed_blocks) );
+  }
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
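+// Corner-case functor: allocation size scales as size << ( i % stride ), which lets the
+// test first lock each superblock to a block size and then exhaust the remaining blocks.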
+template< class DeviceType >
+struct TestMemoryPoolCorners {
+
+  typedef Kokkos::View< uintptr_t * , DeviceType >  ptrs_type ;
+  typedef Kokkos::MemoryPool< DeviceType >          pool_type ;
+
+  pool_type pool ;
+  ptrs_type ptrs ;
+  uint32_t  size ;
+  uint32_t  stride ;
+
+  TestMemoryPoolCorners( const pool_type & arg_pool
+                       , const ptrs_type & arg_ptrs
+                       , const uint32_t arg_base
+                       , const uint32_t arg_stride
+                       )
+    : pool( arg_pool )
+    , ptrs( arg_ptrs )
+    , size( arg_base )
+    , stride( arg_stride )
+    {}
+
+  // Specify reduction argument value_type to
+  // avoid confusion with tag-dispatch.
+
+  using value_type = long ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i , long & err ) const noexcept
+    {
+      unsigned alloc_size = size << ( i % stride );
+      if ( 0 == ptrs(i) ) {
+        ptrs(i) = (uintptr_t) pool.allocate( alloc_size );
+        if ( ptrs(i) && ! alloc_size ) { ++err ; }
+      }
+    }
+
+  struct TagDealloc {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i ) const noexcept
+    {
+      unsigned alloc_size = size << ( i % stride );
+      if ( ptrs(i) ) { pool.deallocate( (void*) ptrs(i) , alloc_size ); }
+      ptrs(i) = 0 ;
+    }
+};
+
+template< class DeviceType >
+void test_memory_pool_corners( const bool print_statistics
+                             , const bool print_superblocks )
+{
+  typedef typename DeviceType::memory_space     memory_space ;
+  typedef typename DeviceType::execution_space  execution_space ;
+  typedef Kokkos::MemoryPool< DeviceType >      pool_type ;
+  typedef TestMemoryPoolCorners< DeviceType >   functor_type ;
+  typedef typename functor_type::ptrs_type      ptrs_type ;
+
+  {
+    // superblock size 1 << 14 
+    const size_t  min_superblock_size = 1u << 14 ;
+
+    // four superblocks
+    const size_t total_alloc_size = min_superblock_size * 4 ;
+
+    // block sizes  {  64 , 128 , 256 , 512 }
+    // block counts { 256 , 128 ,  64 ,  32 }
+    const unsigned  min_block_size  = 64 ;
+    const unsigned  max_block_size  = 512 ;
+    const unsigned  num_blocks      = 480 ;
+
+    pool_type pool( memory_space()
+                  , total_alloc_size
+                  , min_block_size
+                  , max_block_size
+                  , min_superblock_size );
+
+    // Allocate one block from each superblock to lock that
+    // superblock into the block size.
+
+    ptrs_type ptrs("ptrs",num_blocks);
+
+    long err = 0 ;
+
+    Kokkos::parallel_reduce
+      ( Kokkos::RangePolicy< execution_space >(0,4)
+      , functor_type( pool , ptrs , 64 , 4 )
+      , err
+      );
+
+    if ( print_statistics || err ) {
+
+      typename pool_type::usage_statistics stats ;
+
+      pool.get_usage_statistics( stats );
+
+      print_memory_pool_stats< pool_type >( stats );
+    }
+
+    if ( print_superblocks || err ) {
+      pool.print_state( std::cout );
+    }
+
+    // Now fill the remaining entries with the smallest block size
+
+    Kokkos::parallel_reduce
+      ( Kokkos::RangePolicy< execution_space >(0,num_blocks)
+      , functor_type( pool , ptrs , 64 , 1 )
+      , err
+      );
+
+    if ( print_statistics || err ) {
+
+      typename pool_type::usage_statistics stats ;
+
+      pool.get_usage_statistics( stats );
+
+      print_memory_pool_stats< pool_type >( stats );
+    }
+
+    if ( print_superblocks || err ) {
+      pool.print_state( std::cout );
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template< class DeviceType , class Enable = void >
+struct TestMemoryPoolHuge
+{
+  TestMemoryPoolHuge() {}
+
+  enum : size_t { num_superblock = 0 };
+
+  using value_type = long ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i , long & err ) const noexcept {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i ) const noexcept {}
+};
+
+template< class DeviceType >
+struct TestMemoryPoolHuge< DeviceType
+                         , typename std::enable_if<
+                           std::is_same< Kokkos::HostSpace
+                                       , typename DeviceType::memory_space >
+                             ::value >::type
+                         >
+{
+  typedef Kokkos::View< uintptr_t * , DeviceType >  ptrs_type ;
+  typedef Kokkos::MemoryPool< DeviceType >          pool_type ;
+  typedef typename DeviceType::memory_space         memory_space ;
+
+  pool_type pool ;
+  ptrs_type ptrs ;
+
+  enum : size_t { min_block_size = 512
+                , max_block_size = 1lu << 31
+                , min_superblock_size = max_block_size
+                , num_superblock = 4 
+                , total_alloc_size = num_superblock * max_block_size };
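+  // max_block_size and min_superblock_size are both 2^31 bytes (2 GiB), so
+  // total_alloc_size is num_superblock * 2 GiB = 8 GiB of HostSpace memory.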
+
+  TestMemoryPoolHuge()
+    : pool( memory_space()
+           , total_alloc_size
+           , min_block_size
+           , max_block_size
+           , min_superblock_size )
+    , ptrs( "ptrs" , num_superblock )
+    {}
+
+  // Specify reduction argument value_type to
+  // avoid confusion with tag-dispatch.
+
+  using value_type = long ;
+
+  void operator()( int i , long & err ) const noexcept
+    {
+      if ( i < int(num_superblock) ) {
+        ptrs(i) = (uintptr_t) pool.allocate( max_block_size );
+#if 0
+        printf("TestMemoryPoolHuge size(0x%lx) ptr(0x%lx)\n"
+              , max_block_size
+              , ptrs(i) );
+#endif
+        if ( ! ptrs(i) ) {
+          Kokkos::abort("TestMemoryPoolHuge");
+          ++err ;
+        }
+      }
+    }
+
+  void operator()( int i ) const noexcept
+    {
+      if ( i < int(num_superblock) ) {
+        pool.deallocate( (void*) ptrs(i) , max_block_size );
+        ptrs(i) = 0 ;
+      }
+    }
+};
+
+template< class DeviceType >
+void test_memory_pool_huge()
+{
+  typedef typename DeviceType::execution_space  execution_space ;
+  typedef TestMemoryPoolHuge< DeviceType >      functor_type ;
+  typedef Kokkos::RangePolicy< execution_space > policy_type ;
+
+  functor_type f ;
+  policy_type policy( 0 , functor_type::num_superblock );
+
+  long err = 0 ;
+
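+  // The reduce pass allocates one max_block_size block per superblock;
+  // the following parallel_for pass deallocates them.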
+  Kokkos::parallel_reduce( policy , f , err );
+  Kokkos::parallel_for( policy , f );
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+} // namespace TestMemoryPool
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, memory_pool )
+{
+  TestMemoryPool::test_host_memory_pool_defaults<>();
+  TestMemoryPool::test_host_memory_pool_stats<>();
+  TestMemoryPool::test_memory_pool_v2< TEST_EXECSPACE >(false,false);
+  TestMemoryPool::test_memory_pool_corners< TEST_EXECSPACE >(false,false);
+  TestMemoryPool::test_memory_pool_huge< TEST_EXECSPACE >();
+}
+
+} // namespace Test
+
+#endif
+
diff --git a/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp b/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8329cbd395d15c99dc55a92635c50169b0ad6eba
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp
@@ -0,0 +1,694 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+namespace Test {
+struct SomeTag {};
+
+template< class ExecutionSpace >
+class TestRangePolicyConstruction {
+public:
+  TestRangePolicyConstruction() {
+    test_compile_time_parameters();
+    test_runtime_parameters();
+  }
+
+private:
+  void test_compile_time_parameters() {
+    {
+      Kokkos::Impl::expand_variadic();
+      Kokkos::Impl::expand_variadic( 1, 2, 3 );
+    }
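+    // Each block below instantiates RangePolicy with a different combination and
+    // ordering of the optional template arguments and verifies that the deduced
+    // execution_space, index_type, schedule_type, and work_tag are as expected;
+    // properties that are not specified fall back to their defaults.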
+
+    {
+      typedef Kokkos::RangePolicy<> policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Static>    >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::RangePolicy< ExecutionSpace > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Static>    >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::RangePolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::RangePolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::RangePolicy< Kokkos::IndexType<long>, ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::RangePolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
+    }
+
+    {
+      typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>, ExecutionSpace, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
+    }
+
+    {
+      typedef Kokkos::RangePolicy< SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, ExecutionSpace > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
+    }
+
+    {
+      typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::RangePolicy< Kokkos::IndexType<long>, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
+    }
+
+    {
+      typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
+    }
+
+    {
+      typedef Kokkos::RangePolicy< SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
+    }
+  }
+  void test_runtime_parameters() {
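+    // A RangePolicy is constructed from ( begin, end ), optionally preceded by an
+    // execution space instance and/or followed by a ChunkSize; the variants below
+    // check begin(), end(), and chunk_size().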
+    {
+      typedef Kokkos::RangePolicy<> policy_t;
+      policy_t p(5,15);
+      ASSERT_TRUE( (p.begin() == 5) );
+      ASSERT_TRUE( (p.end() == 15) );
+    }
+    {
+      typedef Kokkos::RangePolicy<> policy_t;
+      policy_t p(Kokkos::DefaultExecutionSpace(),5,15);
+      ASSERT_TRUE( (p.begin() == 5) );
+      ASSERT_TRUE( (p.end() == 15) );
+    }
+    {
+      typedef Kokkos::RangePolicy<> policy_t;
+      policy_t p(5,15,Kokkos::ChunkSize(10));
+      ASSERT_TRUE( (p.begin() == 5) );
+      ASSERT_TRUE( (p.end() == 15) );
+      ASSERT_TRUE( (p.chunk_size() == 10) );
+    }
+    {
+      typedef Kokkos::RangePolicy<> policy_t;
+      policy_t p(Kokkos::DefaultExecutionSpace(),5,15,Kokkos::ChunkSize(10));
+      ASSERT_TRUE( (p.begin() == 5) );
+      ASSERT_TRUE( (p.end() == 15) );
+      ASSERT_TRUE( (p.chunk_size() == 10) );
+    }
+  }
+};
+
+template< class ExecutionSpace >
+class TestTeamPolicyConstruction {
+public:
+  TestTeamPolicyConstruction() {
+    test_compile_time_parameters();
+    test_run_time_parameters();
+  }
+
+private:
+  void test_compile_time_parameters() {
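+    // Same argument combinations as the RangePolicy checks above, applied to TeamPolicy.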
+    {
+      typedef Kokkos::TeamPolicy<> policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Static>    >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::TeamPolicy< ExecutionSpace > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Static>    >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::TeamPolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::TeamPolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::TeamPolicy< Kokkos::IndexType<long>, ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::TeamPolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
+    }
+
+    {
+      typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic>, ExecutionSpace, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
+    }
+
+    {
+      typedef Kokkos::TeamPolicy< SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, ExecutionSpace > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
+    }
+
+    {
+      typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace        >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::TeamPolicy< Kokkos::IndexType<long>, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
+    }
+
+    {
+      typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
+    }
+
+    {
+      typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
+    }
+
+    {
+      typedef Kokkos::TeamPolicy< SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
+    }
+  }
+
+
+  template< class policy_t >
+  void test_run_time_parameters_type() {
+    int league_size = 131;
+    int team_size = 4 < policy_t::execution_space::concurrency() ? 4 : policy_t::execution_space::concurrency();
+    int chunk_size = 4;
+    int per_team_scratch = 1024;
+    int per_thread_scratch = 16;
+    int scratch_size = per_team_scratch + per_thread_scratch * team_size;
+    int vector_length = 4;
+
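+    // Each set_chunk_size() / set_scratch_size() call below produces a policy with
+    // the requested settings; the assertions also re-check the policy it was called
+    // on, in particular that its scratch size request stays at zero.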
+    policy_t p1( league_size, team_size );
+    ASSERT_EQ  ( p1.league_size(),     league_size                    );
+    ASSERT_EQ  ( p1.team_size(),       team_size                      );
+    ASSERT_TRUE( p1.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p1.scratch_size( 0 ), 0                              );
+
+    policy_t p2 = p1.set_chunk_size( chunk_size );
+    ASSERT_EQ  ( p1.league_size(),     league_size                    );
+    ASSERT_EQ  ( p1.team_size(),       team_size                      );
+    ASSERT_TRUE( p1.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p1.scratch_size( 0 ), 0                              );
+
+    ASSERT_EQ  ( p2.league_size(),     league_size                    );
+    ASSERT_EQ  ( p2.team_size(),       team_size                      );
+    ASSERT_EQ  ( p2.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p2.scratch_size( 0 ), 0                              );
+
+    policy_t p3 = p2.set_scratch_size( 0, Kokkos::PerTeam( per_team_scratch ) );
+    ASSERT_EQ  ( p2.league_size(),     league_size                    );
+    ASSERT_EQ  ( p2.team_size(),       team_size                      );
+    ASSERT_EQ  ( p2.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p2.scratch_size( 0 ), 0                              );
+    ASSERT_EQ  ( p3.league_size(),     league_size                    );
+    ASSERT_EQ  ( p3.team_size(),       team_size                      );
+    ASSERT_EQ  ( p3.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p3.scratch_size( 0 ), per_team_scratch               );
+
+    policy_t p4 = p2.set_scratch_size( 0, Kokkos::PerThread( per_thread_scratch ) );
+    ASSERT_EQ  ( p2.league_size(),     league_size                    );
+    ASSERT_EQ  ( p2.team_size(),       team_size                      );
+    ASSERT_EQ  ( p2.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p2.scratch_size( 0 ), 0                              );
+    ASSERT_EQ  ( p4.league_size(),     league_size                    );
+    ASSERT_EQ  ( p4.team_size(),       team_size                      );
+    ASSERT_EQ  ( p4.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p4.scratch_size( 0 ), per_thread_scratch * team_size );
+
+    policy_t p5 = p2.set_scratch_size( 0, Kokkos::PerThread( per_thread_scratch ), Kokkos::PerTeam( per_team_scratch ) );
+    ASSERT_EQ  ( p2.league_size(),     league_size                    );
+    ASSERT_EQ  ( p2.team_size(),       team_size                      );
+    ASSERT_EQ  ( p2.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p2.scratch_size( 0 ), 0                              );
+    ASSERT_EQ  ( p5.league_size(),     league_size                    );
+    ASSERT_EQ  ( p5.team_size(),       team_size                      );
+    ASSERT_EQ  ( p5.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p5.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p6 = p2.set_scratch_size( 0, Kokkos::PerTeam( per_team_scratch ), Kokkos::PerThread( per_thread_scratch ) );
+    ASSERT_EQ  ( p2.league_size(),     league_size                    );
+    ASSERT_EQ  ( p2.team_size(),       team_size                      );
+    ASSERT_EQ  ( p2.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p2.scratch_size( 0 ), 0                              );
+    ASSERT_EQ  ( p6.league_size(),     league_size                    );
+    ASSERT_EQ  ( p6.team_size(),       team_size                      );
+    ASSERT_EQ  ( p6.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p6.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p7 = p3.set_scratch_size( 0, Kokkos::PerTeam( per_team_scratch ), Kokkos::PerThread( per_thread_scratch ) );
+    ASSERT_EQ  ( p3.league_size(),     league_size                    );
+    ASSERT_EQ  ( p3.team_size(),       team_size                      );
+    ASSERT_EQ  ( p3.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p3.scratch_size( 0 ), per_team_scratch               );
+    ASSERT_EQ  ( p7.league_size(),     league_size                    );
+    ASSERT_EQ  ( p7.team_size(),       team_size                      );
+    ASSERT_EQ  ( p7.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p7.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p8(league_size, team_size, Kokkos::ChunkSize(chunk_size) );
+    ASSERT_EQ  ( p8.league_size(),     league_size                    );
+    ASSERT_EQ  ( p8.team_size(),       team_size                      );
+    ASSERT_EQ  ( p8.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p8.scratch_size( 0 ), 0                              );
+
+    policy_t p10( league_size, team_size, Kokkos::ScratchRequest( 0, Kokkos::PerTeam( per_team_scratch ) ) );
+    ASSERT_EQ  ( p10.league_size(),     league_size                    );
+    ASSERT_EQ  ( p10.team_size(),       team_size                      );
+    ASSERT_TRUE( p10.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p10.scratch_size( 0 ), per_team_scratch               );
+
+    policy_t p11( league_size, team_size, Kokkos::ScratchRequest( 0, Kokkos::PerThread( per_thread_scratch ) ) );
+    ASSERT_EQ  ( p11.league_size(),     league_size                    );
+    ASSERT_EQ  ( p11.team_size(),       team_size                      );
+    ASSERT_TRUE( p11.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p11.scratch_size( 0 ), per_thread_scratch * team_size );
+
+    policy_t p12( league_size, team_size, Kokkos::ScratchRequest( 0, Kokkos::PerThread( per_thread_scratch ), Kokkos::PerTeam( per_team_scratch ) ) );
+    ASSERT_EQ  ( p12.league_size(),     league_size                    );
+    ASSERT_EQ  ( p12.team_size(),       team_size                      );
+    ASSERT_TRUE( p12.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p12.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p13( league_size, team_size, Kokkos::ScratchRequest( 0, Kokkos::PerTeam( per_team_scratch ), Kokkos::PerThread( per_thread_scratch ) ) );
+    ASSERT_EQ  ( p13.league_size(),     league_size                    );
+    ASSERT_EQ  ( p13.team_size(),       team_size                      );
+    ASSERT_TRUE( p13.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p13.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p14( league_size, team_size, Kokkos::ScratchRequest( 0, Kokkos::PerTeam( per_team_scratch ), Kokkos::PerThread( per_thread_scratch ) ) );
+    ASSERT_EQ  ( p14.league_size(),     league_size                    );
+    ASSERT_EQ  ( p14.team_size(),       team_size                      );
+    ASSERT_TRUE( p14.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p14.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p15( league_size, team_size, Kokkos::ChunkSize(chunk_size), Kokkos::ScratchRequest( 0, Kokkos::PerTeam( per_team_scratch ) ) );
+    ASSERT_EQ  ( p15.league_size(),     league_size                    );
+    ASSERT_EQ  ( p15.team_size(),       team_size                      );
+    ASSERT_TRUE( p15.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p15.scratch_size( 0 ), per_team_scratch               );
+
+    policy_t p16( league_size, team_size, Kokkos::ScratchRequest( 0, Kokkos::PerThread( per_thread_scratch ) ), Kokkos::ChunkSize(chunk_size) );
+    ASSERT_EQ  ( p16.league_size(),     league_size                    );
+    ASSERT_EQ  ( p16.team_size(),       team_size                      );
+    ASSERT_EQ  ( p16.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p16.scratch_size( 0 ), per_thread_scratch * team_size );
+
+    policy_t p17( league_size, team_size, Kokkos::ChunkSize(chunk_size), Kokkos::ScratchRequest( 0, Kokkos::PerThread( per_thread_scratch ), Kokkos::PerTeam( per_team_scratch ) ) );
+    ASSERT_EQ  ( p17.league_size(),     league_size                    );
+    ASSERT_EQ  ( p17.team_size(),       team_size                      );
+    ASSERT_EQ  ( p17.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p17.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p18( league_size, team_size, Kokkos::ScratchRequest( 0, Kokkos::PerTeam( per_team_scratch ), Kokkos::PerThread( per_thread_scratch ) ), Kokkos::ChunkSize(chunk_size) );
+    ASSERT_EQ  ( p18.league_size(),     league_size                    );
+    ASSERT_EQ  ( p18.team_size(),       team_size                      );
+    ASSERT_EQ  ( p18.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p18.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p19( league_size, team_size, Kokkos::ChunkSize(chunk_size), Kokkos::ScratchRequest( 0, Kokkos::PerTeam( per_team_scratch ), Kokkos::PerThread( per_thread_scratch ) ) );
+    ASSERT_EQ  ( p19.league_size(),     league_size                    );
+    ASSERT_EQ  ( p19.team_size(),       team_size                      );
+    ASSERT_EQ  ( p19.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p19.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p20( league_size, team_size, vector_length, Kokkos::ScratchRequest( 0, Kokkos::PerTeam( per_team_scratch ) ) );
+    ASSERT_EQ  ( p20.league_size(),     league_size                    );
+    ASSERT_EQ  ( p20.team_size(),       team_size                      );
+    ASSERT_TRUE( p20.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p20.scratch_size( 0 ), per_team_scratch               );
+
+    policy_t p21( league_size, team_size, vector_length, Kokkos::ScratchRequest( 0, Kokkos::PerThread( per_thread_scratch ) ) );
+    ASSERT_EQ  ( p21.league_size(),     league_size                    );
+    ASSERT_EQ  ( p21.team_size(),       team_size                      );
+    ASSERT_TRUE( p21.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p21.scratch_size( 0 ), per_thread_scratch * team_size );
+
+    policy_t p22( league_size, team_size, vector_length, Kokkos::ScratchRequest( 0, Kokkos::PerThread( per_thread_scratch ), Kokkos::PerTeam( per_team_scratch ) ) );
+    ASSERT_EQ  ( p22.league_size(),     league_size                    );
+    ASSERT_EQ  ( p22.team_size(),       team_size                      );
+    ASSERT_TRUE( p22.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p22.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p23( league_size, team_size, (size_t) vector_length, Kokkos::ScratchRequest( 0, Kokkos::PerTeam( per_team_scratch ), Kokkos::PerThread( per_thread_scratch ) ) );
+    ASSERT_EQ  ( p23.league_size(),     league_size                    );
+    ASSERT_EQ  ( p23.team_size(),       team_size                      );
+    ASSERT_TRUE( p23.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p23.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p24( league_size, team_size, (size_t) vector_length, Kokkos::ScratchRequest( 0, Kokkos::PerTeam( per_team_scratch ), Kokkos::PerThread( per_thread_scratch ) ) );
+    ASSERT_EQ  ( p24.league_size(),     league_size                    );
+    ASSERT_EQ  ( p24.team_size(),       team_size                      );
+    ASSERT_TRUE( p24.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p24.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p25( league_size, team_size, vector_length, Kokkos::ChunkSize(chunk_size), Kokkos::ScratchRequest( 0, Kokkos::PerTeam( per_team_scratch ) ) );
+    ASSERT_EQ  ( p25.league_size(),     league_size                    );
+    ASSERT_EQ  ( p25.team_size(),       team_size                      );
+    ASSERT_TRUE( p25.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p25.scratch_size( 0 ), per_team_scratch               );
+
+    policy_t p26( league_size, team_size, vector_length, Kokkos::ScratchRequest( 0, Kokkos::PerThread( per_thread_scratch ) ), Kokkos::ChunkSize(chunk_size) );
+    ASSERT_EQ  ( p26.league_size(),     league_size                    );
+    ASSERT_EQ  ( p26.team_size(),       team_size                      );
+    ASSERT_EQ  ( p26.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p26.scratch_size( 0 ), per_thread_scratch * team_size );
+
+    policy_t p27( league_size, team_size, vector_length, Kokkos::ChunkSize(chunk_size), Kokkos::ScratchRequest( 0, Kokkos::PerThread( per_thread_scratch ), Kokkos::PerTeam( per_team_scratch ) ) );
+    ASSERT_EQ  ( p27.league_size(),     league_size                    );
+    ASSERT_EQ  ( p27.team_size(),       team_size                      );
+    ASSERT_EQ  ( p27.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p27.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p28( league_size, team_size, (size_t) vector_length, Kokkos::ScratchRequest( 0, Kokkos::PerTeam( per_team_scratch ), Kokkos::PerThread( per_thread_scratch ) ), Kokkos::ChunkSize(chunk_size) );
+    ASSERT_EQ  ( p28.league_size(),     league_size                    );
+    ASSERT_EQ  ( p28.team_size(),       team_size                      );
+    ASSERT_EQ  ( p28.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p28.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p29( league_size, team_size, (size_t) vector_length, Kokkos::ChunkSize(chunk_size), Kokkos::ScratchRequest( 0, Kokkos::PerTeam( per_team_scratch ), Kokkos::PerThread( per_thread_scratch ) ) );
+    ASSERT_EQ  ( p29.league_size(),     league_size                    );
+    ASSERT_EQ  ( p29.team_size(),       team_size                      );
+    ASSERT_EQ  ( p29.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p29.scratch_size( 0 ), scratch_size                   );
+
+  }
+
+  void test_run_time_parameters() {
+    test_run_time_parameters_type< Kokkos::TeamPolicy<ExecutionSpace> >();
+    test_run_time_parameters_type< Kokkos::TeamPolicy<ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > >();
+    test_run_time_parameters_type< Kokkos::TeamPolicy<Kokkos::IndexType<long>, ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > >();
+    test_run_time_parameters_type< Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, ExecutionSpace, SomeTag > >();
+  }
+};
+
+TEST_F( TEST_CATEGORY, policy_construction )
+{
+   TestRangePolicyConstruction< TEST_EXECSPACE >();
+   TestTeamPolicyConstruction< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestRange.hpp b/packages/kokkos/core/unit_test/TestRange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..85c270887dbc0be38e0d2ebb114ce4f7cf28f230
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestRange.hpp
@@ -0,0 +1,321 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+namespace {
+
+template< class ExecSpace, class ScheduleType >
+struct TestRange {
+  typedef int value_type; ///< typedef required for the parallel_reduce
+
+  typedef Kokkos::View< int*, ExecSpace > view_type;
+
+  view_type m_flags;
+
+  struct VerifyInitTag {};
+  struct ResetTag {};
+  struct VerifyResetTag {};
+
+  int N; 
+  TestRange( const size_t N_ )
+    : m_flags( Kokkos::ViewAllocateWithoutInitializing( "flags" ), N_ ), N(N_)
+    {}
+
+  void test_for()
+  {
+
+    typename view_type::HostMirror host_flags = Kokkos::create_mirror_view( m_flags );
+
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType >( 0, N ), *this );
+
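+    // With profiling enabled, ParallelConstructName should return the user-supplied
+    // label when one is given, and otherwise a name derived from the functor type
+    // (plus the work tag, when present).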
+#if defined(KOKKOS_ENABLE_PROFILING)
+    {
+      typedef TestRange< ExecSpace, ScheduleType > ThisType;
+      std::string label("parallel_for");
+      Kokkos::Impl::ParallelConstructName< ThisType, void> pcn(label);
+      ASSERT_EQ( pcn.get(), label );
+      std::string empty_label("");
+      Kokkos::Impl::ParallelConstructName< ThisType, void> empty_pcn(empty_label);
+      ASSERT_EQ( empty_pcn.get(), typeid(ThisType).name() );
+    }
+#endif
+
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType, VerifyInitTag >( 0, N ), *this );
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+    {
+      typedef TestRange< ExecSpace, ScheduleType > ThisType;
+      std::string label("parallel_for");
+      Kokkos::Impl::ParallelConstructName< ThisType, VerifyInitTag> pcn(label);
+      ASSERT_EQ( pcn.get(), label );
+      std::string empty_label("");
+      Kokkos::Impl::ParallelConstructName< ThisType, VerifyInitTag> empty_pcn(empty_label);
+      ASSERT_EQ( empty_pcn.get(), std::string(typeid(ThisType).name()) + "/" + typeid(VerifyInitTag).name() );
+    }
+#endif
+
+    Kokkos::deep_copy( host_flags, m_flags );
+
+    int error_count = 0;
+    for ( int i = 0; i < N; ++i ) {
+      if ( int( i ) != host_flags( i ) ) ++error_count;
+    }
+    ASSERT_EQ( error_count, int( 0 ) );
+
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType, ResetTag >( 0, N ), *this );
+    Kokkos::parallel_for( std::string( "TestKernelFor" ), Kokkos::RangePolicy< ExecSpace, ScheduleType, VerifyResetTag >( 0, N ), *this );
+
+    Kokkos::deep_copy( host_flags, m_flags );
+
+    error_count = 0;
+    for ( int i = 0; i < N; ++i ) {
+      if ( int( 2 * i ) != host_flags( i ) ) ++error_count;
+    }
+    ASSERT_EQ( error_count, int( 0 ) );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i ) const
+  { m_flags( i )  = i; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const VerifyInitTag &, const int i ) const
+  {
+    if ( i != m_flags( i ) ) {
+      printf( "TestRange::test_for error at %d != %d\n", i, m_flags( i ) );
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const ResetTag &, const int i ) const
+  { m_flags( i ) = 2 * m_flags( i ); }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const VerifyResetTag &, const int i ) const
+  {
+    if ( 2 * i != m_flags( i ) )
+    {
+      printf( "TestRange::test_for error at %d : %d != %d\n", i, 2 * i, m_flags( i ) );
+    }
+  }
+
+  //----------------------------------------
+
+  struct OffsetTag {};
+
+  void test_reduce( )
+  {
+    int total = 0;
+
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType >( 0, N ), *this );
+
+    Kokkos::parallel_reduce( "TestKernelReduce", Kokkos::RangePolicy< ExecSpace, ScheduleType >( 0, N ), *this, total );
+    // sum( 0 .. N-1 )
+    ASSERT_EQ( size_t( ( N - 1 ) * ( N ) / 2 ), size_t( total ) );
+
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, ScheduleType, OffsetTag>( 0, N ), *this, total );
+    // sum( 1 .. N )
+    ASSERT_EQ( size_t( ( N ) * ( N + 1 ) / 2 ), size_t( total ) );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, value_type & update ) const
+  { update += m_flags( i ); }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const OffsetTag &, const int i, value_type & update ) const
+  { update += 1 + m_flags( i ); }
+
+  //----------------------------------------
+
+  void test_scan( )
+  {
+
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType >( 0, N ), *this );
+
+    Kokkos::parallel_scan( "TestKernelScan", Kokkos::RangePolicy< ExecSpace, ScheduleType, OffsetTag>( 0, N ), *this );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const OffsetTag &, const int i, value_type & update, bool final ) const
+  {
+    update += m_flags( i );
+
+    if ( final ) {
+      if ( update != ( i * ( i + 1 ) ) / 2 ) {
+        printf( "TestRange::test_scan error %d : %d != %d\n", i, ( i * ( i + 1 ) ) / 2, update );
+      }
+    }
+  }
+
+  void test_dynamic_policy()
+  {
+#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) 
+   #if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+     typedef Kokkos::RangePolicy< ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+
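+    // Iterations in the lower half of the range do one unit of work and those in the
+    // upper half do 10000, so a Dynamic schedule should hand out iterations unevenly;
+    // the per-thread counters are checked for min < max below.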
+    {
+      Kokkos::View< size_t*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > count( "Count", ExecSpace::concurrency() );
+      Kokkos::View< int*, ExecSpace > a( "A", N );
+
+      Kokkos::parallel_for( policy_t( 0, N ), KOKKOS_LAMBDA ( const int& i ) {
+        for ( int k = 0; k < ( i < N / 2 ? 1 : 10000 ); k++ ) {
+          a( i )++;
+        }
+        count( ExecSpace::hardware_thread_id() )++;
+      });
+
+      int error = 0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), KOKKOS_LAMBDA( const int & i, int & lsum ) {
+        lsum += ( a( i ) != ( i < N / 2 ? 1 : 10000 ) );
+      }, error );
+      ASSERT_EQ( error, 0 );
+
+      if ( ( ExecSpace::concurrency() > (int) 1 ) && ( N > static_cast<int>( 4 * ExecSpace::concurrency() ) ) ) {
+        size_t min = N;
+        size_t max = 0;
+        for ( int t = 0; t < ExecSpace::concurrency(); t++ ) {
+          if ( count( t ) < min ) min = count( t );
+          if ( count( t ) > max ) max = count( t );
+        }
+        ASSERT_TRUE( min < max );
+
+        //if ( ExecSpace::concurrency() > 2 ) {
+        //  ASSERT_TRUE( 2 * min < max );
+        //}
+      }
+    }
+
+    {
+      Kokkos::View< size_t*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > count( "Count", ExecSpace::concurrency() );
+      Kokkos::View< int*, ExecSpace> a( "A", N );
+
+      int sum = 0;
+      Kokkos::parallel_reduce( policy_t( 0, N ), KOKKOS_LAMBDA( const int & i, int & lsum ) {
+        for ( int k = 0; k < ( i < N / 2 ? 1 : 10000 ); k++ ) {
+          a( i )++;
+        }
+        count( ExecSpace::hardware_thread_id() )++;
+        lsum++;
+      }, sum );
+      ASSERT_EQ( sum, N );
+
+      int error = 0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), KOKKOS_LAMBDA( const int & i, int & lsum ) {
+        lsum += ( a( i ) != ( i < N / 2 ? 1 : 10000 ) );
+      }, error );
+      ASSERT_EQ( error, 0 );
+
+      if ( ( ExecSpace::concurrency() > (int) 1 ) && ( N > static_cast<int>( 4 * ExecSpace::concurrency() ) ) ) {
+        size_t min = N;
+        size_t max = 0;
+        for ( int t = 0; t < ExecSpace::concurrency(); t++ ) {
+          if ( count( t ) < min ) min = count( t );
+          if ( count( t ) > max ) max = count( t );
+        }
+        ASSERT_TRUE( min < max );
+
+        //if ( ExecSpace::concurrency() > 2 ) {
+        //  ASSERT_TRUE( 2 * min < max );
+        //}
+      }
+    }
+#endif
+#endif
+  }
+};
+
+} // namespace
+
+TEST_F( TEST_CATEGORY, range_for )
+{
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >f(0); f.test_for(); }
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >f(0); f.test_for(); }
+
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >f(2); f.test_for(); }
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >f(3); f.test_for(); }
+
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >f(1000); f.test_for(); }
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >f(1001); f.test_for(); }
+}
+
+TEST_F( TEST_CATEGORY, range_reduce )
+{
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >f(0); f.test_reduce(); }
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >f(0); f.test_reduce(); }
+
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >f(2); f.test_reduce(); }
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >f(3); f.test_reduce(); }
+
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >f(1000); f.test_reduce(); }
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >f(1001); f.test_reduce(); }
+}
+
+#ifndef KOKKOS_ENABLE_OPENMPTARGET 
+TEST_F( TEST_CATEGORY, range_scan )
+{
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >f(0); f.test_scan(); }
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >f(0); f.test_scan(); }
+#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_ROCM)
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >f(0); f.test_dynamic_policy(); }
+#endif
+
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >f(2); f.test_scan(); }
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >f(3); f.test_scan(); }
+#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_ROCM)
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >f(3); f.test_dynamic_policy(); }
+#endif
+
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >f(1000); f.test_scan(); }
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >f(1001); f.test_scan(); }
+#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_ROCM)
+  { TestRange< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >f(1001); f.test_dynamic_policy(); }
+#endif
+}
+#endif
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestReduce.hpp b/packages/kokkos/core/unit_test/TestReduce.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cf8c4d5d0d128dbd2cd01a7f76465be619a9c760
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestReduce.hpp
@@ -0,0 +1,1380 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+#include <limits>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
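+// Reduction functor with a fixed struct value_type of three entries: entry 0
+// counts the iterations, entries 1 and 2 both accumulate 1 + 2 + ... + nwork.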
+template< typename ScalarType, class DeviceType >
+class ReduceFunctor
+{
+public:
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
+
+  struct value_type {
+    ScalarType value[3];
+  };
+
+  const size_type nwork;
+
+  ReduceFunctor( const size_type & arg_nwork )
+    : nwork( arg_nwork ) {}
+
+  ReduceFunctor( const ReduceFunctor & rhs )
+    : nwork( rhs.nwork ) {}
+
+/*
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & dst ) const
+  {
+    dst.value[0] = 0;
+    dst.value[1] = 0;
+    dst.value[2] = 0;
+  }
+*/
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst,
+             const volatile value_type & src ) const
+  {
+    dst.value[0] += src.value[0];
+    dst.value[1] += src.value[1];
+    dst.value[2] += src.value[2];
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type iwork, value_type & dst ) const
+  {
+    dst.value[0] += 1;
+    dst.value[1] += iwork + 1;
+    dst.value[2] += nwork - iwork;
+  }
+};
+
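+// Same reduction, but with a final() hook that negates the result; the tests
+// below therefore compare the expected values against -result.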
+template< class DeviceType >
+class ReduceFunctorFinal : public ReduceFunctor< long, DeviceType > {
+public:
+  typedef typename ReduceFunctor< long, DeviceType >::value_type value_type;
+
+  ReduceFunctorFinal( const size_t n )
+    : ReduceFunctor< long, DeviceType >( n ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void final( value_type & dst ) const
+  {
+    dst.value[0] = -dst.value[0];
+    dst.value[1] = -dst.value[1];
+    dst.value[2] = -dst.value[2];
+  }
+};
+
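+// Reduction functor whose value_type is a runtime-sized array: value_count is
+// only fixed at construction, so init() and join() loop over the entries.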
+template< typename ScalarType, class DeviceType >
+class RuntimeReduceFunctor
+{
+public:
+  // Required for functor:
+  typedef DeviceType  execution_space;
+  typedef ScalarType  value_type[];
+  const unsigned      value_count;
+
+  // Unit test details:
+
+  typedef typename execution_space::size_type size_type;
+
+  const size_type     nwork;
+
+  RuntimeReduceFunctor( const size_type arg_nwork,
+                        const size_type arg_count )
+    : value_count( arg_count )
+    , nwork( arg_nwork ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init( ScalarType dst[] ) const
+  {
+    for ( unsigned i = 0; i < value_count; ++i ) dst[i] = 0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile ScalarType dst[],
+             const volatile ScalarType src[] ) const
+  {
+    for ( unsigned i = 0; i < value_count; ++i ) dst[i] += src[i];
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type iwork, ScalarType dst[] ) const
+  {
+    const size_type tmp[3] = { 1, iwork + 1, nwork - iwork };
+
+    for ( size_type i = 0; i < value_count; ++i ) {
+      dst[i] += tmp[ i % 3 ];
+    }
+  }
+};
+
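+// Runtime-sized array reduction mixing operations: even entries reduce to a
+// maximum (expected nwork) and odd entries to a minimum (expected 1).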
+template< typename ScalarType, class DeviceType >
+class RuntimeReduceMinMax
+{
+public:
+  // Required for functor:
+  typedef DeviceType  execution_space;
+  typedef ScalarType  value_type[];
+  const unsigned      value_count;
+
+  // Unit test details:
+
+  typedef typename execution_space::size_type size_type;
+
+  const size_type     nwork;
+  const ScalarType    amin;
+  const ScalarType    amax;
+
+  RuntimeReduceMinMax( const size_type arg_nwork,
+                       const size_type arg_count )
+    : value_count( arg_count )
+    , nwork( arg_nwork )
+    , amin( std::numeric_limits< ScalarType >::min() )
+    , amax( std::numeric_limits< ScalarType >::max() )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init( ScalarType dst[] ) const
+  {
+    for ( unsigned i = 0; i < value_count; ++i ) {
+      dst[i] = i % 2 ? amax : amin;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile ScalarType dst[],
+             const volatile ScalarType src[] ) const
+  {
+    for ( unsigned i = 0; i < value_count; ++i ) {
+      dst[i] = i % 2 ? ( dst[i] < src[i] ? dst[i] : src[i] )  // min
+                     : ( dst[i] > src[i] ? dst[i] : src[i] ); // max
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type iwork, ScalarType dst[] ) const
+  {
+    const ScalarType tmp[2] = { ScalarType( iwork + 1 )
+                              , ScalarType( nwork - iwork ) };
+
+    for ( size_type i = 0; i < value_count; ++i ) {
+      dst[i] = i % 2 ? ( dst[i] < tmp[i % 2] ? dst[i] : tmp[i % 2] )
+                     : ( dst[i] > tmp[i % 2] ? dst[i] : tmp[i % 2] );
+    }
+  }
+};
+
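+// Runtime-sized variant with a final() hook that negates every entry.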
+template< class DeviceType >
+class RuntimeReduceFunctorFinal : public RuntimeReduceFunctor< long, DeviceType > {
+public:
+  typedef RuntimeReduceFunctor< long, DeviceType > base_type;
+  typedef typename base_type::value_type value_type;
+  typedef long scalar_type;
+
+  RuntimeReduceFunctorFinal( const size_t theNwork, const size_t count )
+    : base_type( theNwork, count ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void final( value_type dst ) const
+  {
+    for ( unsigned i = 0; i < base_type::value_count; ++i ) {
+      dst[i] = -dst[i];
+    }
+  }
+};
+
+} // namespace Test
+
+namespace {
+
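+// Harness running the struct-valued reduction repeatedly and checking each
+// entry against the closed-form count and sum.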
+template< typename ScalarType, class DeviceType >
+class TestReduce
+{
+public:
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
+
+  TestReduce( const size_type & nwork )
+  {
+    run_test( nwork );
+    run_test_final( nwork );
+  }
+
+  void run_test( const size_type & nwork )
+  {
+    typedef Test::ReduceFunctor< ScalarType, execution_space > functor_type;
+    typedef typename functor_type::value_type value_type;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    value_type result[ Repeat ];
+
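+    // nsum = 1 + 2 + ... + nw = nw * ( nw + 1 ) / 2, written so that the
+    // division by 2 always hits the even factor and never truncates.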
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
+
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      Kokkos::parallel_reduce( nwork, functor_type( nwork ), result[i] );
+    }
+
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( (ScalarType) correct, result[i].value[j] );
+      }
+    }
+  }
+
+  void run_test_final( const size_type & nwork )
+  {
+    typedef Test::ReduceFunctorFinal< execution_space > functor_type;
+    typedef typename functor_type::value_type value_type;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    value_type result[ Repeat ];
+
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
+
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      if ( i % 2 == 0 ) {
+        Kokkos::parallel_reduce( nwork, functor_type( nwork ), result[i] );
+      }
+      else {
+        Kokkos::parallel_reduce( "Reduce", nwork, functor_type( nwork ), result[i] );
+      }
+    }
+
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( (ScalarType) correct, -result[i].value[j] );
+      }
+    }
+  }
+};
+
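+// Same checks for the runtime-sized (array) reduction, including the mixed
+// min/max functor and the final()-negated variant.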
+template< typename ScalarType, class DeviceType >
+class TestReduceDynamic
+{
+public:
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
+
+  TestReduceDynamic( const size_type nwork )
+  {
+    run_test_dynamic( nwork );
+    run_test_dynamic_minmax( nwork );
+    run_test_dynamic_final( nwork );
+  }
+
+  void run_test_dynamic( const size_type nwork )
+  {
+    typedef Test::RuntimeReduceFunctor< ScalarType, execution_space > functor_type;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    ScalarType result[ Repeat ][ Count ];
+
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
+
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      if ( i % 2 == 0 ) {
+        Kokkos::parallel_reduce( nwork, functor_type( nwork, Count ), result[i] );
+      }
+      else {
+        Kokkos::parallel_reduce( "Reduce", nwork, functor_type( nwork, Count ), result[i] );
+      }
+    }
+
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( (ScalarType) correct, result[i][j] );
+      }
+    }
+  }
+
+  void run_test_dynamic_minmax( const size_type nwork )
+  {
+    typedef Test::RuntimeReduceMinMax< ScalarType, execution_space > functor_type;
+
+    enum { Count = 2 };
+    enum { Repeat = 100 };
+
+    ScalarType result[ Repeat ][ Count ];
+
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      if ( i % 2 == 0 ) {
+        Kokkos::parallel_reduce( nwork, functor_type( nwork, Count ), result[i] );
+      }
+      else {
+        Kokkos::parallel_reduce( "Reduce", nwork, functor_type( nwork, Count ), result[i] );
+      }
+    }
+
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        if ( nwork == 0 )
+        {
+          ScalarType amin( std::numeric_limits< ScalarType >::min() );
+          ScalarType amax( std::numeric_limits< ScalarType >::max() );
+          const ScalarType correct = ( j % 2 ) ? amax : amin;
+          ASSERT_EQ( (ScalarType) correct, result[i][j] );
+        }
+        else {
+          const unsigned long correct = j % 2 ? 1 : nwork;
+          ASSERT_EQ( (ScalarType) correct, result[i][j] );
+        }
+      }
+    }
+  }
+
+  void run_test_dynamic_final( const size_type nwork )
+  {
+    typedef Test::RuntimeReduceFunctorFinal< execution_space > functor_type;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    typename functor_type::scalar_type result[ Repeat ][ Count ];
+
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
+
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      if ( i % 2 == 0 ) {
+        Kokkos::parallel_reduce( nwork, functor_type( nwork, Count ), result[i] );
+      }
+      else {
+        Kokkos::parallel_reduce( "TestKernelReduce", nwork, functor_type( nwork, Count ), result[i] );
+      }
+
+    }
+
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( (ScalarType) correct, -result[i][j] );
+      }
+    }
+  }
+};
+
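+// Array-valued reduction writing its result through a View's host mirror
+// pointer, for array lengths 0 through CountLimit - 1.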
+template< typename ScalarType, class DeviceType >
+class TestReduceDynamicView
+{
+public:
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
+
+  TestReduceDynamicView( const size_type nwork )
+  {
+    run_test_dynamic_view( nwork );
+  }
+
+  void run_test_dynamic_view( const size_type nwork )
+  {
+    typedef Test::RuntimeReduceFunctor< ScalarType, execution_space > functor_type;
+
+    typedef Kokkos::View< ScalarType*, DeviceType > result_type;
+    typedef typename result_type::HostMirror result_host_type;
+
+    const unsigned CountLimit = 23;
+
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
+
+    for ( unsigned count = 0; count < CountLimit; ++count ) {
+
+      result_type result( "result", count );
+      result_host_type host_result = Kokkos::create_mirror( result );
+
+      // Test result to host pointer:
+
+      std::string str( "TestKernelReduce" );
+      if ( count % 2 == 0 ) {
+        Kokkos::parallel_reduce( nw, functor_type( nw, count ), host_result.data() );
+      }
+      else {
+        Kokkos::parallel_reduce( str, nw, functor_type( nw, count ), host_result.data() );
+      }
+
+      for ( unsigned j = 0; j < count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( host_result( j ), (ScalarType) correct );
+        host_result( j ) = 0;
+      }
+    }
+  }
+};
+
+} // namespace
+
+
+//--------------------------------------------------------------------------
+
+namespace Test {
+
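+// Tag type used to select the tagged operator() overloads via
+// Kokkos::RangePolicy< ExecSpace, ReducerTag >.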
+struct ReducerTag {};
+
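+// Exercises the built-in reducers (Sum, Prod, Min, Max, MinLoc, MaxLoc,
+// MinMaxLoc, BAnd, BOr, LAnd, LOr) through untagged and tagged functors,
+// comparing each result against a reference computed serially on the host.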
+template< class Scalar, class ExecSpace = Kokkos::DefaultExecutionSpace >
+struct TestReducers {
+  struct SumFunctor {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const int & i, Scalar & value ) const {
+      value += values( i );
+    }
+  };
+
+  struct ProdFunctor {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const int & i, Scalar & value ) const {
+      value *= values( i );
+    }
+  };
+
+  struct MinFunctor {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const int & i, Scalar & value ) const {
+      if ( values( i ) < value ) value = values( i );
+    }
+  };
+
+  struct MaxFunctor {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const int & i, Scalar & value ) const {
+      if ( values( i ) > value ) value = values( i );
+    }
+  };
+
+  struct MinLocFunctor {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const int & i, typename Kokkos::Experimental::MinLoc< Scalar, int >::value_type & value ) const {
+      if ( values( i ) < value.val ) {
+        value.val = values( i );
+        value.loc = i;
+      }
+    }
+  };
+
+  struct MaxLocFunctor {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const int & i, typename Kokkos::Experimental::MaxLoc< Scalar, int >::value_type & value ) const {
+      if ( values( i ) > value.val ) {
+        value.val = values( i );
+        value.loc = i;
+      }
+    }
+  };
+
+  struct MinMaxLocFunctor {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const int & i, typename Kokkos::Experimental::MinMaxLoc< Scalar, int >::value_type & value ) const {
+      if ( values( i ) > value.max_val ) {
+        value.max_val = values( i );
+        value.max_loc = i;
+      }
+
+      if ( values( i ) < value.min_val ) {
+        value.min_val = values( i );
+        value.min_loc = i;
+      }
+    }
+  };
+
+  struct BAndFunctor {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const int & i, Scalar & value ) const {
+      value = value & values( i );
+    }
+  };
+
+  struct BOrFunctor {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const int & i, Scalar & value ) const {
+      value = value | values( i );
+    }
+  };
+
+  struct LAndFunctor {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const int & i, Scalar & value ) const {
+      value = value && values( i );
+    }
+  };
+
+  struct LOrFunctor {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const int & i, Scalar & value ) const {
+      value = value || values( i );
+    }
+  };
+
+  struct SumFunctorTag {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const ReducerTag, const int & i, Scalar & value ) const {
+      value += values( i );
+    }
+  };
+
+  struct ProdFunctorTag {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const ReducerTag, const int & i, Scalar & value ) const {
+      value *= values( i );
+    }
+  };
+
+  struct MinFunctorTag {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const ReducerTag, const int & i, Scalar & value ) const {
+      if ( values( i ) < value ) value = values( i );
+    }
+  };
+
+  struct MaxFunctorTag {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const ReducerTag, const int & i, Scalar & value ) const {
+      if ( values( i ) > value ) value = values( i );
+    }
+  };
+
+  struct MinLocFunctorTag {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const ReducerTag, const int & i, typename Kokkos::Experimental::MinLoc< Scalar, int >::value_type & value ) const {
+      if ( values( i ) < value.val ) {
+        value.val = values( i );
+        value.loc = i;
+      }
+    }
+  };
+
+  struct MaxLocFunctorTag {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const ReducerTag, const int & i, typename Kokkos::Experimental::MaxLoc< Scalar, int >::value_type & value ) const {
+      if ( values( i ) > value.val ) {
+        value.val = values( i );
+        value.loc = i;
+      }
+    }
+  };
+
+  struct MinMaxLocFunctorTag {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const ReducerTag, const int & i, typename Kokkos::Experimental::MinMaxLoc< Scalar, int >::value_type & value ) const {
+      if ( values( i ) > value.max_val ) {
+        value.max_val = values( i );
+        value.max_loc = i;
+      }
+
+      if ( values( i ) < value.min_val ) {
+        value.min_val = values( i );
+        value.min_loc = i;
+      }
+    }
+  };
+
+  struct BAndFunctorTag {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const ReducerTag, const int & i, Scalar & value ) const {
+      value = value & values( i );
+    }
+  };
+
+  struct BOrFunctorTag {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const ReducerTag, const int & i, Scalar & value ) const {
+      value = value | values( i );
+    }
+  };
+
+  struct LAndFunctorTag {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const ReducerTag, const int & i, Scalar & value ) const {
+      value = value && values( i );
+    }
+  };
+
+  struct LOrFunctorTag {
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const ReducerTag, const int & i, Scalar & value ) const {
+      value = value || values( i );
+    }
+  };
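+
+  // Each test_* routine fills a host mirror with random values, computes the
+  // reference serially, deep-copies to the device, and then checks the
+  // reducer with both a scalar result and a rank-0 View result.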
+  static void test_sum( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_sum = 0;
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100 );
+      reference_sum += h_values( i );
+    }
+    Kokkos::deep_copy( values, h_values );
+
+    SumFunctor f;
+    f.values = values;
+    SumFunctorTag f_tag;
+    f_tag.values = values;
+    Scalar init = 0;
+
+    {
+      Scalar sum_scalar = init;
+      Kokkos::Experimental::Sum< Scalar > reducer_scalar( sum_scalar );
+      
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+      ASSERT_EQ( sum_scalar, reference_sum );
+     
+      sum_scalar = init;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar );
+      ASSERT_EQ( sum_scalar, reference_sum );
+
+      Scalar sum_scalar_view = reducer_scalar.reference();
+      ASSERT_EQ( sum_scalar_view, reference_sum );
+    }
+
+    {
+      Kokkos::View< Scalar, Kokkos::HostSpace > sum_view( "View" );
+      sum_view() = init;
+      Kokkos::Experimental::Sum< Scalar > reducer_view( sum_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
+      Scalar sum_view_scalar = sum_view();
+      ASSERT_EQ( sum_view_scalar, reference_sum );
+
+      Scalar sum_view_view = reducer_view.reference();
+      ASSERT_EQ( sum_view_view, reference_sum );
+    }
+  }
+
+  static void test_prod( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_prod = 1;
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 4 + 1 );
+      reference_prod *= h_values( i );
+    }
+    Kokkos::deep_copy( values, h_values );
+
+    ProdFunctor f;
+    f.values = values;
+    ProdFunctorTag f_tag;
+    f_tag.values = values;
+    Scalar init = 1;
+
+    {
+      Scalar prod_scalar = init;
+      Kokkos::Experimental::Prod< Scalar > reducer_scalar( prod_scalar );
+   
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+      ASSERT_EQ( prod_scalar, reference_prod );
+      
+      prod_scalar = init;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar );
+      ASSERT_EQ( prod_scalar, reference_prod );
+
+      Scalar prod_scalar_view = reducer_scalar.reference();
+      ASSERT_EQ( prod_scalar_view, reference_prod );
+    }
+
+    {
+      Kokkos::View< Scalar, Kokkos::HostSpace > prod_view( "View" );
+      prod_view() = init;
+      Kokkos::Experimental::Prod< Scalar > reducer_view( prod_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
+      Scalar prod_view_scalar = prod_view();
+      ASSERT_EQ( prod_view_scalar, reference_prod );
+
+      Scalar prod_view_view = reducer_view.reference();
+      ASSERT_EQ( prod_view_view, reference_prod );
+    }
+  }
+
+  static void test_min( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_min = std::numeric_limits< Scalar >::max();
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100000 );
+
+      if ( h_values( i ) < reference_min ) reference_min = h_values( i );
+    }
+    Kokkos::deep_copy( values, h_values );
+
+    MinFunctor f;
+    f.values = values;
+    MinFunctorTag f_tag;
+    f_tag.values = values;
+    Scalar init = std::numeric_limits< Scalar >::max();
+
+    {
+      Scalar min_scalar = init;
+      Kokkos::Experimental::Min< Scalar > reducer_scalar( min_scalar );
+     
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+      ASSERT_EQ( min_scalar, reference_min );
+
+      min_scalar = init;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar );
+      ASSERT_EQ( min_scalar, reference_min );
+
+      Scalar min_scalar_view = reducer_scalar.reference();
+      ASSERT_EQ( min_scalar_view, reference_min );
+    }
+
+    {
+      Kokkos::View< Scalar, Kokkos::HostSpace > min_view( "View" );
+      min_view() = init;
+      Kokkos::Experimental::Min< Scalar > reducer_view( min_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
+      Scalar min_view_scalar = min_view();
+      ASSERT_EQ( min_view_scalar, reference_min );
+
+      Scalar min_view_view = reducer_view.reference();
+      ASSERT_EQ( min_view_view, reference_min );
+    }
+  }
+
+  static void test_max( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_max = std::numeric_limits< Scalar >::min();
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100000 + 1 );
+
+      if ( h_values( i ) > reference_max ) reference_max = h_values( i );
+    }
+    Kokkos::deep_copy( values, h_values );
+
+    MaxFunctor f;
+    f.values = values;
+    MaxFunctorTag f_tag;
+    f_tag.values = values;
+    Scalar init = std::numeric_limits< Scalar >::min();
+
+    {
+      Scalar max_scalar = init;
+      Kokkos::Experimental::Max< Scalar > reducer_scalar( max_scalar );
+
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+      ASSERT_EQ( max_scalar, reference_max );
+
+      max_scalar = init;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar );
+      ASSERT_EQ( max_scalar, reference_max );
+
+      Scalar max_scalar_view = reducer_scalar.reference();
+      ASSERT_EQ( max_scalar_view, reference_max );
+    }
+
+    {
+      Kokkos::View< Scalar, Kokkos::HostSpace > max_view( "View" );
+      max_view() = init;
+      Kokkos::Experimental::Max< Scalar > reducer_view( max_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
+      Scalar max_view_scalar = max_view();
+      ASSERT_EQ( max_view_scalar, reference_max );
+
+      Scalar max_view_view = reducer_view.reference();
+      ASSERT_EQ( max_view_view, reference_max );
+    }
+  }
+
+  static void test_minloc( int N ) {
+    typedef typename Kokkos::Experimental::MinLoc< Scalar, int >::value_type value_type;
+
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_min = std::numeric_limits< Scalar >::max();
+    int reference_loc = -1;
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100000 );
+
+      if ( h_values( i ) < reference_min ) {
+        reference_min = h_values( i );
+        reference_loc = i;
+      }
+      else if ( h_values( i ) == reference_min ) {
+        // Make min unique.
+        h_values( i ) += std::numeric_limits< Scalar >::epsilon();
+      }
+    }
+    Kokkos::deep_copy( values, h_values );
+
+    MinLocFunctor f;
+    f.values = values;
+    MinLocFunctorTag f_tag;
+    f_tag.values = values;
+
+    {
+      value_type min_scalar;
+      Kokkos::Experimental::MinLoc< Scalar, int > reducer_scalar( min_scalar );
+
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+      ASSERT_EQ( min_scalar.val, reference_min );
+      ASSERT_EQ( min_scalar.loc, reference_loc );
+
+      min_scalar = value_type();
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar );
+      ASSERT_EQ( min_scalar.val, reference_min );
+      ASSERT_EQ( min_scalar.loc, reference_loc );
+
+      value_type min_scalar_view = reducer_scalar.reference();
+      ASSERT_EQ( min_scalar_view.val, reference_min );
+      ASSERT_EQ( min_scalar_view.loc, reference_loc );
+    }
+
+    {
+      Kokkos::View< value_type, Kokkos::HostSpace > min_view( "View" );
+      Kokkos::Experimental::MinLoc< Scalar, int > reducer_view( min_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
+      value_type min_view_scalar = min_view();
+      ASSERT_EQ( min_view_scalar.val, reference_min );
+      ASSERT_EQ( min_view_scalar.loc, reference_loc );
+
+      value_type min_view_view = reducer_view.reference();
+      ASSERT_EQ( min_view_view.val, reference_min );
+      ASSERT_EQ( min_view_view.loc, reference_loc );
+    }
+  }
+
+  static void test_maxloc( int N ) {
+    typedef typename Kokkos::Experimental::MaxLoc< Scalar, int >::value_type value_type;
+
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_max = std::numeric_limits< Scalar >::min();
+    int reference_loc = -1;
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100000 );
+
+      if ( h_values( i ) > reference_max ) {
+        reference_max = h_values( i );
+        reference_loc = i;
+      }
+      else if ( h_values( i ) == reference_max ) {
+        // Make max unique.
+        h_values( i ) -= std::numeric_limits< Scalar >::epsilon();
+      }
+    }
+    Kokkos::deep_copy( values, h_values );
+
+    MaxLocFunctor f;
+    f.values = values;
+    MaxLocFunctorTag f_tag;
+    f_tag.values = values;
+
+    {
+      value_type max_scalar;
+      Kokkos::Experimental::MaxLoc< Scalar, int > reducer_scalar( max_scalar );
+
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+      ASSERT_EQ( max_scalar.val, reference_max );
+      ASSERT_EQ( max_scalar.loc, reference_loc );
+
+      max_scalar = value_type();
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar );
+      ASSERT_EQ( max_scalar.val, reference_max );
+      ASSERT_EQ( max_scalar.loc, reference_loc );
+
+      value_type max_scalar_view = reducer_scalar.reference();
+      ASSERT_EQ( max_scalar_view.val, reference_max );
+      ASSERT_EQ( max_scalar_view.loc, reference_loc );
+    }
+
+    {
+      Kokkos::View< value_type, Kokkos::HostSpace > max_view( "View" );
+      Kokkos::Experimental::MaxLoc< Scalar, int > reducer_view( max_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
+      value_type max_view_scalar = max_view();
+      ASSERT_EQ( max_view_scalar.val, reference_max );
+      ASSERT_EQ( max_view_scalar.loc, reference_loc );
+
+      value_type max_view_view = reducer_view.reference();
+      ASSERT_EQ( max_view_view.val, reference_max );
+      ASSERT_EQ( max_view_view.loc, reference_loc );
+    }
+  }
+
+  static void test_minmaxloc( int N ) {
+     typedef typename Kokkos::Experimental::MinMaxLoc< Scalar, int >::value_type value_type;
+
+     Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+     auto h_values = Kokkos::create_mirror_view( values );
+     Scalar reference_max = std::numeric_limits< Scalar >::min();
+     Scalar reference_min = std::numeric_limits< Scalar >::max();
+     int reference_minloc = -1;
+     int reference_maxloc = -1;
+
+     for ( int i = 0; i < N; i++ ) {
+       h_values( i ) = (Scalar) ( rand() % 100000 );
+     }
+
+     for ( int i = 0; i < N; i++ ) {
+       if ( h_values( i ) > reference_max ) {
+         reference_max = h_values( i );
+         reference_maxloc = i;
+       }
+       else if ( h_values( i ) == reference_max ) {
+         // Make max unique.
+         h_values( i ) -= std::numeric_limits< Scalar >::epsilon();
+       }
+     }
+
+     for ( int i = 0; i < N; i++ ) {
+       if ( h_values( i ) < reference_min ) {
+         reference_min = h_values( i );
+         reference_minloc = i;
+       }
+       else if ( h_values( i ) == reference_min ) {
+         // Make min unique.
+         h_values( i ) += std::numeric_limits< Scalar >::epsilon();
+       }
+     }
+
+     Kokkos::deep_copy( values, h_values );
+
+     MinMaxLocFunctor f;
+     f.values = values;
+     MinMaxLocFunctorTag f_tag;
+     f_tag.values = values;
+
+     {
+       value_type minmax_scalar;
+       Kokkos::Experimental::MinMaxLoc< Scalar, int > reducer_scalar( minmax_scalar );
+
+       Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+       ASSERT_EQ( minmax_scalar.min_val, reference_min );
+
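+       // Duplicate extrema are possible here, so accept whichever location
+       // the reducer reports as long as the value stored there matches.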
+       for ( int i = 0; i < N; i++ ) {
+         if ( ( i == minmax_scalar.min_loc ) && ( h_values( i ) == reference_min ) ) {
+           reference_minloc = i;
+         }
+       }
+
+       ASSERT_EQ( minmax_scalar.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_scalar.max_val, reference_max );
+
+       for ( int i = 0; i < N; i++ ) {
+         if ( ( i == minmax_scalar.max_loc ) && ( h_values( i ) == reference_max ) ) {
+           reference_maxloc = i;
+         }
+       }
+
+       ASSERT_EQ( minmax_scalar.max_loc, reference_maxloc );
+
+       minmax_scalar = value_type();
+       Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar );
+       ASSERT_EQ( minmax_scalar.min_val, reference_min );
+
+       for ( int i = 0; i < N; i++ ) {
+         if ( ( i == minmax_scalar.min_loc ) && ( h_values( i ) == reference_min ) ) {
+           reference_minloc = i;
+         }
+       }
+
+       ASSERT_EQ( minmax_scalar.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_scalar.max_val, reference_max );
+
+       for ( int i = 0; i < N; i++ ) {
+         if ( ( i == minmax_scalar.max_loc ) && ( h_values( i ) == reference_max ) ) {
+           reference_maxloc = i;
+         }
+       }
+
+       ASSERT_EQ( minmax_scalar.max_loc, reference_maxloc );
+
+       value_type minmax_scalar_view = reducer_scalar.reference();
+       ASSERT_EQ( minmax_scalar_view.min_val, reference_min );
+       ASSERT_EQ( minmax_scalar_view.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_scalar_view.max_val, reference_max );
+       ASSERT_EQ( minmax_scalar_view.max_loc, reference_maxloc );
+     }
+
+     {
+       Kokkos::View< value_type, Kokkos::HostSpace > minmax_view( "View" );
+       Kokkos::Experimental::MinMaxLoc< Scalar, int > reducer_view( minmax_view );
+       Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
+       value_type minmax_view_scalar = minmax_view();
+       ASSERT_EQ( minmax_view_scalar.min_val, reference_min );
+       ASSERT_EQ( minmax_view_scalar.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_view_scalar.max_val, reference_max );
+       ASSERT_EQ( minmax_view_scalar.max_loc, reference_maxloc );
+
+       value_type minmax_view_view = reducer_view.reference();
+       ASSERT_EQ( minmax_view_view.min_val, reference_min );
+       ASSERT_EQ( minmax_view_view.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_view_view.max_val, reference_max );
+       ASSERT_EQ( minmax_view_view.max_loc, reference_maxloc );
+     }
+   }
+
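+  // Bitwise AND starts from all bits set ( Scalar() | ~Scalar() ); bitwise OR
+  // below starts from all bits clear.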
+  static void test_BAnd( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_band = Scalar() | ( ~Scalar() );
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100000 + 1 );
+      reference_band = reference_band & h_values( i );
+    }
+    Kokkos::deep_copy( values, h_values );
+
+    BAndFunctor f;
+    f.values = values;
+    BAndFunctorTag f_tag;
+    f_tag.values = values;
+    Scalar init = Scalar() | ( ~Scalar() );
+
+    {
+      Scalar band_scalar = init;
+      Kokkos::Experimental::BAnd< Scalar > reducer_scalar( band_scalar );
+
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+      ASSERT_EQ( band_scalar, reference_band );
+
+      band_scalar = init;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar );
+      ASSERT_EQ( band_scalar, reference_band );
+
+      Scalar band_scalar_view = reducer_scalar.reference();
+
+      ASSERT_EQ( band_scalar_view, reference_band );
+    }
+
+    {
+      Kokkos::View< Scalar, Kokkos::HostSpace > band_view( "View" );
+      band_view() = init;
+      Kokkos::Experimental::BAnd< Scalar > reducer_view( band_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
+      Scalar band_view_scalar = band_view();
+      ASSERT_EQ( band_view_scalar, reference_band );
+
+      Scalar band_view_view = reducer_view.reference();
+      ASSERT_EQ( band_view_view, reference_band );
+    }
+  }
+
+  static void test_BOr( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_bor = Scalar() & ( ~Scalar() );
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( ( rand() % 100000 + 1 ) * 2 );
+      reference_bor = reference_bor | h_values( i );
+    }
+    Kokkos::deep_copy( values, h_values );
+
+    BOrFunctor f;
+    f.values = values;
+    BOrFunctorTag f_tag;
+    f_tag.values = values;
+    Scalar init = Scalar() & ( ~Scalar() );
+
+    {
+      Scalar bor_scalar = init;
+      Kokkos::Experimental::BOr< Scalar > reducer_scalar( bor_scalar );
+
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+      ASSERT_EQ( bor_scalar, reference_bor );
+
+      bor_scalar = init;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar );
+      ASSERT_EQ( bor_scalar, reference_bor );
+
+      Scalar bor_scalar_view = reducer_scalar.reference();
+      ASSERT_EQ( bor_scalar_view, reference_bor );
+    }
+
+    {
+      Kokkos::View< Scalar, Kokkos::HostSpace > bor_view( "View" );
+      bor_view() = init;
+      Kokkos::Experimental::BOr< Scalar > reducer_view( bor_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
+      Scalar bor_view_scalar = bor_view();
+      ASSERT_EQ( bor_view_scalar, reference_bor );
+
+      Scalar bor_view_view = reducer_view.reference();
+      ASSERT_EQ( bor_view_view, reference_bor );
+    }
+  }
+
+  static void test_LAnd( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_land = 1;
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 2 );
+      reference_land = reference_land && h_values( i );
+    }
+    Kokkos::deep_copy( values, h_values );
+
+    LAndFunctor f;
+    f.values = values;
+    LAndFunctorTag f_tag;
+    f_tag.values = values;
+    Scalar init = 1;
+
+    {
+      Scalar land_scalar = init;
+      Kokkos::Experimental::LAnd< Scalar > reducer_scalar( land_scalar );
+
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+      ASSERT_EQ( land_scalar, reference_land );
+
+      land_scalar = init;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar );
+      ASSERT_EQ( land_scalar, reference_land );
+
+      Scalar land_scalar_view = reducer_scalar.reference();
+      ASSERT_EQ( land_scalar_view, reference_land );
+    }
+
+    {
+      Kokkos::View< Scalar, Kokkos::HostSpace > land_view( "View" );
+      land_view() = init;
+      Kokkos::Experimental::LAnd< Scalar > reducer_view( land_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
+      Scalar land_view_scalar = land_view();
+      ASSERT_EQ( land_view_scalar, reference_land );
+
+      Scalar land_view_view = reducer_view.reference();
+      ASSERT_EQ( land_view_view, reference_land );
+    }
+  }
+
+  static void test_LOr( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_lor = 0;
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 2 );
+      reference_lor = reference_lor || h_values( i );
+    }
+    Kokkos::deep_copy( values, h_values );
+
+    LOrFunctor f;
+    f.values = values;
+    LOrFunctorTag f_tag;
+    f_tag.values = values;
+    Scalar init = 0;
+
+    {
+      Scalar lor_scalar = init;
+      Kokkos::Experimental::LOr< Scalar > reducer_scalar( lor_scalar );
+
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+      ASSERT_EQ( lor_scalar, reference_lor );
+
+      lor_scalar = init;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar );
+      ASSERT_EQ( lor_scalar, reference_lor );
+
+      Scalar lor_scalar_view = reducer_scalar.reference();
+      ASSERT_EQ( lor_scalar_view, reference_lor );
+    }
+
+    {
+      Kokkos::View< Scalar, Kokkos::HostSpace > lor_view( "View" );
+      lor_view() = init;
+      Kokkos::Experimental::LOr< Scalar > reducer_view( lor_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
+      Scalar lor_view_scalar = lor_view();
+      ASSERT_EQ( lor_view_scalar, reference_lor );
+
+      Scalar lor_view_view = reducer_view.reference();
+      ASSERT_EQ( lor_view_view, reference_lor );
+    }
+  }
+
+  static void execute_float() {
+    test_sum( 10001 );
+    test_prod( 35 );
+    test_min( 10003 );
+    test_minloc( 10003 );
+    test_max( 10007 );
+    test_maxloc( 10007 );
+    test_minmaxloc( 10007 );
+  }
+
+  static void execute_integer() {
+    test_sum( 10001 );
+    test_prod( 35 );
+    test_min( 10003 );
+    test_minloc( 10003 );
+    test_max( 10007 );
+    test_maxloc( 10007 );
+    test_minmaxloc( 10007 );
+    test_BAnd( 35 );
+    test_BOr( 35 );
+    test_LAnd( 35 );
+    test_LOr( 35 );
+  }
+
+  static void execute_basic() {
+    test_sum( 10001 );
+    test_prod( 35 );
+  }
+};
+
+
+TEST_F( TEST_CATEGORY, long_reduce )
+{
+  TestReduce< long, TEST_EXECSPACE >( 0 );
+  TestReduce< long, TEST_EXECSPACE >( 1000000 );
+}
+
+TEST_F( TEST_CATEGORY, double_reduce )
+{
+  TestReduce< double, TEST_EXECSPACE >( 0 );
+  TestReduce< double, TEST_EXECSPACE >( 1000000 );
+}
+
+TEST_F( TEST_CATEGORY, reducers )
+{
+  TestReducers< int, TEST_EXECSPACE >::execute_integer();
+  TestReducers< size_t, TEST_EXECSPACE >::execute_integer();
+  TestReducers< double, TEST_EXECSPACE >::execute_float();
+  TestReducers< Kokkos::complex<double>, TEST_EXECSPACE >::execute_basic();
+}
+
+TEST_F( TEST_CATEGORY, long_reduce_dynamic )
+{
+  TestReduceDynamic< long, TEST_EXECSPACE >( 0 );
+  TestReduceDynamic< long, TEST_EXECSPACE >( 1000000 );
+}
+
+TEST_F( TEST_CATEGORY, double_reduce_dynamic )
+{
+  TestReduceDynamic< double, TEST_EXECSPACE >( 0 );
+  TestReduceDynamic< double, TEST_EXECSPACE >( 1000000 );
+}
+
+TEST_F( TEST_CATEGORY, long_reduce_dynamic_view )
+{
+  TestReduceDynamicView< long, TEST_EXECSPACE >( 0 );
+  TestReduceDynamicView< long, TEST_EXECSPACE >( 1000000 );
+}
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp b/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3b5fe712cc69ac1f30f4b13b7bb03faeed8841e1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp
@@ -0,0 +1,597 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+#include <limits>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+namespace ReduceCombinatorical {
+
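+// Minimal custom reducer implementing the reducer concept: nested reducer and
+// value_type typedefs, join()/init(), and view()/reference() accessors. Its
+// join() adds an extra 1 per combine, so the result exceeds the plain sum
+// whenever more than one partial value is merged, which the concurrency
+// dependent checks in AddReturnArgument rely on.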
+template< class Scalar, class Space = Kokkos::HostSpace >
+struct AddPlus {
+public:
+  // Required.
+  typedef AddPlus reducer;
+  typedef Scalar value_type;
+
+  typedef Kokkos::View< value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  result_view_type result;
+
+public:
+  AddPlus( value_type & result_ ) : result( &result_ ) {}
+
+  // Required.
+  KOKKOS_INLINE_FUNCTION
+  void join( value_type & dest, const value_type & src ) const {
+    dest += src + 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dest, const volatile value_type & src ) const {
+    dest += src + 1;
+  }
+
+  // Optional.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & val )  const {
+    val = value_type();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type& reference() const {
+    return result();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  result_view_type view() const {
+    return result;
+  }
+};
+
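+// Family of reduction functors for the combinatorial tests: ISTEAM = 0
+// variants take a RangePolicy index, ISTEAM = 1 variants take a TeamPolicy
+// member; the Init / Join / Final name suffixes indicate which optional
+// reduction hooks each functor provides.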
+template< int ISTEAM >
+struct FunctorScalar;
+
+template<>
+struct FunctorScalar< 0 > {
+  Kokkos::View< double > result;
+
+  FunctorScalar( Kokkos::View< double > r ) : result( r ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & i, double & update ) const {
+    update += i;
+  }
+};
+
+template<>
+struct FunctorScalar< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+
+  Kokkos::View< double > result;
+
+  FunctorScalar( Kokkos::View< double > r ) : result( r ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
+  }
+};
+
+template< int ISTEAM >
+struct FunctorScalarInit;
+
+template<>
+struct FunctorScalarInit< 0 > {
+  Kokkos::View< double > result;
+
+  FunctorScalarInit( Kokkos::View< double > r ) : result( r ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & i, double & update ) const {
+    update += i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( double & update ) const {
+    update = 0.0;
+  }
+};
+
+template<>
+struct FunctorScalarInit< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+
+  Kokkos::View< double > result;
+
+  FunctorScalarInit( Kokkos::View< double > r ) : result( r ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( double & update ) const {
+    update = 0.0;
+  }
+};
+
+template< int ISTEAM >
+struct FunctorScalarFinal;
+
+template<>
+struct FunctorScalarFinal< 0 > {
+  Kokkos::View<double> result;
+
+  FunctorScalarFinal( Kokkos::View< double > r ) : result( r ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & i, double & update ) const {
+    update += i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void final( double & update ) const {
+    result() = update;
+  }
+};
+
+template<>
+struct FunctorScalarFinal< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+
+  Kokkos::View< double > result;
+
+  FunctorScalarFinal( Kokkos::View< double > r ) : result( r ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void final( double & update ) const {
+    result() = update;
+  }
+};
+
+template< int ISTEAM >
+struct FunctorScalarJoin;
+
+template<>
+struct FunctorScalarJoin< 0 > {
+  Kokkos::View<double> result;
+
+  FunctorScalarJoin( Kokkos::View< double > r ) : result( r ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & i, double & update ) const {
+    update += i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile double & dst, const volatile double & update ) const {
+    dst += update;
+  }
+};
+
+template<>
+struct FunctorScalarJoin< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+
+  Kokkos::View< double > result;
+
+  FunctorScalarJoin( Kokkos::View< double > r ) : result( r ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile double & dst, const volatile double & update ) const {
+    dst += update;
+  }
+};
+
+template< int ISTEAM >
+struct FunctorScalarJoinFinal;
+
+template<>
+struct FunctorScalarJoinFinal< 0 > {
+  Kokkos::View< double > result;
+
+  FunctorScalarJoinFinal( Kokkos::View< double > r ) : result( r ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & i, double & update ) const {
+    update += i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile double & dst, const volatile double & update ) const {
+    dst += update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void final( double & update ) const {
+    result() = update;
+  }
+};
+
+template<>
+struct FunctorScalarJoinFinal< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+
+  Kokkos::View< double > result;
+
+  FunctorScalarJoinFinal( Kokkos::View< double > r ) : result( r ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile double & dst, const volatile double & update ) const {
+    dst += update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void final( double & update ) const {
+    result() = update;
+  }
+};
+
+template< int ISTEAM >
+struct FunctorScalarJoinInit;
+
+template<>
+struct FunctorScalarJoinInit< 0 > {
+  Kokkos::View< double > result;
+
+  FunctorScalarJoinInit( Kokkos::View< double > r ) : result( r ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & i, double & update ) const {
+    update += i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile double & dst, const volatile double & update ) const {
+    dst += update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( double & update ) const {
+    update = 0.0;
+  }
+};
+
+template<>
+struct FunctorScalarJoinInit< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+
+  Kokkos::View< double > result;
+
+  FunctorScalarJoinInit( Kokkos::View< double > r ) : result( r ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile double & dst, const volatile double & update ) const {
+    dst += update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( double & update ) const {
+    update = 0.0;
+  }
+};
+
+template< int ISTEAM >
+struct FunctorScalarJoinFinalInit;
+
+template<>
+struct FunctorScalarJoinFinalInit< 0 > {
+  Kokkos::View<double> result;
+
+  FunctorScalarJoinFinalInit( Kokkos::View< double > r ) : result( r ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & i, double & update ) const {
+    update += i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile double & dst, const volatile double & update ) const {
+    dst += update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void final( double & update ) const {
+    result() = update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( double & update ) const {
+    update = 0.0;
+  }
+};
+
+template<>
+struct FunctorScalarJoinFinalInit< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+
+  Kokkos::View< double > result;
+
+  FunctorScalarJoinFinalInit( Kokkos::View< double > r ) : result( r ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile double & dst, const volatile double & update ) const {
+    dst += update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void final( double & update ) const {
+    result() = update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( double & update ) const {
+    update = 0.0;
+  }
+};
+
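+// Functor1 is a plain scalar reduction; Functor2 reduces into a runtime-sized
+// array of value_count entries.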
+struct Functor1 {
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & i, double & update ) const {
+    update += i;
+  }
+};
+
+struct Functor2 {
+  typedef double value_type[];
+
+  const unsigned value_count;
+
+  Functor2( unsigned n ) : value_count( n ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned & i, double update[] ) const {
+    for ( unsigned j = 0; j < value_count; j++ ) {
+      update[j] += i;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( double dst[] ) const
+  {
+    for ( unsigned i = 0; i < value_count; ++i ) dst[i] = 0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile double dst[],
+             const volatile double src[] ) const
+  {
+    for ( unsigned i = 0; i < value_count; ++i ) dst[i] += src[i];
+  }
+};
+
+} // namespace ReduceCombinatorical
+
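+// Builds parallel_reduce calls argument by argument: AddReturnArgument
+// appends each supported result kind (plain scalar, rank-0 View, unmanaged
+// View wrapping a stack variable, custom AddPlus reducer), while the
+// AddFunctor / AddLambda helpers append the different functor and lambda
+// flavors.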
+template< class ExecSpace = Kokkos::DefaultExecutionSpace >
+struct TestReduceCombinatoricalInstantiation {
+  template< class ... Args >
+  static void CallParallelReduce( Args... args ) {
+    Kokkos::parallel_reduce( args... );
+  }
+
+  template< class ... Args >
+  static void AddReturnArgument( Args... args ) {
+    Kokkos::View< double, Kokkos::HostSpace > result_view( "ResultView" );
+    double expected_result = 1000.0 * 999.0 / 2.0;
+
+    double value = 0;
+    Kokkos::parallel_reduce( args..., value );
+    ASSERT_EQ( expected_result, value );
+
+    result_view() = 0;
+    CallParallelReduce( args..., result_view );
+    ASSERT_EQ( expected_result, result_view() );
+
+    value = 0;
+    CallParallelReduce( args..., Kokkos::View< double, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >( &value ) );
+    ASSERT_EQ( expected_result, value );
+
+    result_view() = 0;
+    const Kokkos::View< double, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_const_um = result_view;
+    CallParallelReduce( args..., result_view_const_um );
+    ASSERT_EQ( expected_result, result_view_const_um() );
+
+    value = 0;
+    CallParallelReduce( args..., Test::ReduceCombinatorical::AddPlus< double >( value ) );
+    if ( ( Kokkos::DefaultExecutionSpace::concurrency() > 1 ) && ( ExecSpace::concurrency() > 1 ) ) {
+      ASSERT_TRUE( expected_result < value );
+    }
+    else if ( ( Kokkos::DefaultExecutionSpace::concurrency() > 1 ) || ( ExecSpace::concurrency() > 1 ) ) {
+      ASSERT_TRUE( expected_result <= value );
+    }
+    else {
+      ASSERT_EQ( expected_result, value );
+    }
+
+    value = 0;
+    Test::ReduceCombinatorical::AddPlus< double > add( value );
+    CallParallelReduce( args..., add );
+    if ( ( Kokkos::DefaultExecutionSpace::concurrency() > 1 ) && ( ExecSpace::concurrency() > 1 ) ) {
+      ASSERT_TRUE( expected_result < value );
+    }
+    else if ( ( Kokkos::DefaultExecutionSpace::concurrency() > 1 ) || ( ExecSpace::concurrency() > 1 ) ) {
+      ASSERT_TRUE( expected_result <= value );
+    }
+    else {
+      ASSERT_EQ( expected_result, value );
+    }
+  }
+
+  template< class ... Args >
+  static void AddLambdaRange( void*, Args... args ) {
+    AddReturnArgument( args..., KOKKOS_LAMBDA ( const int & i, double & lsum ) {
+      lsum += i;
+    });
+  }
+
+  template< class ... Args >
+  static void AddLambdaTeam( void*, Args... args ) {
+    AddReturnArgument( args..., KOKKOS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type & team, double & update ) {
+      update += 1.0 / team.team_size() * team.league_rank();
+    });
+  }
+
+  template< class ... Args >
+  static void AddLambdaRange( Kokkos::InvalidType, Args... args ) {}
+
+  template< class ... Args >
+  static void AddLambdaTeam( Kokkos::InvalidType, Args... args ) {}
+
+  template< int ISTEAM, class ... Args >
+  static void AddFunctor( Args... args ) {
+    Kokkos::View< double > result_view( "FunctorView" );
+    auto h_r = Kokkos::create_mirror_view( result_view );
+    Test::ReduceCombinatorical::FunctorScalar< ISTEAM > functor( result_view );
+    double expected_result = 1000.0 * 999.0 / 2.0;
+
+    AddReturnArgument( args..., functor );
+    AddReturnArgument( args..., Test::ReduceCombinatorical::FunctorScalar< ISTEAM >( result_view ) );
+    AddReturnArgument( args..., Test::ReduceCombinatorical::FunctorScalarInit< ISTEAM >( result_view ) );
+    AddReturnArgument( args..., Test::ReduceCombinatorical::FunctorScalarJoin< ISTEAM >( result_view ) );
+    AddReturnArgument( args..., Test::ReduceCombinatorical::FunctorScalarJoinInit< ISTEAM >( result_view ) );
+
+    h_r() = 0;
+    Kokkos::deep_copy( result_view, h_r );
+    CallParallelReduce( args..., Test::ReduceCombinatorical::FunctorScalarFinal< ISTEAM >( result_view ) );
+    Kokkos::deep_copy( h_r, result_view );
+    ASSERT_EQ( expected_result, h_r() );
+
+    h_r() = 0;
+    Kokkos::deep_copy( result_view, h_r );
+    CallParallelReduce( args..., Test::ReduceCombinatorical::FunctorScalarJoinFinal< ISTEAM >( result_view ) );
+    Kokkos::deep_copy( h_r, result_view );
+    ASSERT_EQ( expected_result, h_r() );
+
+    h_r() = 0;
+    Kokkos::deep_copy( result_view, h_r );
+    CallParallelReduce( args..., Test::ReduceCombinatorical::FunctorScalarJoinFinalInit< ISTEAM >( result_view ) );
+    Kokkos::deep_copy( h_r, result_view );
+    ASSERT_EQ( expected_result, h_r() );
+  }
+
+  template< class ... Args >
+  static void AddFunctorLambdaRange( Args... args ) {
+    AddFunctor< 0, Args... >( args... );
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+    AddLambdaRange( typename std::conditional< std::is_same<ExecSpace, Kokkos::DefaultExecutionSpace>::value, void*, Kokkos::InvalidType >::type(), args... );
+#endif
+  }
+
+  template< class ... Args >
+  static void AddFunctorLambdaTeam( Args... args ) {
+    AddFunctor< 1, Args... >( args... );
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+    AddLambdaTeam( typename std::conditional< std::is_same<ExecSpace, Kokkos::DefaultExecutionSpace>::value, void*, Kokkos::InvalidType >::type(), args... );
+#endif
+  }
+
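+  // Combine every policy variant below with every functor / lambda / result-argument variant above, so
+  // parallel_reduce is instantiated across the full combinatorial space.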
+  template< class ... Args >
+  static void AddPolicy( Args... args ) {
+    int N = 1000;
+    Kokkos::RangePolicy< ExecSpace > policy( 0, N );
+
+    AddFunctorLambdaRange( args..., 1000 );
+    AddFunctorLambdaRange( args..., N );
+    AddFunctorLambdaRange( args..., policy );
+    AddFunctorLambdaRange( args..., Kokkos::RangePolicy< ExecSpace >( 0, N ) );
+    AddFunctorLambdaRange( args..., Kokkos::RangePolicy< ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >( 0, N ) );
+    AddFunctorLambdaRange( args..., Kokkos::RangePolicy< ExecSpace, Kokkos::Schedule<Kokkos::Static> >( 0, N ).set_chunk_size( 10 ) );
+    AddFunctorLambdaRange( args..., Kokkos::RangePolicy< ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >( 0, N ).set_chunk_size( 10 ) );
+
+    AddFunctorLambdaTeam( args..., Kokkos::TeamPolicy< ExecSpace >( N, Kokkos::AUTO ) );
+    AddFunctorLambdaTeam( args..., Kokkos::TeamPolicy< ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >( N, Kokkos::AUTO ) );
+    AddFunctorLambdaTeam( args..., Kokkos::TeamPolicy< ExecSpace, Kokkos::Schedule<Kokkos::Static> >( N, Kokkos::AUTO ).set_chunk_size( 10 ) );
+    AddFunctorLambdaTeam( args..., Kokkos::TeamPolicy< ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >( N, Kokkos::AUTO ).set_chunk_size( 10 ) );
+  }
+
+  static void execute_a() {
+    AddPolicy();
+  }
+
+  static void execute_b() {
+    std::string s( "Std::String" );
+    AddPolicy( s.c_str() );
+    AddPolicy( "Char Constant" );
+  }
+
+  static void execute_c() {
+    std::string s( "Std::String" );
+    AddPolicy( s );
+  }
+};
+
+} // namespace Test
+
diff --git a/packages/kokkos/core/unit_test/TestResize.hpp b/packages/kokkos/core/unit_test/TestResize.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d62bc68b31d77583b16f4cf0dfce6394f5f099b8
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestResize.hpp
@@ -0,0 +1,140 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef TESTRESIZE_HPP_
+#define TESTRESIZE_HPP_
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+
+namespace TestViewResize {
+
+template<class DeviceType>
+void testResize ()
+{
+  const int sizes[8] = {2, 3, 4, 5, 6, 7, 8, 9};
+
+  // Check #904 fix (no reallocation if dimensions didn't change).
+  {
+    typedef Kokkos::View<int*, DeviceType> view_type;
+    view_type view_1d ("view_1d", sizes[0]);
+    const int* oldPointer = view_1d.data ();
+    EXPECT_TRUE( oldPointer != NULL );
+    Kokkos::resize (view_1d, sizes[0]);
+    const int* newPointer = view_1d.data ();
+    EXPECT_TRUE( oldPointer == newPointer );
+  }
+  {
+    typedef Kokkos::View<int**, DeviceType> view_type;
+    view_type view_2d ("view_2d", sizes[0], sizes[1]);
+    const int* oldPointer = view_2d.data ();
+    EXPECT_TRUE( oldPointer != NULL );
+    Kokkos::resize (view_2d, sizes[0], sizes[1]);
+    const int* newPointer = view_2d.data ();
+    EXPECT_TRUE( oldPointer == newPointer );
+  }
+  {
+    typedef Kokkos::View<int***, DeviceType> view_type;
+    view_type view_3d ("view_3d", sizes[0], sizes[1], sizes[2]);
+    const int* oldPointer = view_3d.data ();
+    EXPECT_TRUE( oldPointer != NULL );
+    Kokkos::resize (view_3d, sizes[0], sizes[1], sizes[2]);
+    const int* newPointer = view_3d.data ();
+    EXPECT_TRUE( oldPointer == newPointer );
+  }
+  {
+    typedef Kokkos::View<int****, DeviceType> view_type;
+    view_type view_4d ("view_4d", sizes[0], sizes[1], sizes[2], sizes[3]);
+    const int* oldPointer = view_4d.data ();
+    EXPECT_TRUE( oldPointer != NULL );
+    Kokkos::resize (view_4d, sizes[0], sizes[1], sizes[2], sizes[3]);
+    const int* newPointer = view_4d.data ();
+    EXPECT_TRUE( oldPointer == newPointer );
+  }
+  {
+    typedef Kokkos::View<int*****, DeviceType> view_type;
+    view_type view_5d ("view_5d", sizes[0], sizes[1], sizes[2], sizes[3],
+                       sizes[4]);
+    const int* oldPointer = view_5d.data ();
+    EXPECT_TRUE( oldPointer != NULL );
+    Kokkos::resize (view_5d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4]);
+    const int* newPointer = view_5d.data ();
+    EXPECT_TRUE( oldPointer == newPointer );
+  }
+  {
+    typedef Kokkos::View<int******, DeviceType> view_type;
+    view_type view_6d ("view_6d", sizes[0], sizes[1], sizes[2], sizes[3],
+                       sizes[4], sizes[5]);
+    const int* oldPointer = view_6d.data ();
+    EXPECT_TRUE( oldPointer != NULL );
+    Kokkos::resize (view_6d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4],
+                    sizes[5]);
+    const int* newPointer = view_6d.data ();
+    EXPECT_TRUE( oldPointer == newPointer );
+  }
+  {
+    typedef Kokkos::View<int*******, DeviceType> view_type;
+    view_type view_7d ("view_7d", sizes[0], sizes[1], sizes[2], sizes[3],
+                       sizes[4], sizes[5], sizes[6]);
+    const int* oldPointer = view_7d.data ();
+    EXPECT_TRUE( oldPointer != NULL );
+    Kokkos::resize (view_7d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4],
+                    sizes[5], sizes[6]);
+    const int* newPointer = view_7d.data ();
+    EXPECT_TRUE( oldPointer == newPointer );
+  }
+  {
+    typedef Kokkos::View<int********, DeviceType> view_type;
+    view_type view_8d ("view_8d", sizes[0], sizes[1], sizes[2], sizes[3],
+                       sizes[4], sizes[5], sizes[6], sizes[7]);
+    const int* oldPointer = view_8d.data ();
+    EXPECT_TRUE( oldPointer != NULL );
+    Kokkos::resize (view_8d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4],
+                    sizes[5], sizes[6], sizes[7]);
+    const int* newPointer = view_8d.data ();
+    EXPECT_TRUE( oldPointer == newPointer );
+  }
+}
+
+} // namespace TestViewResize
+
+#endif // TESTRESIZE_HPP_
diff --git a/packages/kokkos/core/unit_test/TestScan.hpp b/packages/kokkos/core/unit_test/TestScan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7a0948e7ace118114661a029e2826f3c45cecebe
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestScan.hpp
@@ -0,0 +1,142 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+namespace Test {
+
+template< class Device, class WorkSpec = size_t >
+struct TestScan {
+  typedef  Device    execution_space;
+  typedef  long int  value_type;
+
+  Kokkos::View< int, Device, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int iwork, value_type & update, const bool final_pass ) const
+  {
+    const value_type n = iwork + 1;
+    const value_type imbalance = ( ( 1000 <= n ) && ( 0 == n % 1000 ) ) ? 1000 : 0;
+
+    // Insert an artificial load imbalance
+
+    for ( value_type i = 0; i < imbalance; ++i ) { ++update; }
+
+    update += n - imbalance;
+
+    if ( final_pass ) {
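+      // The inclusive prefix sum up to n is 1 + 2 + ... + n = n*(n+1)/2; the even factor is halved first so every intermediate stays integral.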
+      const value_type answer = n & 1 ? ( n * ( ( n + 1 ) / 2 ) ) : ( ( n / 2 ) * ( n + 1 ) );
+
+      if ( answer != update ) {
+        errors()++;
+
+        if ( errors() < 20 ) {
+          printf( "TestScan(%d,%ld) != %ld\n", iwork, update, answer );
+        }
+      }
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & update ) const { update = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile       value_type & update,
+             volatile const value_type & input ) const
+  { update += input; }
+
+  TestScan( const WorkSpec & N )
+  {
+    Kokkos::View< int, Device > errors_a( "Errors" );
+    Kokkos::deep_copy( errors_a, 0 );
+    errors = errors_a;
+
+    Kokkos::parallel_scan( N , *this );
+  }
+
+  TestScan( const WorkSpec & Start , const WorkSpec & N )
+  {
+    typedef Kokkos::RangePolicy< execution_space > exec_policy ;
+
+    Kokkos::View< int, Device > errors_a( "Errors" );
+    Kokkos::deep_copy( errors_a, 0 );
+    errors = errors_a;
+
+    Kokkos::parallel_scan( exec_policy( Start , N ) , *this );
+  }
+
+  static void test_range( const WorkSpec & begin, const WorkSpec & end )
+  {
+    for ( WorkSpec i = begin; i < end; ++i ) {
+      (void) TestScan( i );
+    }
+  }
+};
+
+TEST_F( TEST_CATEGORY, scan )
+{
+  TestScan< TEST_EXECSPACE >::test_range( 1, 1000 );
+  TestScan< TEST_EXECSPACE >( 0 );
+  TestScan< TEST_EXECSPACE >( 100000 );
+  TestScan< TEST_EXECSPACE >( 10000000 );
+  TEST_EXECSPACE::fence();
+}
+
+
+/*TEST_F( TEST_CATEGORY, scan_small )
+{
+  typedef TestScan< TEST_EXECSPACE, Kokkos::Impl::ThreadsExecUseScanSmall > TestScanFunctor;
+
+  for ( int i = 0; i < 1000; ++i ) {
+    TestScanFunctor( 10 );
+    TestScanFunctor( 10000 );
+  }
+  TestScanFunctor( 1000000 );
+  TestScanFunctor( 10000000 );
+
+  TEST_EXECSPACE::fence();
+}*/
+
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestSharedAlloc.hpp b/packages/kokkos/core/unit_test/TestSharedAlloc.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1dc52f0fd1aada419ee4397dea28dc3396a235ad
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestSharedAlloc.hpp
@@ -0,0 +1,210 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+struct SharedAllocDestroy {
+  volatile int * count;
+
+  SharedAllocDestroy() = default;
+  SharedAllocDestroy( int * arg ) : count( arg ) {}
+
+  void destroy_shared_allocation()
+  {
+    Kokkos::atomic_increment( count );
+  }
+};
+
+template< class MemorySpace, class ExecutionSpace >
+void test_shared_alloc()
+{
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  typedef const Kokkos::Impl::SharedAllocationHeader                               Header;
+  typedef Kokkos::Impl::SharedAllocationTracker                                    Tracker;
+  typedef Kokkos::Impl::SharedAllocationRecord< void, void >                       RecordBase;
+  typedef Kokkos::Impl::SharedAllocationRecord< MemorySpace, void >                RecordMemS;
+  typedef Kokkos::Impl::SharedAllocationRecord< MemorySpace, SharedAllocDestroy >  RecordFull;
+
+  static_assert( sizeof( Tracker ) == sizeof( int* ), "SharedAllocationTracker has wrong size!" );
+
+  MemorySpace s;
+
+  const size_t N = 1200;
+  const size_t size = 8;
+
+  RecordMemS * rarray[ N ];
+  Header     * harray[ N ];
+
+  RecordMemS ** const r = rarray;
+  Header     ** const h = harray;
+
+  Kokkos::RangePolicy< ExecutionSpace > range( 0, N );
+
+  {
+    // Since always executed on host space, leave [=]
+    Kokkos::parallel_for( range, [=] ( size_t i ) {
+      char name[64];
+      sprintf( name, "test_%.2d", int( i ) );
+
+      r[i] = RecordMemS::allocate( s, name, size * ( i + 1 ) );
+      h[i] = Header::get_header( r[i]->data() );
+
+      ASSERT_EQ( r[i]->use_count(), 0 );
+
+      for ( size_t j = 0; j < ( i / 10 ) + 1; ++j ) RecordBase::increment( r[i] );
+
+      ASSERT_EQ( r[i]->use_count(), ( i / 10 ) + 1 );
+      ASSERT_EQ( r[i], RecordMemS::get_record( r[i]->data() ) );
+    });
+
+    // Sanity check for the whole set of allocation records to which this record belongs.
+    RecordBase::is_sane( r[0] );
+    // RecordMemS::print_records( std::cout, s, true );
+
+    Kokkos::parallel_for( range, [=] ( size_t i ) {
+      while ( 0 != ( r[i] = static_cast< RecordMemS * >( RecordBase::decrement( r[i] ) ) ) ) {
+        if ( r[i]->use_count() == 1 ) RecordBase::is_sane( r[i] );
+      }
+    });
+  }
+
+  {
+    int destroy_count = 0;
+    SharedAllocDestroy counter( &destroy_count );
+
+    Kokkos::parallel_for( range, [=] ( size_t i ) {
+      char name[64];
+      sprintf( name, "test_%.2d", int( i ) );
+
+      RecordFull * rec = RecordFull::allocate( s, name, size * ( i + 1 ) );
+
+      rec->m_destroy = counter;
+
+      r[i] = rec;
+      h[i] = Header::get_header( r[i]->data() );
+
+      ASSERT_EQ( r[i]->use_count(), 0 );
+
+      for ( size_t j = 0; j < ( i / 10 ) + 1; ++j ) RecordBase::increment( r[i] );
+
+      ASSERT_EQ( r[i]->use_count(), ( i / 10 ) + 1 );
+      ASSERT_EQ( r[i], RecordMemS::get_record( r[i]->data() ) );
+    });
+
+    RecordBase::is_sane( r[0] );
+
+    Kokkos::parallel_for( range, [=] ( size_t i ) {
+      while ( 0 != ( r[i] = static_cast< RecordMemS * >( RecordBase::decrement( r[i] ) ) ) ) {
+        if ( r[i]->use_count() == 1 ) RecordBase::is_sane( r[i] );
+      }
+    });
+
+    ASSERT_EQ( destroy_count, int( N ) );
+  }
+
+  {
+    int destroy_count = 0;
+
+    {
+      RecordFull * rec = RecordFull::allocate( s, "test", size );
+
+      // ... Construction of the allocated { rec->data(), rec->size() }
+
+      // Copy destruction function object into the allocation record.
+      rec->m_destroy = SharedAllocDestroy( & destroy_count );
+
+      ASSERT_EQ( rec->use_count(), 0 );
+
+      // Start tracking, increments the use count from 0 to 1.
+      Tracker track;
+
+      track.assign_allocated_record_to_uninitialized( rec );
+
+      ASSERT_EQ( rec->use_count(), 1 );
+      ASSERT_EQ( track.use_count(), 1 );
+
+      // Verify construction / destruction increment.
+      for ( size_t i = 0; i < N; ++i ) {
+        ASSERT_EQ( rec->use_count(), 1 );
+
+        {
+          Tracker local_tracker;
+          local_tracker.assign_allocated_record_to_uninitialized( rec );
+          ASSERT_EQ( rec->use_count(), 2 );
+          ASSERT_EQ( local_tracker.use_count(), 2 );
+        }
+
+        ASSERT_EQ( rec->use_count(), 1 );
+        ASSERT_EQ( track.use_count(), 1 );
+      }
+
+      Kokkos::parallel_for( range, [=] ( size_t i ) {
+        Tracker local_tracker;
+        local_tracker.assign_allocated_record_to_uninitialized( rec );
+        ASSERT_GT( rec->use_count(), 1 );
+      });
+
+      ASSERT_EQ( rec->use_count(), 1 );
+      ASSERT_EQ( track.use_count(), 1 );
+
+      // Destruction of 'track' object deallocates the 'rec' and invokes the destroy function object.
+    }
+
+    ASSERT_EQ( destroy_count, 1 );
+  }
+
+#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
+
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestTaskScheduler.hpp b/packages/kokkos/core/unit_test/TestTaskScheduler.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f080c0d3be2c186ba5198be2a9284af3b2594116
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestTaskScheduler.hpp
@@ -0,0 +1,667 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_UNITTEST_TASKSCHEDULER_HPP
+#define KOKKOS_UNITTEST_TASKSCHEDULER_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ENABLE_TASKDAG )
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+#include <iostream>
+#include <cmath>
+
+
+namespace TestTaskScheduler {
+
+namespace {
+
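+// Serial reference: iterate Fibonacci with a four-entry circular buffer (indexed by i & 3), so only O(1) storage is needed.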
+inline
+long eval_fib( long n )
+{
+  constexpr long mask = 0x03;
+
+  long fib[4] = { 0, 1, 1, 2 };
+
+  for ( long i = 2; i <= n; ++i ) {
+    fib[ i & mask ] = fib[ ( i - 1 ) & mask ] + fib[ ( i - 2 ) & mask ];
+  }
+
+  return fib[ n & mask ];
+}
+
+}
+
+template< typename Space >
+struct TestFib
+{
+  typedef Kokkos::TaskScheduler< Space >  sched_type;
+  typedef Kokkos::Future< long, Space >   future_type;
+  typedef long                            value_type;
+
+  sched_type  sched;
+  future_type fib_m1;
+  future_type fib_m2;
+  const value_type n;
+
+  KOKKOS_INLINE_FUNCTION
+  TestFib( const sched_type & arg_sched, const value_type arg_n )
+    : sched( arg_sched ), fib_m1(), fib_m2(), n( arg_n ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename sched_type::member_type &, value_type & result )
+  {
+#if 0
+    printf( "\nTestFib(%ld) %d %d\n", n, int( !fib_m1.is_null() ), int( !fib_m2.is_null() ) );
+#endif
+
+    if ( n < 2 ) {
+      result = n;
+    }
+    else if ( !fib_m2.is_null() && !fib_m1.is_null() ) {
+      result = fib_m1.get() + fib_m2.get();
+    }
+    else {
+      // Spawn new children and respawn myself to sum their results.
+      // Spawn lower value at higher priority as it has a shorter
+      // path to completion.
+
+      fib_m2 = Kokkos::task_spawn( Kokkos::TaskSingle( sched, Kokkos::TaskPriority::High )
+                                 , TestFib( sched, n - 2 ) );
+
+      fib_m1 = Kokkos::task_spawn( Kokkos::TaskSingle( sched )
+                                 , TestFib( sched, n - 1 ) );
+
+      Kokkos::Future< Space > dep[] = { fib_m1, fib_m2 };
+      Kokkos::Future< Space > fib_all = Kokkos::when_all( dep, 2 );
+
+      if ( !fib_m2.is_null() && !fib_m1.is_null() && !fib_all.is_null() ) {
+        // High priority to retire this branch.
+        Kokkos::respawn( this, fib_all, Kokkos::TaskPriority::High );
+      }
+      else {
+#if 1
+        printf( "TestFib(%ld) insufficient memory alloc_capacity(%d) task_max(%d) task_accum(%ld)\n"
+               , n
+               , sched.allocation_capacity()
+               , sched.allocated_task_count_max()
+               , sched.allocated_task_count_accum()
+               );
+#endif
+
+        Kokkos::abort( "TestFib insufficient memory" );
+
+      }
+    }
+  }
+
+  static void run( int i, size_t MemoryCapacity = 16000 )
+  {
+    typedef typename sched_type::memory_space memory_space;
+
+    enum { MinBlockSize   =   64 };
+    enum { MaxBlockSize   = 1024 };
+    enum { SuperBlockSize = 4096 };
+
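+    // Clamp the block sizes to the requested capacity so very small test capacities still form a valid memory pool.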
+    sched_type root_sched( memory_space()
+                         , MemoryCapacity
+                         , MinBlockSize
+                         , std::min(size_t(MaxBlockSize),MemoryCapacity)
+                         , std::min(size_t(SuperBlockSize),MemoryCapacity) );
+
+    future_type f = Kokkos::host_spawn( Kokkos::TaskSingle( root_sched )
+                                      , TestFib( root_sched, i ) );
+
+    Kokkos::wait( root_sched );
+
+    ASSERT_EQ( eval_fib( i ), f.get() );
+
+#if 0
+    fprintf( stdout, "\nTestFib::run(%d) spawn_size(%d) when_all_size(%d) alloc_capacity(%d) task_max(%d) task_accum(%ld)\n"
+           , i
+           , int(root_sched.template spawn_allocation_size<TestFib>())
+           , int(root_sched.when_all_allocation_size(2))
+           , root_sched.allocation_capacity()
+           , root_sched.allocated_task_count_max()
+           , root_sched.allocated_task_count_accum()
+           );
+    fflush( stdout );
+#endif
+  }
+};
+
+} // namespace TestTaskScheduler
+
+//----------------------------------------------------------------------------
+
+namespace TestTaskScheduler {
+
+template< class Space >
+struct TestTaskSpawn {
+  typedef Kokkos::TaskScheduler< Space >  sched_type;
+  typedef Kokkos::Future< Space >         future_type;
+  typedef void                            value_type;
+
+  sched_type   m_sched ;
+  future_type  m_future ;
+
+  KOKKOS_INLINE_FUNCTION
+  TestTaskSpawn( const sched_type & arg_sched
+               , const future_type & arg_future
+               )
+    : m_sched( arg_sched )
+    , m_future( arg_future )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename sched_type::member_type & )
+  {
+    if ( ! m_future.is_null() ) {
+      Kokkos::task_spawn( Kokkos::TaskSingle( m_sched ) , TestTaskSpawn( m_sched , future_type() ) );
+    }
+  }
+
+  static void run()
+  {
+    typedef typename sched_type::memory_space memory_space;
+
+    enum { MemoryCapacity = 16000 };
+    enum { MinBlockSize   =   64 };
+    enum { MaxBlockSize   = 1024 };
+    enum { SuperBlockSize = 4096 };
+
+    sched_type sched( memory_space()
+                    , MemoryCapacity
+                    , MinBlockSize
+                    , MaxBlockSize
+                    , SuperBlockSize );
+
+    auto f = Kokkos::host_spawn( Kokkos::TaskSingle( sched ), TestTaskSpawn( sched, future_type() ) );
+    Kokkos::host_spawn( Kokkos::TaskSingle( f ), TestTaskSpawn( sched, f ) );
+
+    Kokkos::wait( sched );
+  }
+};
+
+template< class Space >
+struct TestTaskDependence {
+  typedef Kokkos::TaskScheduler< Space >  sched_type;
+  typedef Kokkos::Future< Space >         future_type;
+  typedef Kokkos::View< long, Space >     accum_type;
+  typedef void                            value_type;
+
+  sched_type  m_sched;
+  accum_type  m_accum;
+  long        m_count;
+
+  KOKKOS_INLINE_FUNCTION
+  TestTaskDependence( long n
+                    , const sched_type & arg_sched
+                    , const accum_type & arg_accum )
+    : m_sched( arg_sched )
+    , m_accum( arg_accum )
+    , m_count( n ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename sched_type::member_type & )
+  {
+    enum { CHUNK = 8 };
+    const int n = CHUNK < m_count ? CHUNK : m_count;
+
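+    // Fan out into at most CHUNK child tasks covering sub-ranges and respawn this task to wait on all of them;
+    // leaves holding a single element increment the accumulator once, so the final count equals the original n.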
+    if ( 1 < m_count ) {
+
+      const int increment = ( m_count + n - 1 ) / n;
+
+      future_type f =
+        m_sched.when_all( n , [this,increment]( int i ) {
+          const long inc   = increment ;
+          const long begin = i * inc ;
+          const long count = begin + inc < m_count ? inc : m_count - begin ;
+
+          return Kokkos::task_spawn
+            ( Kokkos::TaskSingle( m_sched )
+            , TestTaskDependence( count, m_sched, m_accum ) );
+        });
+
+      m_count = 0;
+
+      Kokkos::respawn( this, f );
+    }
+    else if ( 1 == m_count ) {
+      Kokkos::atomic_increment( & m_accum() );
+    }
+  }
+
+  static void run( int n )
+  {
+    typedef typename sched_type::memory_space memory_space;
+
+    enum { MemoryCapacity = 16000 };
+    enum { MinBlockSize   =   64 };
+    enum { MaxBlockSize   = 1024 };
+    enum { SuperBlockSize = 4096 };
+
+    sched_type sched( memory_space()
+                    , MemoryCapacity
+                    , MinBlockSize
+                    , MaxBlockSize
+                    , SuperBlockSize );
+
+    accum_type accum( "accum" );
+
+    typename accum_type::HostMirror host_accum = Kokkos::create_mirror_view( accum );
+
+    Kokkos::host_spawn( Kokkos::TaskSingle( sched ), TestTaskDependence( n, sched, accum ) );
+
+    Kokkos::wait( sched );
+
+    Kokkos::deep_copy( host_accum, accum );
+
+    ASSERT_EQ( host_accum(), n );
+  }
+};
+
+} // namespace TestTaskScheduler
+
+//----------------------------------------------------------------------------
+
+namespace TestTaskScheduler {
+
+template< class ExecSpace >
+struct TestTaskTeam {
+  //enum { SPAN = 8 };
+  enum { SPAN = 33 };
+  //enum { SPAN = 1 };
+
+  typedef void                                value_type;
+  typedef Kokkos::TaskScheduler< ExecSpace >  sched_type;
+  typedef Kokkos::Future< ExecSpace >         future_type;
+  typedef Kokkos::View< long*, ExecSpace >    view_type;
+
+  sched_type   sched;
+  future_type  future;
+
+  view_type   parfor_result;
+  view_type   parreduce_check;
+  view_type   parscan_result;
+  view_type   parscan_check;
+  const long  nvalue;
+
+  KOKKOS_INLINE_FUNCTION
+  TestTaskTeam( const sched_type & arg_sched
+              , const view_type  & arg_parfor_result
+              , const view_type  & arg_parreduce_check
+              , const view_type  & arg_parscan_result
+              , const view_type  & arg_parscan_check
+              , const long         arg_nvalue )
+    : sched( arg_sched )
+    , future()
+    , parfor_result( arg_parfor_result )
+    , parreduce_check( arg_parreduce_check )
+    , parscan_result( arg_parscan_result )
+    , parscan_check( arg_parscan_check )
+    , nvalue( arg_nvalue ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename sched_type::member_type & member )
+  {
+    const long end   = nvalue + 1;
+    const long begin = 0 < end - SPAN ? end - SPAN : 0;
+
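+    // This task handles the last SPAN entries [begin, end); if anything precedes them, rank 0 spawns a task for the
+    // remaining prefix and respawns this task as its dependent, chaining the work back to index 0.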
+    if ( 0 < begin && future.is_null() ) {
+      if ( member.team_rank() == 0 ) {
+        future = Kokkos::task_spawn( Kokkos::TaskTeam( sched )
+                                   , TestTaskTeam( sched
+                                                 , parfor_result
+                                                 , parreduce_check
+                                                 , parscan_result
+                                                 , parscan_check
+                                                 , begin - 1 )
+                                   );
+
+        #ifndef __HCC_ACCELERATOR__
+        assert( !future.is_null() );
+        #endif
+
+        Kokkos::respawn( this, future );
+      }
+
+      return;
+    }
+
+    Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end )
+                        , [&] ( int i ) { parfor_result[i] = i; }
+                        );
+
+    // Test parallel_reduce without join.
+
+    long tot = 0;
+    long expected = ( begin + end - 1 ) * ( end - begin ) * 0.5;
+
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member, begin, end )
+                           , [&] ( int i, long & res ) { res += parfor_result[i]; }
+                           , tot
+                           );
+
+    Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end )
+                        , [&] ( int i ) { parreduce_check[i] = expected - tot; }
+                        );
+
+    // Test parallel_reduce with join.
+
+    tot = 0;
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member, begin, end )
+                           , [&] ( int i, long & res ) { res += parfor_result[i]; }
+                           , Kokkos::Experimental::Sum<long>( tot )
+                           );
+
+    Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end )
+                        , [&] ( int i ) { parreduce_check[i] += expected - tot; }
+                        );
+
+    // Test parallel_scan.
+
+    // Exclusive scan.
+    Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange( member, begin, end )
+                               , [&] ( int i, long & val, const bool final )
+    {
+      if ( final ) { parscan_result[i] = val; }
+
+      val += i;
+    });
+
+    // Wait for 'parscan_result' before testing it.
+    member.team_barrier();
+
+    if ( member.team_rank() == 0 ) {
+      for ( long i = begin; i < end; ++i ) {
+        parscan_check[i] = ( i * ( i - 1 ) - begin * ( begin - 1 ) ) * 0.5 - parscan_result[i];
+      }
+    }
+
+    // Don't overwrite 'parscan_result' until it has been tested.
+    member.team_barrier();
+
+    // Inclusive scan.
+    Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange( member, begin, end )
+                               , [&] ( int i, long & val, const bool final )
+    {
+      val += i;
+
+      if ( final ) { parscan_result[i] = val; }
+    });
+
+    // Wait for 'parscan_result' before testing it.
+    member.team_barrier();
+
+    if ( member.team_rank() == 0 ) {
+      for ( long i = begin; i < end; ++i ) {
+        parscan_check[i] += ( i * ( i + 1 ) - begin * ( begin - 1 ) ) * 0.5 - parscan_result[i];
+      }
+    }
+
+    // ThreadVectorRange check.
+/*
+    long result = 0;
+    expected = ( begin + end - 1 ) * ( end - begin ) * 0.5;
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member, 0, 1 )
+                           , [&] ( const int i, long & outerUpdate )
+    {
+      long sum_j = 0.0;
+
+      Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( member, end - begin )
+                             , [&] ( const int j, long & innerUpdate )
+      {
+        innerUpdate += begin + j;
+      }, sum_j );
+
+      outerUpdate += sum_j;
+    }, result );
+
+    Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end )
+                        , [&] ( int i )
+    {
+      parreduce_check[i] += result - expected;
+    });
+*/
+
+  }
+
+  static void run( long n )
+  {
+    const unsigned memory_capacity = 400000;
+
+    enum { MinBlockSize   =   64 };
+    enum { MaxBlockSize   = 1024 };
+    enum { SuperBlockSize = 4096 };
+
+    sched_type root_sched( typename sched_type::memory_space()
+                         , memory_capacity
+                         , MinBlockSize
+                         , MaxBlockSize
+                         , SuperBlockSize );
+
+    view_type root_parfor_result( "parfor_result", n + 1 );
+    view_type root_parreduce_check( "parreduce_check", n + 1 );
+    view_type root_parscan_result( "parscan_result", n + 1 );
+    view_type root_parscan_check( "parscan_check", n + 1 );
+
+    typename view_type::HostMirror
+      host_parfor_result = Kokkos::create_mirror_view( root_parfor_result );
+    typename view_type::HostMirror
+      host_parreduce_check = Kokkos::create_mirror_view( root_parreduce_check );
+    typename view_type::HostMirror
+      host_parscan_result = Kokkos::create_mirror_view( root_parscan_result );
+    typename view_type::HostMirror
+      host_parscan_check = Kokkos::create_mirror_view( root_parscan_check );
+
+    future_type f = Kokkos::host_spawn( Kokkos::TaskTeam( root_sched )
+                                      , TestTaskTeam( root_sched
+                                                    , root_parfor_result
+                                                    , root_parreduce_check
+                                                    , root_parscan_result
+                                                    , root_parscan_check
+                                                    , n )
+                                      );
+
+    Kokkos::wait( root_sched );
+
+    Kokkos::deep_copy( host_parfor_result, root_parfor_result );
+    Kokkos::deep_copy( host_parreduce_check, root_parreduce_check );
+    Kokkos::deep_copy( host_parscan_result, root_parscan_result );
+    Kokkos::deep_copy( host_parscan_check, root_parscan_check );
+
+    long error_count = 0 ;
+
+    for ( long i = 0; i <= n; ++i ) {
+      const long answer = i;
+
+      if ( host_parfor_result( i ) != answer ) {
+        ++error_count ;
+        std::cerr << "TestTaskTeam::run ERROR parallel_for result(" << i << ") = "
+                  << host_parfor_result( i ) << " != " << answer << std::endl;
+      }
+
+      if ( host_parreduce_check( i ) != 0 ) {
+        ++error_count ;
+        std::cerr << "TestTaskTeam::run ERROR parallel_reduce check(" << i << ") = "
+                  << host_parreduce_check( i ) << " != 0" << std::endl;
+      }
+
+      if ( host_parscan_check( i ) != 0 ) {
+        ++error_count ;
+        std::cerr << "TestTaskTeam::run ERROR parallel_scan check(" << i << ") = "
+                  << host_parscan_check( i ) << " != 0" << std::endl;
+      }
+    }
+
+    ASSERT_EQ( 0L , error_count );
+  }
+};
+
+template< class ExecSpace >
+struct TestTaskTeamValue {
+  enum { SPAN = 8 };
+
+  typedef long                                     value_type;
+  typedef Kokkos::TaskScheduler< ExecSpace >       sched_type;
+  typedef Kokkos::Future< value_type, ExecSpace >  future_type;
+  typedef Kokkos::View< long*, ExecSpace >         view_type;
+
+  sched_type   sched;
+  future_type  future;
+
+  view_type   result;
+  const long  nvalue;
+
+  KOKKOS_INLINE_FUNCTION
+  TestTaskTeamValue( const sched_type & arg_sched
+                   , const view_type  & arg_result
+                   , const long         arg_nvalue )
+    : sched( arg_sched )
+    , future()
+    , result( arg_result )
+    , nvalue( arg_nvalue ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename sched_type::member_type const & member
+                 , value_type & final )
+  {
+    const long end   = nvalue + 1;
+    const long begin = 0 < end - SPAN ? end - SPAN : 0;
+
+    if ( 0 < begin && future.is_null() ) {
+      if ( member.team_rank() == 0 ) {
+        future = sched.task_spawn( TestTaskTeamValue( sched, result, begin - 1 )
+                                 , Kokkos::TaskTeam );
+
+        assert( !future.is_null() );
+
+        sched.respawn( this , future );
+      }
+
+      return;
+    }
+
+    Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end )
+                        , [&] ( int i ) { result[i] = i + 1; }
+                        );
+
+    if ( member.team_rank() == 0 ) {
+      final = result[nvalue];
+    }
+
+    Kokkos::memory_fence();
+  }
+
+  static void run( long n )
+  {
+    const unsigned memory_capacity = 100000;
+
+    enum { MinBlockSize   =   64 };
+    enum { MaxBlockSize   = 1024 };
+    enum { SuperBlockSize = 4096 };
+
+    sched_type root_sched( typename sched_type::memory_space()
+                         , memory_capacity
+                         , MinBlockSize
+                         , MaxBlockSize
+                         , SuperBlockSize );
+
+    view_type root_result( "result", n + 1 );
+
+    typename view_type::HostMirror host_result = Kokkos::create_mirror_view( root_result );
+
+    future_type fv = root_sched.host_spawn( TestTaskTeamValue( root_sched, root_result, n )
+                                          , Kokkos::TaskTeam );
+
+    Kokkos::wait( root_sched );
+
+    Kokkos::deep_copy( host_result, root_result );
+
+    if ( fv.get() != n + 1 ) {
+      std::cerr << "TestTaskTeamValue ERROR future = "
+                << fv.get() << " != " << n + 1 << std::endl;
+    }
+
+    for ( long i = 0; i <= n; ++i ) {
+      const long answer = i + 1;
+
+      if ( host_result( i ) != answer ) {
+        std::cerr << "TestTaskTeamValue ERROR result(" << i << ") = "
+                  << host_result( i ) << " != " << answer << std::endl;
+      }
+    }
+  }
+};
+
+} // namespace TestTaskScheduler
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, task_fib )
+{
+  const int N = 27 ;
+  for ( int i = 0; i < N; ++i ) {
+    TestTaskScheduler::TestFib< TEST_EXECSPACE >::run( i , ( i + 1 ) * ( i + 1 ) * 2000 );
+  }
+}
+
+TEST_F( TEST_CATEGORY, task_depend )
+{
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestTaskDependence< TEST_EXECSPACE >::run( i );
+  }
+}
+
+TEST_F( TEST_CATEGORY, task_team )
+{
+  TestTaskScheduler::TestTaskTeam< TEST_EXECSPACE >::run( 1000 );
+  //TestTaskScheduler::TestTaskTeamValue< TEST_EXECSPACE >::run( 1000 ); // Put back after testing.
+}
+
+}
+
+#endif // #if defined( KOKKOS_ENABLE_TASKDAG )
+#endif // #ifndef KOKKOS_UNITTEST_TASKSCHEDULER_HPP
+
diff --git a/packages/kokkos/core/unit_test/TestTeam.hpp b/packages/kokkos/core/unit_test/TestTeam.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..51f70737ed4fc347b9dd16c1272450c5a61e6aa5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestTeam.hpp
@@ -0,0 +1,972 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+namespace {
+
+template< class ExecSpace, class ScheduleType >
+struct TestTeamPolicy {
+  typedef typename Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::member_type team_member;
+  typedef Kokkos::View< int**, ExecSpace > view_type;
+
+  view_type m_flags;
+
+  TestTeamPolicy( const size_t league_size )
+    : m_flags( Kokkos::ViewAllocateWithoutInitializing( "flags" ),
+               Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( *this ),
+               league_size ) {}
+
+  struct VerifyInitTag {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const team_member & member ) const
+  {
+    const int tid = member.team_rank() + member.team_size() * member.league_rank();
+
+    m_flags( member.team_rank(), member.league_rank() ) = tid;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const VerifyInitTag &, const team_member & member ) const
+  {
+    const int tid = member.team_rank() + member.team_size() * member.league_rank();
+
+    if ( tid != m_flags( member.team_rank(), member.league_rank() ) ) {
+      printf( "TestTeamPolicy member(%d,%d) error %d != %d\n",
+               member.league_rank(), member.team_rank(),
+               tid, m_flags( member.team_rank(), member.league_rank() ) );
+    }
+  }
+
+  // Included for test_small_league_size.
+  TestTeamPolicy() : m_flags() {}
+
+  // Included for test_small_league_size.
+  struct NoOpTag {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const NoOpTag &, const team_member & member ) const {}
+
+
+  static void test_small_league_size() {
+    int bs = 8; // batch size (number of elements per batch)
+    int ns = 16; // total number of "problems" to process
+
+    // Calculate total scratch memory space size.
+    const int level = 0;
+    int mem_size = 960;
+    const int num_teams = ns / bs;
+    const Kokkos::TeamPolicy< ExecSpace, NoOpTag > policy( num_teams, Kokkos::AUTO() );
+
+    Kokkos::parallel_for( policy.set_scratch_size( level, Kokkos::PerTeam( mem_size ), Kokkos::PerThread( 0 ) ),
+                          TestTeamPolicy() );
+  }
+
+  static void test_for( const size_t league_size )
+  {
+    TestTeamPolicy functor( league_size );
+
+    const int team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( functor );
+
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType,  ExecSpace >( league_size, team_size ), functor );
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType,  ExecSpace, VerifyInitTag >( league_size, team_size ), functor );
+
+    test_small_league_size();
+  }
+
+  struct ReduceTag {};
+
+  typedef long value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const team_member & member, value_type & update ) const
+  {
+    update += member.team_rank() + member.team_size() * member.league_rank();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const ReduceTag &, const team_member & member, value_type & update ) const
+  {
+    update += 1 + member.team_rank() + member.team_size() * member.league_rank();
+  }
+
+  static void test_reduce( const size_t league_size )
+  {
+    TestTeamPolicy functor( league_size );
+
+    const int team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( functor );
+    const long N = team_size * league_size;
+
+    long total = 0;
+
+    Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType, ExecSpace >( league_size, team_size ), functor, total );
+    ASSERT_EQ( size_t( ( N - 1 ) * ( N ) ) / 2, size_t( total ) );
+
+    Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType, ExecSpace, ReduceTag >( league_size, team_size ), functor, total );
+    ASSERT_EQ( ( size_t( N ) * size_t( N + 1 ) ) / 2, size_t( total ) );
+  }
+};
+
+} // namespace
+
+} // namespace Test
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< typename ScalarType, class DeviceType, class ScheduleType >
+class ReduceTeamFunctor
+{
+public:
+  typedef DeviceType                                           execution_space;
+  typedef Kokkos::TeamPolicy< ScheduleType, execution_space >  policy_type;
+  typedef typename execution_space::size_type                  size_type;
+
+  struct value_type {
+    ScalarType value[3];
+  };
+
+  const size_type nwork;
+
+  ReduceTeamFunctor( const size_type & arg_nwork ) : nwork( arg_nwork ) {}
+
+  ReduceTeamFunctor( const ReduceTeamFunctor & rhs ) : nwork( rhs.nwork ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & dst ) const
+  {
+    dst.value[0] = 0;
+    dst.value[1] = 0;
+    dst.value[2] = 0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, const volatile value_type & src ) const
+  {
+    dst.value[0] += src.value[0];
+    dst.value[1] += src.value[1];
+    dst.value[2] += src.value[2];
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const typename policy_type::member_type ind, value_type & dst ) const
+  {
+    const int thread_rank = ind.team_rank() + ind.team_size() * ind.league_rank();
+    const int thread_size = ind.team_size() * ind.league_size();
+    const int chunk = ( nwork + thread_size - 1 ) / thread_size;
+
+    size_type iwork = chunk * thread_rank;
+    const size_type iwork_end = iwork + chunk < nwork ? iwork + chunk : nwork;
+
+    for ( ; iwork < iwork_end; ++iwork ) {
+      dst.value[0] += 1;
+      dst.value[1] += iwork + 1;
+      dst.value[2] += nwork - iwork;
+    }
+  }
+};
+
+} // namespace Test
+
+namespace {
+
+template< typename ScalarType, class DeviceType, class ScheduleType >
+class TestReduceTeam
+{
+public:
+  typedef DeviceType                                            execution_space;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type;
+  typedef typename execution_space::size_type                   size_type;
+
+  TestReduceTeam( const size_type & nwork ) { run_test( nwork ); }
+
+  void run_test( const size_type & nwork )
+  {
+    typedef Test::ReduceTeamFunctor< ScalarType, execution_space, ScheduleType> functor_type;
+    typedef typename functor_type::value_type value_type;
+    typedef Kokkos::View< value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    value_type result[ Repeat ];
+
+    const unsigned long nw   = nwork;
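+    // nsum = 1 + 2 + ... + nw; the even factor is halved before multiplying so intermediates stay integral.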
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
+
+    const unsigned team_size   = policy_type::team_size_recommended( functor_type( nwork ) );
+    const unsigned league_size = ( nwork + team_size - 1 ) / team_size;
+
+    policy_type team_exec( league_size, team_size );
+
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      result_type tmp( & result[i] );
+      Kokkos::parallel_reduce( team_exec, functor_type( nwork ), tmp );
+    }
+
+    execution_space::fence();
+
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( (ScalarType) correct, result[i].value[j] );
+      }
+    }
+  }
+};
+
+} // namespace
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< class DeviceType, class ScheduleType >
+class ScanTeamFunctor
+{
+public:
+  typedef DeviceType                                            execution_space;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type;
+  typedef long int                                              value_type;
+
+  Kokkos::View< value_type, execution_space > accum;
+  Kokkos::View< value_type, execution_space > total;
+
+  ScanTeamFunctor() : accum( "accum" ), total( "total" ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & error ) const { error = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( value_type volatile & error, value_type volatile const & input ) const
+  { if ( input ) error = 1; }
+
+  struct JoinMax {
+    typedef long int value_type;
+
+    KOKKOS_INLINE_FUNCTION
+    void join( value_type volatile & dst, value_type volatile const & input ) const
+    { if ( dst < input ) dst = input; }
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const typename policy_type::member_type ind, value_type & error ) const
+  {
+    if ( 0 == ind.league_rank() && 0 == ind.team_rank() ) {
+      const long int thread_count = ind.league_size() * ind.team_size();
+      total() = ( thread_count * ( thread_count + 1 ) ) / 2;
+    }
+
+    // Team max:
+    long int m = (long int) ( ind.league_rank() + ind.team_rank() );
+    ind.team_reduce( Kokkos::Experimental::Max<long int>( m ) );
+
+    if ( m != ind.league_rank() + ( ind.team_size() - 1 ) ) {
+      printf( "ScanTeamFunctor[%d.%d of %d.%d] reduce_max_answer(%ld) != reduce_max(%ld)\n",
+               ind.league_rank(), ind.team_rank(),
+               ind.league_size(), ind.team_size(),
+               (long int) ( ind.league_rank() + ( ind.team_size() - 1 ) ), m );
+    }
+
+    // Scan:
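+    // Each thread contributes ( league_rank + 1 ) + ( team_rank + 1 ); 'answer' is the exclusive prefix sum of those
+    // contributions over the lower-ranked threads of the team.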
+    const long int answer =
+      ( ind.league_rank() + 1 ) * ind.team_rank() + ( ind.team_rank() * ( ind.team_rank() + 1 ) ) / 2;
+
+    const long int result =
+      ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );
+
+    const long int result2 =
+      ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );
+
+    if ( answer != result || answer != result2 ) {
+      printf( "ScanTeamFunctor[%d.%d of %d.%d] answer(%ld) != scan_first(%ld) or scan_second(%ld)\n",
+              ind.league_rank(), ind.team_rank(),
+              ind.league_size(), ind.team_size(),
+              answer, result, result2 );
+
+      error = 1;
+    }
+
+    const long int thread_rank = ind.team_rank() +
+                                 ind.team_size() * ind.league_rank();
+    ind.team_scan( 1 + thread_rank, accum.data() );
+  }
+};
+
+template< class DeviceType, class ScheduleType >
+class TestScanTeam
+{
+public:
+  typedef DeviceType                                            execution_space;
+  typedef long int                                              value_type;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type;
+  typedef Test::ScanTeamFunctor<DeviceType, ScheduleType>       functor_type;
+
+  TestScanTeam( const size_t nteam ) { run_test( nteam ); }
+
+  void run_test( const size_t nteam )
+  {
+    typedef Kokkos::View< long int, Kokkos::HostSpace, Kokkos::MemoryUnmanaged >  result_type;
+
+    const unsigned REPEAT = 100000;
+    unsigned Repeat;
+
+    if ( nteam == 0 ) {
+      Repeat = 1;
+    }
+    else {
+      Repeat = ( REPEAT + nteam - 1 ) / nteam; // Error here.
+    }
+
+    functor_type functor;
+
+    policy_type team_exec( nteam, policy_type::team_size_max( functor ) );
+
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      long int accum = 0;
+      long int total = 0;
+      long int error = 0;
+      Kokkos::deep_copy( functor.accum, total );
+
+      Kokkos::parallel_reduce( team_exec, functor, result_type( & error ) );
+      DeviceType::fence();
+
+      Kokkos::deep_copy( accum, functor.accum );
+      Kokkos::deep_copy( total, functor.total );
+
+      ASSERT_EQ( error, 0 );
+      ASSERT_EQ( total, accum );
+    }
+
+    execution_space::fence();
+  }
+};
+
+} // namespace Test
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< class ExecSpace, class ScheduleType >
+struct SharedTeamFunctor {
+
+  typedef ExecSpace                                             execution_space;
+  typedef int                                                   value_type;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type;
+
+  enum { SHARED_COUNT = 1000 };
+
+  typedef typename ExecSpace::scratch_memory_space  shmem_space;
+
+  // TBD: MemoryUnmanaged should be the default for shared memory space.
+  typedef Kokkos::View< int*, shmem_space, Kokkos::MemoryUnmanaged > shared_int_array_type;
+
+  // Tell how much shared memory will be required by this functor.
+  inline
+  unsigned team_shmem_size( int team_size ) const
+  {
+    return shared_int_array_type::shmem_size( SHARED_COUNT ) +
+           shared_int_array_type::shmem_size( SHARED_COUNT );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const typename policy_type::member_type & ind, value_type & update ) const
+  {
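+    // Carve two arrays out of the team's shared memory; each thread fills an
+    // interleaved portion, and after the barrier the last team member verifies
+    // the contents of both arrays.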
+    const shared_int_array_type shared_A( ind.team_shmem(), SHARED_COUNT );
+    const shared_int_array_type shared_B( ind.team_shmem(), SHARED_COUNT );
+
+    if ( ( shared_A.data() == nullptr && SHARED_COUNT > 0 ) ||
+         ( shared_B.data() == nullptr && SHARED_COUNT > 0 ) )
+    {
+      printf ("member( %d/%d , %d/%d ) Failed to allocate shared memory of size %lu\n"
+             , ind.league_rank()
+             , ind.league_size()
+             , ind.team_rank()
+             , ind.team_size()
+             , static_cast<unsigned long>( SHARED_COUNT )
+             );
+
+      ++update; // Failure to allocate is an error.
+    }
+    else {
+      for ( int i = ind.team_rank(); i < SHARED_COUNT; i += ind.team_size() ) {
+        shared_A[i] = i + ind.league_rank();
+        shared_B[i] = 2 * i + ind.league_rank();
+      }
+
+      ind.team_barrier();
+
+      if ( ind.team_rank() + 1 == ind.team_size() ) {
+        for ( int i = 0; i < SHARED_COUNT; ++i ) {
+          if ( shared_A[i] != i + ind.league_rank() ) {
+            ++update;
+          }
+
+          if ( shared_B[i] != 2 * i + ind.league_rank() ) {
+            ++update;
+          }
+        }
+      }
+    }
+  }
+};
+
+} // namespace Test
+
+namespace {
+
+template< class ExecSpace, class ScheduleType >
+struct TestSharedTeam {
+  TestSharedTeam() { run(); }
+
+  void run()
+  {
+    typedef Test::SharedTeamFunctor<ExecSpace, ScheduleType> Functor;
+    typedef Kokkos::View< typename Functor::value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type;
+
+    const size_t team_size = Kokkos::TeamPolicy< ScheduleType, ExecSpace >::team_size_max( Functor() );
+
+    Kokkos::TeamPolicy< ScheduleType, ExecSpace > team_exec( 8192 / team_size, team_size );
+
+    typename Functor::value_type error_count = 0;
+
+    Kokkos::parallel_reduce( team_exec, Functor(), result_type( & error_count ) );
+
+    ASSERT_EQ( error_count, 0 );
+  }
+};
+
+} // namespace
+
+namespace Test {
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+template< class MemorySpace, class ExecSpace, class ScheduleType >
+struct TestLambdaSharedTeam {
+  TestLambdaSharedTeam() { run(); }
+
+  void run()
+  {
+    typedef Test::SharedTeamFunctor< ExecSpace, ScheduleType > Functor;
+    //typedef Kokkos::View< typename Functor::value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type;
+    typedef Kokkos::View< typename Functor::value_type, MemorySpace, Kokkos::MemoryUnmanaged > result_type;
+
+    typedef typename ExecSpace::scratch_memory_space shmem_space;
+
+    // TBD: MemoryUnmanaged should be the default for shared memory space.
+    typedef Kokkos::View< int*, shmem_space, Kokkos::MemoryUnmanaged > shared_int_array_type;
+
+    const int SHARED_COUNT = 1000;
+    int team_size = 1;
+
+#ifdef KOKKOS_ENABLE_CUDA
+    if ( std::is_same< ExecSpace, Kokkos::Cuda >::value ) team_size = 128;
+#endif
+
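+    // A lambda cannot provide a team_shmem_size() hook, so the shared memory
+    // for both arrays is requested explicitly through set_scratch_size.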
+    Kokkos::TeamPolicy< ScheduleType,  ExecSpace > team_exec( 8192 / team_size, team_size );
+    team_exec = team_exec.set_scratch_size( 0, Kokkos::PerTeam( SHARED_COUNT * 2 * sizeof( int ) ) );
+
+    typename Functor::value_type error_count = 0;
+
+    Kokkos::parallel_reduce( team_exec, KOKKOS_LAMBDA
+        ( const typename Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::member_type & ind, int & update )
+    {
+      const shared_int_array_type shared_A( ind.team_shmem(), SHARED_COUNT );
+      const shared_int_array_type shared_B( ind.team_shmem(), SHARED_COUNT );
+
+      if ( ( shared_A.data () == nullptr && SHARED_COUNT > 0 ) ||
+           ( shared_B.data () == nullptr && SHARED_COUNT > 0 ) )
+      {
+        printf( "Failed to allocate shared memory of size %lu\n",
+                static_cast<unsigned long>( SHARED_COUNT ) );
+
+        ++update; // Failure to allocate is an error.
+      }
+      else {
+        for ( int i = ind.team_rank(); i < SHARED_COUNT; i += ind.team_size() ) {
+          shared_A[i] = i + ind.league_rank();
+          shared_B[i] = 2 * i + ind.league_rank();
+        }
+
+        ind.team_barrier();
+
+        if ( ind.team_rank() + 1 == ind.team_size() ) {
+          for ( int i = 0; i < SHARED_COUNT; ++i ) {
+            if ( shared_A[i] != i + ind.league_rank() ) {
+              ++update;
+            }
+
+            if ( shared_B[i] != 2 * i + ind.league_rank() ) {
+              ++update;
+            }
+          }
+        }
+      }
+    }, result_type( & error_count ) );
+
+    ASSERT_EQ( error_count, 0 );
+  }
+};
+#endif
+#endif
+
+} // namespace Test
+
+namespace Test {
+
+template< class ExecSpace, class ScheduleType >
+struct ScratchTeamFunctor {
+
+  typedef ExecSpace                                            execution_space;
+  typedef int                                                  value_type;
+  typedef Kokkos::TeamPolicy< ScheduleType, execution_space >  policy_type;
+
+  enum { SHARED_TEAM_COUNT = 100 };
+  enum { SHARED_THREAD_COUNT = 10 };
+
+  typedef typename ExecSpace::scratch_memory_space shmem_space;
+
+  // TBD: MemoryUnmanaged should be the default for shared memory space.
+  typedef Kokkos::View< size_t*, shmem_space, Kokkos::MemoryUnmanaged > shared_int_array_type;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const typename policy_type::member_type & ind, value_type & update ) const
+  {
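+    // scratch_ptr records each thread's scratch_A/scratch_B base addresses so
+    // the checks below can verify that team scratch is shared by all threads
+    // and that the per-thread allocations are evenly strided.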
+    const shared_int_array_type scratch_ptr( ind.team_scratch( 1 ), 3 * ind.team_size() );
+    const shared_int_array_type scratch_A( ind.team_scratch( 1 ), SHARED_TEAM_COUNT );
+    const shared_int_array_type scratch_B( ind.thread_scratch( 1 ), SHARED_THREAD_COUNT );
+
+    if ( ( scratch_ptr.data() == nullptr ) ||
+         ( scratch_A.  data() == nullptr && SHARED_TEAM_COUNT > 0 ) ||
+         ( scratch_B.  data() == nullptr && SHARED_THREAD_COUNT > 0 ) )
+    {
+      printf( "Failed to allocate shared memory of size %lu\n",
+              static_cast<unsigned long>( SHARED_TEAM_COUNT ) );
+
+      ++update; // Failure to allocate is an error.
+    }
+    else {
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( ind, 0, (int) SHARED_TEAM_COUNT ), [&] ( const int & i ) {
+        scratch_A[i] = i + ind.league_rank();
+      });
+
+      for ( int i = 0; i < SHARED_THREAD_COUNT; i++ ) {
+        scratch_B[i] = 10000 * ind.league_rank() + 100 * ind.team_rank() + i;
+      }
+
+      scratch_ptr[ind.team_rank()] = (size_t) scratch_A.data();
+      scratch_ptr[ind.team_rank() + ind.team_size()] = (size_t) scratch_B.data();
+
+      ind.team_barrier();
+
+      for ( int i = 0; i < SHARED_TEAM_COUNT; i++ ) {
+        if ( scratch_A[i] != size_t( i + ind.league_rank() ) ) ++update;
+      }
+
+      for ( int i = 0; i < ind.team_size(); i++ ) {
+        if ( scratch_ptr[0] != scratch_ptr[i] ) ++update;
+      }
+
+      if ( scratch_ptr[1 + ind.team_size()] - scratch_ptr[0 + ind.team_size()] < SHARED_THREAD_COUNT * sizeof( size_t ) ) {
+        ++update;
+      }
+
+      for ( int i = 1; i < ind.team_size(); i++ ) {
+        if ( ( scratch_ptr[i + ind.team_size()] - scratch_ptr[i - 1 + ind.team_size()] ) !=
+             ( scratch_ptr[1 + ind.team_size()] - scratch_ptr[0 + ind.team_size()] ) )
+        {
+          ++update;
+        }
+      }
+    }
+  }
+};
+
+} // namespace Test
+
+namespace {
+
+template< class ExecSpace, class ScheduleType >
+struct TestScratchTeam {
+  TestScratchTeam() { run(); }
+
+  void run()
+  {
+    typedef Test::ScratchTeamFunctor<ExecSpace, ScheduleType> Functor;
+    typedef Kokkos::View< typename Functor::value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged >  result_type;
+    typedef Kokkos::TeamPolicy< ScheduleType,  ExecSpace > p_type;
+
+    const size_t team_size = p_type::team_size_max( Functor() );
+
+    p_type team_exec( 8192 / team_size, team_size );
+
+    typename Functor::value_type error_count = 0;
+
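+    // Level-1 team scratch must hold scratch_A plus the pointer table of
+    // 3 * team_size entries; level-1 thread scratch must hold scratch_B.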
+    int team_scratch_size   = Functor::shared_int_array_type::shmem_size( Functor::SHARED_TEAM_COUNT ) +
+                              Functor::shared_int_array_type::shmem_size( 3 * team_size );
+
+    int thread_scratch_size = Functor::shared_int_array_type::shmem_size( Functor::SHARED_THREAD_COUNT );
+
+    Kokkos::parallel_reduce( team_exec.set_scratch_size( 1, Kokkos::PerTeam( team_scratch_size ),
+                                                         Kokkos::PerThread( thread_scratch_size ) ),
+                             Functor(), result_type( & error_count ) );
+    ASSERT_EQ( error_count, 0 );
+
+    Kokkos::parallel_reduce( p_type( 8192 / team_size, team_size ,
+                                     Kokkos::ScratchRequest( 1, Kokkos::PerTeam( team_scratch_size ),
+                                                                Kokkos::PerThread( thread_scratch_size ))
+                                   ),
+                             Functor(), result_type( & error_count ) );
+    ASSERT_EQ( error_count, 0 );
+
+  }
+};
+
+} // namespace
+
+namespace Test {
+
+template< class ExecSpace >
+KOKKOS_INLINE_FUNCTION
+int test_team_mulit_level_scratch_loop_body( const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team ) {
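+  // Allocate three team-level and three thread-level views from each of the
+  // two scratch levels (level 0: small, level 1: large), fill them with
+  // distinct patterns, and return the number of mismatches found on re-reading.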
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_team1( team.team_scratch( 0 ), 128 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_thread1( team.thread_scratch( 0 ), 16 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_team2( team.team_scratch( 0 ), 128 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_thread2( team.thread_scratch( 0 ), 16 );
+
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_team1( team.team_scratch( 1 ), 128000 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_thread1( team.thread_scratch( 1 ), 16000 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_team2( team.team_scratch( 1 ), 128000 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_thread2( team.thread_scratch( 1 ), 16000 );
+
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_team3( team.team_scratch( 0 ), 128 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_thread3( team.thread_scratch( 0 ), 16 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_team3( team.team_scratch( 1 ), 128000 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_thread3( team.thread_scratch( 1 ), 16000 );
+
+  // The explicit types for 0 and 128 are here to test TeamThreadRange accepting different
+  // types for begin and end.
+  Kokkos::parallel_for( Kokkos::TeamThreadRange( team, int( 0 ), unsigned( 128 ) ), [&] ( const int & i )
+  {
+    a_team1( i ) = 1000000 + i + team.league_rank() * 100000;
+    a_team2( i ) = 2000000 + i + team.league_rank() * 100000;
+    a_team3( i ) = 3000000 + i + team.league_rank() * 100000;
+  });
+  team.team_barrier();
+
+  Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16 ), [&] ( const int & i )
+  {
+    a_thread1( i ) = 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
+    a_thread2( i ) = 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
+    a_thread3( i ) = 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
+  });
+
+  Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, 128000 ), [&] ( const int & i )
+  {
+    b_team1( i ) = 1000000 + i + team.league_rank() * 100000;
+    b_team2( i ) = 2000000 + i + team.league_rank() * 100000;
+    b_team3( i ) = 3000000 + i + team.league_rank() * 100000;
+  });
+  team.team_barrier();
+
+  Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16000 ), [&] ( const int & i )
+  {
+    b_thread1( i ) = 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
+    b_thread2( i ) = 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
+    b_thread3( i ) = 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
+  });
+
+  team.team_barrier();
+
+  int error = 0;
+  Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, 128 ), [&] ( const int & i )
+  {
+    if ( a_team1( i ) != 1000000 + i + team.league_rank() * 100000 ) error++;
+    if ( a_team2( i ) != 2000000 + i + team.league_rank() * 100000 ) error++;
+    if ( a_team3( i ) != 3000000 + i + team.league_rank() * 100000 ) error++;
+  });
+  team.team_barrier();
+
+  Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16 ), [&] ( const int & i )
+  {
+    if ( a_thread1( i ) != 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
+    if ( a_thread2( i ) != 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
+    if ( a_thread3( i ) != 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
+  });
+
+  Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, 128000 ), [&] ( const int & i )
+  {
+    if ( b_team1( i ) != 1000000 + i + team.league_rank() * 100000 ) error++;
+    if ( b_team2( i ) != 2000000 + i + team.league_rank() * 100000 ) error++;
+    if ( b_team3( i ) != 3000000 + i + team.league_rank() * 100000 ) error++;
+  });
+  team.team_barrier();
+
+  Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16000 ), [&] ( const int & i )
+  {
+    if ( b_thread1( i ) != 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
+    if ( b_thread2( i ) != 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
+    if ( b_thread3( i ) != 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
+  });
+
+  return error;
+}
+
+struct TagReduce {};
+struct TagFor {};
+
+template< class ExecSpace, class ScheduleType >
+struct ClassNoShmemSizeFunction {
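+  // Requests all scratch (levels 0 and 1) through set_scratch_size on the
+  // policy rather than through a team_shmem_size() member function.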
+  typedef typename Kokkos::TeamPolicy< ExecSpace, ScheduleType >::member_type member_type;
+
+  Kokkos::View< int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagFor &, const member_type & team ) const {
+    int error = test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
+    errors() += error;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const TagReduce &, const member_type & team, int & error ) const {
+    error += test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
+  }
+
+  void run() {
+    Kokkos::View< int, ExecSpace > d_errors = Kokkos::View< int, ExecSpace >( "Errors" );
+    errors = d_errors;
+
+    const int per_team0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128 );
+    const int per_thread0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16 );
+
+    const int per_team1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128000 );
+    const int per_thread1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16000 );
+
+    {
+      Kokkos::TeamPolicy< TagFor, ExecSpace, ScheduleType > policy( 10, 8, 16 );
+
+      Kokkos::parallel_for( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ), *this );
+      Kokkos::fence();
+
+      typename Kokkos::View< int, ExecSpace >::HostMirror h_errors = Kokkos::create_mirror_view( d_errors );
+      Kokkos::deep_copy( h_errors, d_errors );
+      ASSERT_EQ( h_errors(), 0 );
+    }
+
+    {
+      int error = 0;
+      Kokkos::TeamPolicy< TagReduce, ExecSpace, ScheduleType > policy( 10, 8, 16 );
+
+      Kokkos::parallel_reduce( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ), *this, error );
+      Kokkos::fence();
+
+      ASSERT_EQ( error, 0 );
+    }
+  };
+};
+
+template< class ExecSpace, class ScheduleType >
+struct ClassWithShmemSizeFunction {
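+  // Provides level-0 scratch through the team_shmem_size() member function
+  // below and only requests level-1 scratch through set_scratch_size.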
+  typedef typename Kokkos::TeamPolicy< ExecSpace, ScheduleType >::member_type member_type;
+
+  Kokkos::View< int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagFor &, const member_type & team ) const {
+    int error = test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
+    errors() += error;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const TagReduce &, const member_type & team, int & error ) const {
+    error += test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
+  }
+
+  void run() {
+    Kokkos::View< int, ExecSpace > d_errors = Kokkos::View< int, ExecSpace >( "Errors" );
+    errors = d_errors;
+
+    const int per_team1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128000 );
+    const int per_thread1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16000 );
+
+    {
+      Kokkos::TeamPolicy< TagFor, ExecSpace, ScheduleType > policy( 10, 8, 16 );
+
+      Kokkos::parallel_for( policy.set_scratch_size( 1, Kokkos::PerTeam( per_team1 ),
+                                                     Kokkos::PerThread( per_thread1 ) ),
+                            *this );
+      Kokkos::fence();
+
+      typename Kokkos::View< int, ExecSpace >::HostMirror h_errors = Kokkos::create_mirror_view( d_errors );
+      Kokkos::deep_copy( h_errors, d_errors );
+      ASSERT_EQ( h_errors(), 0 );
+    }
+
+    {
+      int error = 0;
+      Kokkos::TeamPolicy< TagReduce, ExecSpace, ScheduleType > policy( 10, 8, 16 );
+
+      Kokkos::parallel_reduce( policy.set_scratch_size( 1, Kokkos::PerTeam( per_team1 ),
+                                                        Kokkos::PerThread( per_thread1 ) ),
+                               *this, error );
+      Kokkos::fence();
+
+      ASSERT_EQ( error, 0 );
+    }
+  };
+
+  unsigned team_shmem_size( int team_size ) const {
+    const int per_team0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128 );
+    const int per_thread0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16 );
+    return per_team0 + team_size * per_thread0;
+  }
+};
+
+template< class ExecSpace, class ScheduleType >
+void test_team_mulit_level_scratch_test_lambda() {
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+  Kokkos::View< int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+  Kokkos::View< int, ExecSpace > d_errors( "Errors" );
+  errors = d_errors;
+
+  const int per_team0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128 );
+  const int per_thread0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16 );
+
+  const int per_team1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128000 );
+  const int per_thread1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16000 );
+
+  Kokkos::TeamPolicy< ExecSpace, ScheduleType > policy( 10, 8, 16 );
+
+  Kokkos::parallel_for( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ),
+                        KOKKOS_LAMBDA ( const typename Kokkos::TeamPolicy< ExecSpace >::member_type & team )
+  {
+    int error = test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
+    errors() += error;
+  });
+  Kokkos::fence();
+
+  typename Kokkos::View< int, ExecSpace >::HostMirror h_errors = Kokkos::create_mirror_view( errors );
+  Kokkos::deep_copy( h_errors, d_errors );
+  ASSERT_EQ( h_errors(), 0 );
+
+  int error = 0;
+  Kokkos::parallel_reduce( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ),
+                           KOKKOS_LAMBDA ( const typename Kokkos::TeamPolicy< ExecSpace >::member_type & team, int & count )
+  {
+    count += test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
+  }, error );
+  ASSERT_EQ( error, 0 );
+  Kokkos::fence();
+#endif
+#endif
+}
+
+} // namespace Test
+
+namespace {
+
+template< class ExecSpace, class ScheduleType >
+struct TestMultiLevelScratchTeam {
+  TestMultiLevelScratchTeam() { run(); }
+
+  void run()
+  {
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+    Test::test_team_mulit_level_scratch_test_lambda< ExecSpace, ScheduleType >();
+#endif
+    Test::ClassNoShmemSizeFunction< ExecSpace, ScheduleType > c1;
+    c1.run();
+
+    Test::ClassWithShmemSizeFunction< ExecSpace, ScheduleType > c2;
+    c2.run();
+  }
+};
+
+} // namespace
+
+namespace Test {
+
+template< class ExecSpace >
+struct TestShmemSize {
+  TestShmemSize() { run(); }
+
+  void run()
+  {
+    typedef Kokkos::View< long***, ExecSpace > view_type;
+
+    size_t d1 = 5;
+    size_t d2 = 6;
+    size_t d3 = 7;
+
+    size_t size = view_type::shmem_size( d1, d2, d3 );
+
+    ASSERT_EQ( size, d1 * d2 * d3 * sizeof( long ) );
+
+    test_layout_stride();
+  }
+
+  void test_layout_stride()
+  {
+    int rank = 3;
+    int order[3] = {2, 0, 1};
+    int extents[3] = {100, 10, 3};
+    auto s1 = Kokkos::View<double***, Kokkos::LayoutStride, ExecSpace>::shmem_size(Kokkos::LayoutStride::order_dimensions(rank, order, extents));
+    auto s2 = Kokkos::View<double***, Kokkos::LayoutRight, ExecSpace>::shmem_size(extents[0], extents[1], extents[2]);
+    ASSERT_EQ(s1, s2);
+  }
+};
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestTeamVector.hpp b/packages/kokkos/core/unit_test/TestTeamVector.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..51884a625adb000388dc922af86e875ea576d38c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestTeamVector.hpp
@@ -0,0 +1,909 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#include <impl/Kokkos_Timer.hpp>
+#include <iostream>
+#include <cstdlib>
+#include <cstdint>
+#include <cinttypes>
+
+namespace TestTeamVector {
+
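+// Minimal complex-number type used as a reduction scalar.  The volatile
+// overloads are required by the join/assign reduction interface in this
+// version of Kokkos.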
+struct my_complex {
+  double re, im;
+  int dummy;
+
+  KOKKOS_INLINE_FUNCTION
+  my_complex() {
+    re = 0.0;
+    im = 0.0;
+    dummy = 0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  my_complex( const my_complex & src ) {
+    re = src.re;
+    im = src.im;
+    dummy = src.dummy;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  my_complex & operator=( const my_complex & src ) {
+    re = src.re;
+    im = src.im;
+    dummy = src.dummy;
+    return *this ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  my_complex & operator=( const volatile my_complex & src ) {
+    re = src.re;
+    im = src.im;
+    dummy = src.dummy;
+    return *this ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  my_complex( const volatile my_complex & src ) {
+    re = src.re;
+    im = src.im;
+    dummy = src.dummy;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  my_complex( const double & val ) {
+    re = val;
+    im = 0.0;
+    dummy = 0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  my_complex & operator+=( const my_complex & src ) {
+    re += src.re;
+    im += src.im;
+    dummy += src.dummy;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator+=( const volatile my_complex & src ) volatile {
+    re += src.re;
+    im += src.im;
+    dummy += src.dummy;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  my_complex & operator*=( const my_complex & src ) {
+    double re_tmp = re * src.re - im * src.im;
+    double im_tmp = re * src.im + im * src.re;
+    re = re_tmp;
+    im = im_tmp;
+    dummy *= src.dummy;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator*=( const volatile my_complex & src ) volatile {
+    double re_tmp = re * src.re - im * src.im;
+    double im_tmp = re * src.im + im * src.re;
+    re = re_tmp;
+    im = im_tmp;
+    dummy *= src.dummy;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator==( const my_complex & src ) {
+    return ( re == src.re ) && ( im == src.im ) && ( dummy == src.dummy );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator!=( const my_complex & src ) {
+    return ( re != src.re ) || ( im != src.im ) || ( dummy != src.dummy );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator!=( const double & val ) {
+    return ( re != val ) || ( im != 0 ) || ( dummy != 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  my_complex & operator=( const int & val ) {
+    re = val;
+    im = 0.0;
+    dummy = 0;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  my_complex & operator=( const double & val ) {
+    re = val;
+    im = 0.0;
+    dummy = 0;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  operator double() {
+    return re;
+  }
+};
+}
+
+namespace Kokkos {
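+// Identity values for sum and product reductions of my_complex, forwarding to
+// the double identities; this is what lets the Sum and Prod reducers accept it.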
+template<>
+struct reduction_identity<TestTeamVector::my_complex > {
+  typedef reduction_identity<double> t_red_ident;
+  KOKKOS_FORCEINLINE_FUNCTION static TestTeamVector::my_complex sum()
+      {return TestTeamVector::my_complex(t_red_ident::sum());}
+  KOKKOS_FORCEINLINE_FUNCTION static TestTeamVector::my_complex prod()
+      {return TestTeamVector::my_complex(t_red_ident::prod());}
+};
+}
+
+namespace TestTeamVector {
+
+template< typename Scalar, class ExecutionSpace >
+struct functor_team_for {
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+
+  functor_team_for( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
+
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
+    typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+    typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
+    typedef typename shared_int::size_type size_type;
+
+    const size_type shmemSize = team.team_size() * 13;
+    shared_int values = shared_int( team.team_shmem(), shmemSize );
+
+    if ( values.data() == nullptr || values.extent(0) < shmemSize ) {
+      printf( "FAILED to allocate shared memory of size %u\n",
+              static_cast<unsigned int>( shmemSize ) );
+    }
+    else {
+      // Initialize shared memory.
+      values( team.team_rank() ) = 0;
+
+      // Accumulate value into per-thread shared memory.
+      // This is non-blocking.
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i )
+      {
+        values( team.team_rank() ) += i - team.league_rank() + team.league_size() + team.team_size();
+      });
+
+      // Wait for all memory to be written.
+      team.team_barrier();
+
+      // One thread per team executes the comparison.
+      Kokkos::single( Kokkos::PerTeam( team ), [&] ()
+      {
+        Scalar test = 0;
+        Scalar value = 0;
+
+        for ( int i = 0; i < 131; ++i ) {
+          test += i - team.league_rank() + team.league_size() + team.team_size();
+        }
+
+        for ( int i = 0; i < team.team_size(); ++i ) {
+          value += values( i );
+        }
+
+        if ( test != value ) {
+          printf ( "FAILED team_parallel_for %i %i %f %f\n",
+                   team.league_rank(), team.team_rank(),
+                   static_cast<double>( test ), static_cast<double>( value ) );
+          flag() = 1;
+        }
+      });
+    }
+  }
+};
+
+template< typename Scalar, class ExecutionSpace >
+struct functor_team_reduce {
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+
+  functor_team_reduce( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
+
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
+    Scalar value = Scalar();
+
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i, Scalar & val )
+    {
+      val += i - team.league_rank() + team.league_size() + team.team_size();
+    }, value );
+
+    team.team_barrier();
+
+    Kokkos::single( Kokkos::PerTeam( team ), [&] ()
+    {
+      Scalar test = 0;
+
+      for ( int i = 0; i < 131; ++i ) {
+        test += i - team.league_rank() + team.league_size() + team.team_size();
+      }
+
+      if ( test != value ) {
+        if ( team.league_rank() == 0 ) {
+          printf( "FAILED team_parallel_reduce %i %i %f %f %lu\n",
+                  team.league_rank(), team.team_rank(),
+                  static_cast<double>( test ), static_cast<double>( value ), sizeof( Scalar ) );
+        }
+
+        flag() = 1;
+      }
+    });
+  }
+};
+
+template< typename Scalar, class ExecutionSpace >
+struct functor_team_reduce_reducer {
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+
+  functor_team_reduce_reducer( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
+
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
+    Scalar value = 0;
+
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i, Scalar & val )
+    {
+      val += i - team.league_rank() + team.league_size() + team.team_size();
+    },
+      Kokkos::Experimental::Sum<Scalar>(value)
+    );
+
+    team.team_barrier();
+
+    Kokkos::single( Kokkos::PerTeam( team ), [&] ()
+    {
+      Scalar test = 0;
+
+      for ( int i = 0; i < 131; ++i ) {
+        test += i - team.league_rank() + team.league_size() + team.team_size();
+      }
+
+      if ( test != value ) {
+        printf( "FAILED team_vector_parallel_reduce_reducer %i %i %f %f\n",
+                team.league_rank(), team.team_rank(),
+                static_cast<double>( test ), static_cast<double>( value ) );
+
+        flag() = 1;
+      }
+    });
+  }
+};
+
+template< typename Scalar, class ExecutionSpace >
+struct functor_team_vector_for {
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+
+  functor_team_vector_for( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
+
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
+    typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+    typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
+    typedef typename shared_int::size_type size_type;
+
+    const size_type shmemSize = team.team_size() * 13;
+    shared_int values = shared_int( team.team_shmem(), shmemSize );
+
+    if ( values.data() == nullptr || values.extent(0) < shmemSize ) {
+      printf( "FAILED to allocate shared memory of size %u\n",
+              static_cast<unsigned int>( shmemSize ) );
+    }
+    else {
+      team.team_barrier();
+
+      Kokkos::single( Kokkos::PerThread( team ), [&] ()
+      {
+        values( team.team_rank() ) = 0;
+      });
+
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i )
+      {
+        Kokkos::single( Kokkos::PerThread( team ), [&] ()
+        {
+          values( team.team_rank() ) += i - team.league_rank() + team.league_size() + team.team_size();
+        });
+      });
+
+      team.team_barrier();
+
+      Kokkos::single( Kokkos::PerTeam( team ), [&] ()
+      {
+        Scalar test = 0;
+        Scalar value = 0;
+
+        for ( int i = 0; i < 131; ++i ) {
+          test += i - team.league_rank() + team.league_size() + team.team_size();
+        }
+
+        for ( int i = 0; i < team.team_size(); ++i ) {
+          value += values( i );
+        }
+
+        if ( test != value ) {
+          printf( "FAILED team_vector_parallel_for %i %i %f %f\n",
+                  team.league_rank(), team.team_rank(),
+                  static_cast<double>( test ), static_cast<double>( value ) );
+
+          flag() = 1;
+        }
+      });
+    }
+  }
+};
+
+template< typename Scalar, class ExecutionSpace >
+struct functor_team_vector_reduce {
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+  functor_team_vector_reduce( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
+
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
+    Scalar value = Scalar();
+
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i, Scalar & val )
+    {
+      val += i - team.league_rank() + team.league_size() + team.team_size();
+    }, value );
+
+    team.team_barrier();
+
+    Kokkos::single( Kokkos::PerTeam( team ), [&] ()
+    {
+      Scalar test = 0;
+
+      for ( int i = 0; i < 131; ++i ) {
+        test += i - team.league_rank() + team.league_size() + team.team_size();
+      }
+
+      if ( test != value ) {
+        if ( team.league_rank() == 0 ) {
+          printf( "FAILED team_vector_parallel_reduce %i %i %f %f %lu\n",
+                  team.league_rank(), team.team_rank(),
+                  static_cast<double>( test ), static_cast<double>( value ), sizeof( Scalar ) );
+        }
+
+        flag() = 1;
+      }
+    });
+  }
+};
+
+template< typename Scalar, class ExecutionSpace >
+struct functor_team_vector_reduce_reducer {
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+
+  functor_team_vector_reduce_reducer( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
+
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
+    Scalar value = 0;
+
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i, Scalar & val )
+    {
+      val += i - team.league_rank() + team.league_size() + team.team_size();
+    },
+      Kokkos::Experimental::Sum<Scalar>(value)
+    );
+
+    team.team_barrier();
+
+    Kokkos::single( Kokkos::PerTeam( team ), [&] ()
+    {
+      Scalar test = 0;
+
+      for ( int i = 0; i < 131; ++i ) {
+         test += i - team.league_rank() + team.league_size() + team.team_size();
+      }
+
+      if ( test != value ) {
+        printf( "FAILED team_vector_parallel_reduce_reducer %i %i %f %f\n",
+                team.league_rank(), team.team_rank(),
+                static_cast<double>( test ), static_cast<double>( value ) );
+
+        flag() = 1;
+      }
+    });
+  }
+};
+
+template< typename Scalar, class ExecutionSpace >
+struct functor_vec_single {
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+  functor_vec_single( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
+    // Warning: this test case intentionally violates permissible semantics.
+    // It is not valid to take references to variables in the enclosing scope
+    // inside a parallel_for and write to them.
+    Scalar value = 0;
+
+    Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i )
+    {
+      value = i; // This write is violating Kokkos semantics for nested parallelism.
+    });
+
+    Kokkos::single( Kokkos::PerThread( team ), [&] ( Scalar & val )
+    {
+      val = 1;
+    }, value );
+
+    Scalar value2 = 0;
+    Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i, Scalar & val )
+    {
+      val += value;
+    }, value2 );
+
+    if ( value2 != ( value * 13 ) ) {
+      printf( "FAILED vector_single broadcast %i %i %f %f\n",
+              team.league_rank(), team.team_rank(), (double) value2, (double) value );
+
+      flag() = 1;
+    }
+  }
+};
+
+template< typename Scalar, class ExecutionSpace >
+struct functor_vec_for {
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+
+  functor_vec_for( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
+
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
+    typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+    typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
+
+    shared_int values = shared_int( team.team_shmem(), team.team_size() * 13 );
+
+    if ( values.data() == nullptr || values.extent(0) < (unsigned) team.team_size() * 13 ) {
+      printf( "FAILED to allocate memory of size %i\n", static_cast<int>( team.team_size() * 13 ) );
+      flag() = 1;
+    }
+    else {
+      Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i )
+      {
+        values( 13 * team.team_rank() + i ) =
+          i - team.team_rank() - team.league_rank() + team.league_size() + team.team_size();
+      });
+
+      Kokkos::single( Kokkos::PerThread( team ), [&] ()
+      {
+        Scalar test = 0;
+        Scalar value = 0;
+
+        for ( int i = 0; i < 13; ++i ) {
+          test += i - team.team_rank() - team.league_rank() + team.league_size() + team.team_size();
+          value += values( 13 * team.team_rank() + i );
+        }
+
+        if ( test != value ) {
+          printf( "FAILED vector_par_for %i %i %f %f\n",
+                  team.league_rank(), team.team_rank(),
+                  static_cast<double>( test ), static_cast<double>( value ) );
+
+          flag() = 1;
+        }
+      });
+    }
+  }
+};
+
+template< typename Scalar, class ExecutionSpace >
+struct functor_vec_red {
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+
+  functor_vec_red( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
+    Scalar value = 0;
+
+    // When no reducer is given the default is summation.
+    Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i, Scalar & val )
+    {
+      val += i;
+    }, value );
+
+    Kokkos::single( Kokkos::PerThread( team ), [&] ()
+    {
+      Scalar test = 0;
+
+      for ( int i = 0; i < 13; i++ ) test += i;
+
+      if ( test != value ) {
+        printf( "FAILED vector_par_reduce %i %i %f %f\n",
+                team.league_rank(), team.team_rank(), (double) test, (double) value );
+
+        flag() = 1;
+      }
+    });
+  }
+};
+
+template< typename Scalar, class ExecutionSpace >
+struct functor_vec_red_reducer {
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+
+  functor_vec_red_reducer( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
+    // Must initialize to the identity value for the reduce operation
+    // for this test:
+    //   ( identity, operation ) = ( 1 , *= )
+    Scalar value = 1;
+
+    Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i, Scalar & val )
+    {
+      val *= ( i % 5 + 1 );
+    }, Kokkos::Experimental::Prod<Scalar>(value)
+    );
+
+    Kokkos::single( Kokkos::PerThread( team ), [&] ()
+    {
+      Scalar test = 1;
+
+      for ( int i = 0; i < 13; i++ ) test *= ( i % 5 + 1 );
+
+      if ( test != value ) {
+        printf( "FAILED vector_par_reduce_reducer %i %i %f %f\n",
+                team.league_rank(), team.team_rank(), (double) test, (double) value );
+
+        flag() = 1;
+      }
+    });
+  }
+};
+
+template< typename Scalar, class ExecutionSpace >
+struct functor_vec_scan {
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+  functor_vec_scan( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
+    Kokkos::parallel_scan( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i, Scalar & val, bool final )
+    {
+      val += i;
+
+      if ( final ) {
+        Scalar test = 0;
+        for ( int k = 0; k <= i; k++ ) test += k;
+
+        if ( test != val ) {
+          printf( "FAILED vector_par_scan %i %i %f %f\n",
+                  team.league_rank(), team.team_rank(), (double) test, (double) val );
+
+          flag() = 1;
+        }
+      }
+    });
+  }
+};
+
+template< typename Scalar, class ExecutionSpace >
+struct functor_reduce {
+  typedef double value_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+  functor_reduce( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team, double & sum ) const {
+    sum += team.league_rank() * 100 + team.team_rank();
+  }
+};
+
+template< typename Scalar, class ExecutionSpace >
+bool test_scalar( int nteams, int team_size, int test ) {
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > d_flag( "flag" );
+  typename Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace >::HostMirror h_flag( "h_flag" );
+  h_flag() = 0;
+  Kokkos::deep_copy( d_flag, h_flag );
+
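+  // Dispatch on 'test': 0-4 run the vector-level (functor_vec_*) tests,
+  // 5-7 the team-level (functor_team_*) tests, and 8-10 the combined
+  // team-vector (functor_team_vector_*) tests.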
+  if ( test == 0 ) {
+    Kokkos::parallel_for( std::string( "A" ), Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_vec_red< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 1 ) {
+    #if defined(KOKKOS_ENABLE_CUDA)
+    #if defined(KOKKOS_CUDA_CLANG_WORKAROUND) || defined(KOKKOS_ARCH_PASCAL)
+    if(!std::is_same<ExecutionSpace,Kokkos::Cuda>::value)
+    #endif
+    #endif
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_vec_red_reducer< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 2 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_vec_scan< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 3 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_vec_for< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 4 ) {
+    Kokkos::parallel_for( "B", Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_vec_single< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 5 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size ),
+                          functor_team_for< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 6 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size ),
+                          functor_team_reduce< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 7 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size ),
+                          functor_team_reduce_reducer< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 8 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_team_vector_for< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 9 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_team_vector_reduce< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 10 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_team_vector_reduce_reducer< Scalar, ExecutionSpace >( d_flag ) );
+  }
+
+  Kokkos::deep_copy( h_flag, d_flag );
+
+  return ( h_flag() == 0 );
+}
+
+template< class ExecutionSpace >
+bool Test( int test ) {
+  bool passed = true;
+  passed = passed && test_scalar< int, ExecutionSpace >( 317, 33, test );
+  passed = passed && test_scalar< long long int, ExecutionSpace >( 317, 33, test );
+  passed = passed && test_scalar< float, ExecutionSpace >( 317, 33, test );
+  passed = passed && test_scalar< double, ExecutionSpace >( 317, 33, test );
+  passed = passed && test_scalar< my_complex, ExecutionSpace >( 317, 33, test );
+
+  return passed;
+}
+
+} // namespace TestTeamVector
+
+namespace Test {
+
+// Computes y^T*A*x
+// ( modified from kokkos-tutorials/GTC2016/Exercises/ThreeLevelPar )
+
+#if ( ! defined( KOKKOS_ENABLE_CUDA ) ) || (defined( KOKKOS_ENABLE_CUDA_LAMBDA ) && (8000 <= CUDA_VERSION))
+
+template< typename ScalarType, class DeviceType >
+class TestTripleNestedReduce
+{
+public:
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
+
+  TestTripleNestedReduce( const size_type & nrows, const size_type & ncols
+                        , const size_type & team_size, const size_type & vector_length )
+  {
+    run_test( nrows, ncols, team_size, vector_length );
+  }
+
+  void run_test( const size_type & nrows, const size_type & ncols
+               , const size_type & team_size, const size_type & vector_length )
+  {
+    //typedef Kokkos::LayoutLeft Layout;
+    typedef Kokkos::LayoutRight Layout;
+
+    typedef Kokkos::View< ScalarType*, DeviceType >            ViewVector;
+    typedef Kokkos::View< ScalarType**, Layout, DeviceType >   ViewMatrix;
+
+    ViewVector y( "y", nrows );
+    ViewVector x( "x", ncols );
+    ViewMatrix A( "A", nrows, ncols );
+
+    typedef Kokkos::RangePolicy<DeviceType> range_policy;
+
+    // Initialize y vector.
+    Kokkos::parallel_for( range_policy( 0, nrows ), KOKKOS_LAMBDA ( const int i ) { y( i ) = 1; } );
+
+    // Initialize x vector.
+    Kokkos::parallel_for( range_policy( 0, ncols ), KOKKOS_LAMBDA ( const int i ) { x( i ) = 1; } );
+
+    typedef Kokkos::TeamPolicy< DeviceType >                        team_policy;
+    typedef typename Kokkos::TeamPolicy< DeviceType >::member_type  member_type;
+
+    // Initialize A matrix, note 2D indexing computation.
+    Kokkos::parallel_for( team_policy( nrows, Kokkos::AUTO ), KOKKOS_LAMBDA ( const member_type & teamMember ) {
+      const int j = teamMember.league_rank();
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember, ncols ), [&] ( const int i ) {
+        A( j, i ) = 1;
+      } );
+    } );
+
+    // Three level parallelism kernel to force caching of vector x.
+    ScalarType result = 0.0;
+    int chunk_size = 128;
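+    // Each team handles chunk_size consecutive rows; TeamThreadRange spreads
+    // those rows over the team's threads and ThreadVectorRange vectorizes each
+    // row's dot product with x.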
+    Kokkos::parallel_reduce( team_policy( nrows / chunk_size, team_size, vector_length ),
+                             KOKKOS_LAMBDA ( const member_type & teamMember, double & update ) {
+      const int row_start = teamMember.league_rank() * chunk_size;
+      const int row_end   = row_start + chunk_size;
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember, row_start, row_end ), [&] ( const int i ) {
+        ScalarType sum_i = 0.0;
+        Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( teamMember, ncols ), [&] ( const int j, ScalarType &innerUpdate ) {
+          innerUpdate += A( i, j ) * x( j );
+        }, sum_i );
+        Kokkos::single( Kokkos::PerThread( teamMember ), [&] () {
+          update += y( i ) * sum_i;
+        } );
+      } );
+    }, result );
+
+    const ScalarType solution = (ScalarType) nrows * (ScalarType) ncols;
+
+    if ( int64_t(solution) != int64_t(result) ) {
+      printf( "  TestTripleNestedReduce failed solution(%" PRId64 ") != result(%" PRId64 "),"
+              " nrows(%" PRId32 ") ncols(%" PRId32 ") league_size(%" PRId32 ") team_size(%" PRId32 ")\n"
+            , int64_t(solution)
+            , int64_t(result)
+            , int32_t(nrows)
+            , int32_t(ncols)
+            , int32_t(nrows/chunk_size)
+            , int32_t(team_size)
+            );
+    }
+
+    ASSERT_EQ( solution, result );
+  }
+};
+
+#else // #if ( ! defined( KOKKOS_ENABLE_CUDA ) ) || ( defined( KOKKOS_ENABLE_CUDA_LAMBDA ) && ( 8000 <= CUDA_VERSION ) )
+
+template< typename ScalarType, class DeviceType >
+class TestTripleNestedReduce
+{
+public:
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
+
+  TestTripleNestedReduce( const size_type &, const size_type
+                        , const size_type &, const size_type )
+  {}
+};
+
+#endif
+
+#if !defined(KOKKOS_CUDA_CLANG_WORKAROUND)
+TEST_F( TEST_CATEGORY, team_vector )
+{
+  ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 0 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 1 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 2 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 3 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 4 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 5 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 6 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 7 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 8 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 9 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< TEST_EXECSPACE >( 10 ) ) );
+}
+#endif
+
+#ifdef KOKKOS_COMPILER_GNU
+#if ( KOKKOS_COMPILER_GNU == 472 )
+#define SKIP_TEST
+#endif
+#endif
+
+#if !defined(KOKKOS_CUDA_CLANG_WORKAROUND)
+#ifndef SKIP_TEST
+TEST_F( TEST_CATEGORY, triple_nested_parallelism )
+{
+  TestTripleNestedReduce< double, TEST_EXECSPACE >( 8192, 2048, 32, 32 );
+  TestTripleNestedReduce< double, TEST_EXECSPACE >( 8192, 2048, 32, 16 );
+  TestTripleNestedReduce< double, TEST_EXECSPACE >( 8192, 2048, 16, 16 );
+}
+#endif
+#endif
+}
diff --git a/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp b/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ac32aeb0bedf79bde394d26770c26dd4a5b47dd0
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
@@ -0,0 +1,216 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#define KOKKOS_PRAGMA_UNROLL(a)
+
+namespace {
+
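+// The functor variants below differ only in their init/join signatures; the (currently commented-out) checks in TestTemplateMetaFunctions use them to probe Kokkos::Impl::FunctorHasInit and FunctorHasJoin detection.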
+template< class Scalar, class ExecutionSpace >
+struct SumPlain {
+  typedef ExecutionSpace execution_space;
+  typedef typename Kokkos::View< Scalar*, execution_space > type;
+
+  type view;
+
+  SumPlain( type view_ ) : view( view_ ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( int i, Scalar & val ) {
+    val += Scalar();
+  }
+};
+
+template< class Scalar, class ExecutionSpace >
+struct SumInitJoinFinalValueType {
+  typedef ExecutionSpace execution_space;
+  typedef typename Kokkos::View< Scalar*, execution_space > type;
+  typedef Scalar value_type;
+
+  type view;
+
+  SumInitJoinFinalValueType( type view_ ) : view( view_ ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & val ) const {
+    val = value_type();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & val, volatile value_type & src ) const {
+    val += src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i, value_type & val ) const {
+    val += value_type();
+  }
+};
+
+template< class Scalar, class ExecutionSpace >
+struct SumInitJoinFinalValueType2 {
+  typedef ExecutionSpace execution_space;
+  typedef typename Kokkos::View< Scalar*, execution_space > type;
+  typedef Scalar value_type;
+
+  type view;
+
+  SumInitJoinFinalValueType2( type view_ ) : view( view_ ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init( volatile value_type & val ) const {
+    val = value_type();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & val, const volatile value_type & src ) const {
+    val += src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i, value_type & val ) const {
+    val += value_type();
+  }
+};
+
+template< class Scalar, class ExecutionSpace >
+struct SumInitJoinFinalValueTypeArray {
+  typedef ExecutionSpace execution_space;
+  typedef typename Kokkos::View< Scalar*, execution_space > type;
+  typedef Scalar value_type[];
+
+  type view;
+  int n;
+
+  SumInitJoinFinalValueTypeArray( type view_, int n_ ) : view( view_ ), n( n_ ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type val ) const {
+    for ( int k = 0; k < n; k++ ) {
+      val[k] = 0;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type val, const volatile value_type src ) const {
+    for ( int k = 0; k < n; k++ ) {
+      val[k] += src[k];
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i, value_type val ) const {
+    for ( int k = 0; k < n; k++ ) {
+      val[k] += k * i;
+    }
+  }
+};
+
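+// "Wrong" variant: init() takes double& instead of value_type&, so the trait detection is expected to report no matching init.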
+template< class Scalar, class ExecutionSpace >
+struct SumWrongInitJoinFinalValueType {
+  typedef ExecutionSpace execution_space;
+  typedef typename Kokkos::View< Scalar*, execution_space > type;
+  typedef Scalar value_type;
+
+  type view;
+
+  SumWrongInitJoinFinalValueType( type view_ ) : view( view_ ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init( double & val ) const {
+    val = double();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & val, const value_type & src ) const {
+    val += src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i, value_type & val ) const {
+    val += value_type();
+  }
+};
+
+template< class Scalar, class ExecutionSpace >
+void TestTemplateMetaFunctions() {
+  typedef typename Kokkos::View< Scalar*, ExecutionSpace > type;
+  type a( "A", 100 );
+/*
+  int sum_plain_has_init_arg = Kokkos::Impl::FunctorHasInit< SumPlain<Scalar, ExecutionSpace>, Scalar & >::value;
+  ASSERT_EQ( sum_plain_has_init_arg, 0 );
+  int sum_initjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit< SumInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_initjoinfinalvaluetype_has_init_arg, 1 );
+  int sum_initjoinfinalvaluetype_has_init_arg2 = Kokkos::Impl::FunctorHasInit< SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_initjoinfinalvaluetype_has_init_arg2, 1 );
+  int sum_wronginitjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit< SumWrongInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_wronginitjoinfinalvaluetype_has_init_arg, 0 );
+
+  //int sum_initjoinfinalvaluetypearray_has_init_arg = Kokkos::Impl::FunctorHasInit< SumInitJoinFinalValueTypeArray<Scalar, ExecutionSpace>, Scalar[] >::value;
+  //ASSERT_EQ( sum_initjoinfinalvaluetypearray_has_init_arg, 1 );
+
+  //printf( "Values Init: %i %i %i\n", sum_plain_has_init_arg, sum_initjoinfinalvaluetype_has_init_arg, sum_wronginitjoinfinalvaluetype_has_init_arg );
+
+  int sum_plain_has_join_arg = Kokkos::Impl::FunctorHasJoin< SumPlain<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_plain_has_join_arg, 0 );
+  int sum_initjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin< SumInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_initjoinfinalvaluetype_has_join_arg, 1 );
+  int sum_initjoinfinalvaluetype_has_join_arg2 = Kokkos::Impl::FunctorHasJoin< SumInitJoinFinalValueType2<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_initjoinfinalvaluetype_has_join_arg2, 1 );
+  int sum_wronginitjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin< SumWrongInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_wronginitjoinfinalvaluetype_has_join_arg, 0 );
+
+  //printf( "Values Join: %i %i %i\n", sum_plain_has_join_arg, sum_initjoinfinalvaluetype_has_join_arg, sum_wronginitjoinfinalvaluetype_has_join_arg );
+*/
+}
+
+} // namespace
+
+namespace Test {
+TEST_F( TEST_CATEGORY, template_meta_functions )
+{
+  TestTemplateMetaFunctions< int, TEST_EXECSPACE >();
+}
+}
+
diff --git a/packages/kokkos/core/unit_test/TestTile.hpp b/packages/kokkos/core/unit_test/TestTile.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..704c7f994090144f034a1c435d0bd85af80f8ec6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestTile.hpp
@@ -0,0 +1,169 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#ifndef TEST_TILE_HPP
+#define TEST_TILE_HPP
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_ViewTile.hpp>
+
+namespace TestTile {
+
+template < typename Device, typename TileLayout >
+struct ReduceTileErrors
+{
+  typedef Device execution_space;
+  typedef Kokkos::View< ptrdiff_t**, TileLayout, Device >  array_type;
+  typedef Kokkos::View< ptrdiff_t[ TileLayout::N0 ][ TileLayout::N1 ], Kokkos::LayoutLeft, Device >  tile_type;
+  typedef ptrdiff_t value_type;
+
+  array_type m_array;
+
+  ReduceTileErrors( array_type a ) : m_array( a ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & errors ) { errors = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & errors,
+                    const volatile value_type & src_errors )
+  {
+    errors += src_errors;
+  }
+
+  // Initialize.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_t iwork ) const
+  {
+    const size_t i = iwork % m_array.extent(0);
+    const size_t j = iwork / m_array.extent(0);
+
+    if ( j < m_array.extent(1) ) {
+      m_array( i, j ) = &m_array( i, j ) - &m_array( 0, 0 );
+
+      //printf( "m_array(%d, %d) = %d\n", int( i ), int( j ), int( m_array( i, j ) ) );
+    }
+  }
+
+  // Verify:
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_t iwork, value_type & errors ) const
+  {
+    const size_t tile_dim0 = ( m_array.extent(0) + TileLayout::N0 - 1 ) / TileLayout::N0;
+    const size_t tile_dim1 = ( m_array.extent(1) + TileLayout::N1 - 1 ) / TileLayout::N1;
+
+    const size_t itile = iwork % tile_dim0;
+    const size_t jtile = iwork / tile_dim0;
+
+    if ( jtile < tile_dim1 ) {
+      tile_type tile = Kokkos::tile_subview( m_array, itile, jtile );
+
+      if ( tile( 0, 0 ) != ptrdiff_t( ( itile + jtile * tile_dim0 ) * TileLayout::N0 * TileLayout::N1 ) ) {
+        ++errors;
+      }
+      else {
+        for ( size_t j = 0; j < size_t( TileLayout::N1 ); ++j ) {
+          for ( size_t i = 0; i < size_t( TileLayout::N0 ); ++i ) {
+            const size_t iglobal = i + itile * TileLayout::N0;
+            const size_t jglobal = j + jtile * TileLayout::N1;
+
+            if ( iglobal < m_array.extent(0) && jglobal < m_array.extent(1) ) {
+              if ( tile( i, j ) != ptrdiff_t( tile( 0, 0 ) + i + j * TileLayout::N0 ) ) ++errors;
+
+              //printf( "tile(%d, %d)(%d, %d) = %d\n", int( itile ), int( jtile ), int( i ), int( j ), int( tile( i, j ) ) );
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
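+// Driver: a parallel_for fills each element with its offset from the view origin, then a parallel_reduce checks per tile that LayoutTileLeft stores each N0 x N1 tile contiguously (column-major within the tile).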
+template< class Space, unsigned N0, unsigned N1 >
+void test( const size_t dim0, const size_t dim1 )
+{
+  typedef Kokkos::LayoutTileLeft< N0, N1 >  array_layout;
+  typedef ReduceTileErrors< Space, array_layout > functor_type;
+
+  const size_t tile_dim0 = ( dim0 + N0 - 1 ) / N0;
+  const size_t tile_dim1 = ( dim1 + N1 - 1 ) / N1;
+
+  typename functor_type::array_type array( "", dim0, dim1 );
+
+  Kokkos::parallel_for( Kokkos::RangePolicy< Space, size_t >( 0, dim0 * dim1 ), functor_type( array ) );
+
+  ptrdiff_t error = 0;
+
+  Kokkos::parallel_reduce( Kokkos::RangePolicy< Space, size_t >( 0, tile_dim0 * tile_dim1 ), functor_type( array ), error );
+
+  EXPECT_EQ( error, ptrdiff_t( 0 ) );
+}
+
+} // namespace TestTile
+
+namespace Test {
+TEST_F( TEST_CATEGORY, tile_layout )
+{
+  TestTile::test< TEST_EXECSPACE, 1, 1 >( 1, 1 );
+  TestTile::test< TEST_EXECSPACE, 1, 1 >( 2, 3 );
+  TestTile::test< TEST_EXECSPACE, 1, 1 >( 9, 10 );
+
+  TestTile::test< TEST_EXECSPACE, 2, 2 >( 1, 1 );
+  TestTile::test< TEST_EXECSPACE, 2, 2 >( 2, 3 );
+  TestTile::test< TEST_EXECSPACE, 2, 2 >( 4, 4 );
+  TestTile::test< TEST_EXECSPACE, 2, 2 >( 9, 9 );
+
+  TestTile::test< TEST_EXECSPACE, 2, 4 >( 9, 9 );
+  TestTile::test< TEST_EXECSPACE, 4, 2 >( 9, 9 );
+
+  TestTile::test< TEST_EXECSPACE, 4, 4 >( 1, 1 );
+  TestTile::test< TEST_EXECSPACE, 4, 4 >( 4, 4 );
+  TestTile::test< TEST_EXECSPACE, 4, 4 >( 9, 9 );
+  TestTile::test< TEST_EXECSPACE, 4, 4 >( 9, 11 );
+
+  TestTile::test< TEST_EXECSPACE, 8, 8 >( 1, 1 );
+  TestTile::test< TEST_EXECSPACE, 8, 8 >( 4, 4 );
+  TestTile::test< TEST_EXECSPACE, 8, 8 >( 9, 9 );
+  TestTile::test< TEST_EXECSPACE, 8, 8 >( 9, 11 );
+}
+
+}
+#endif //TEST_TILE_HPP
diff --git a/packages/kokkos/core/unit_test/TestUniqueToken.hpp b/packages/kokkos/core/unit_test/TestUniqueToken.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5c8324c63d7fb0f3699be4130b962136c4dc2afc
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestUniqueToken.hpp
@@ -0,0 +1,138 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
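+// Each worker acquires a token, verifies exclusive ownership through an atomic flag in 'verify', records reuse in 'counts', then releases the token; any violation increments 'errors'.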
+template< class Space >
+class TestUniqueToken
+{
+public:
+  typedef typename Space::execution_space  execution_space;
+  typedef Kokkos::View< int * , execution_space > view_type ;
+
+  Kokkos::Experimental::UniqueToken< execution_space , Kokkos::Experimental::UniqueTokenScope::Global > tokens ;
+
+  view_type verify ;
+  view_type counts ;
+  view_type errors ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( long ) const
+  {
+    const int32_t t = tokens.acquire();
+
+    bool ok = true ;
+
+    ok = ok && 0 <= t ;
+    ok = ok && t < tokens.size();
+    ok = ok && 0 == Kokkos::atomic_fetch_add( & verify(t) , 1 );
+
+    Kokkos::atomic_fetch_add( & counts(t) , 1 );
+
+    ok = ok && 1 == Kokkos::atomic_fetch_add( & verify(t) , -1 );
+
+    if ( ! ok ) { Kokkos::atomic_fetch_add( & errors(0) , 1 ) ; }
+
+    tokens.release(t);
+  }
+
+  TestUniqueToken()
+    : tokens( execution_space() )
+    , verify( "TestUniqueTokenVerify" , tokens.size() )
+    , counts( "TestUniqueTokenCounts" , tokens.size() )
+    , errors( "TestUniqueTokenErrors" , 1 )
+    {}
+
+  static void run()
+    {
+      using policy = Kokkos::RangePolicy<execution_space> ;
+
+      TestUniqueToken self ;
+
+      {
+        const int duplicate = 100 ;
+        const long n = duplicate * self.tokens.size();
+
+        Kokkos::parallel_for( policy(0,n) , self );
+        Kokkos::parallel_for( policy(0,n) , self );
+        Kokkos::parallel_for( policy(0,n) , self );
+        Kokkos::fence();
+      }
+
+      typename view_type::HostMirror host_counts =
+        Kokkos::create_mirror_view( self.counts );
+
+      Kokkos::deep_copy( host_counts , self.counts );
+
+      int32_t max = 0 ;
+
+      {
+        const long n = host_counts.extent(0);
+        for ( long i = 0 ; i < n ; ++i ) {
+          if ( max < host_counts[i] ) max = host_counts[i] ;
+        }
+      }
+
+      std::cout << "TestUniqueToken max reuse = " << max << std::endl ;
+
+      typename view_type::HostMirror host_errors =
+        Kokkos::create_mirror_view( self.errors );
+
+      Kokkos::deep_copy( host_errors , self.errors );
+
+      ASSERT_EQ( host_errors(0) , 0 );
+    }
+};
+
+
+TEST_F( TEST_CATEGORY, unique_token )
+{
+  TestUniqueToken< TEST_EXECSPACE >::run();
+}
+
+} // namespace Test
+
diff --git a/packages/kokkos/core/unit_test/TestUtilities.hpp b/packages/kokkos/core/unit_test/TestUtilities.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f3deabad032ae2bbb8a7798d57ee052c3ee7d4a6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestUtilities.hpp
@@ -0,0 +1,301 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
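+// Compile-time checks of Kokkos::Impl integer-sequence utilities: make_integer_sequence, integer_sequence_at, at(), reverse_integer_sequence, and the inclusive/exclusive scan variants.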
+inline
+void test_utilities()
+{
+  using namespace Kokkos::Impl;
+
+  {
+    using i = integer_sequence< int >;
+    using j = make_integer_sequence< int, 0 >;
+
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
+    static_assert( i::size() == 0u, "Error: integer_sequence.size()" );
+  }
+
+  {
+    using i = integer_sequence< int, 0 >;
+    using j = make_integer_sequence< int, 1 >;
+
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
+    static_assert( i::size() == 1u, "Error: integer_sequence.size()" );
+
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+  }
+
+  {
+    using i = integer_sequence< int, 0, 1 >;
+    using j = make_integer_sequence< int, 2 >;
+
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
+    static_assert( i::size() == 2u, "Error: integer_sequence.size()" );
+
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+  }
+
+  {
+    using i = integer_sequence< int, 0, 1, 2 >;
+    using j = make_integer_sequence< int, 3 >;
+
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
+    static_assert( i::size() == 3u, "Error: integer_sequence.size()" );
+
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+  }
+
+  {
+    using i = integer_sequence< int, 0, 1, 2, 3 >;
+    using j = make_integer_sequence< int, 4 >;
+
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
+    static_assert( i::size() == 4u, "Error: integer_sequence.size()" );
+
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+  }
+
+  {
+    using i = integer_sequence< int, 0, 1, 2, 3, 4 >;
+    using j = make_integer_sequence< int, 5 >;
+
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
+    static_assert( i::size() == 5u, "Error: integer_sequence.size()" );
+
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+  }
+
+  {
+    using i = integer_sequence< int, 0, 1, 2, 3, 4, 5 >;
+    using j = make_integer_sequence< int, 6 >;
+
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
+    static_assert( i::size() == 6u, "Error: integer_sequence.size()" );
+
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 5, i >::value == 5, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 5, i{} ) == 5, "Error: at(unsigned, integer_sequence)" );
+  }
+
+  {
+    using i = integer_sequence< int, 0, 1, 2, 3, 4, 5, 6 >;
+    using j = make_integer_sequence< int, 7 >;
+
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
+    static_assert( i::size() == 7u, "Error: integer_sequence.size()" );
+
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 5, i >::value == 5, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 6, i >::value == 6, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 5, i{} ) == 5, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 6, i{} ) == 6, "Error: at(unsigned, integer_sequence)" );
+  }
+
+  {
+    using i = integer_sequence< int, 0, 1, 2, 3, 4, 5, 6, 7 >;
+    using j = make_integer_sequence< int, 8 >;
+
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
+    static_assert( i::size() == 8u, "Error: integer_sequence.size()" );
+
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 5, i >::value == 5, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 6, i >::value == 6, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 7, i >::value == 7, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 5, i{} ) == 5, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 6, i{} ) == 6, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 7, i{} ) == 7, "Error: at(unsigned, integer_sequence)" );
+  }
+
+  {
+    using i = integer_sequence< int, 0, 1, 2, 3, 4, 5, 6, 7, 8 >;
+    using j = make_integer_sequence< int, 9 >;
+
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
+    static_assert( i::size() == 9u, "Error: integer_sequence.size()" );
+
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 5, i >::value == 5, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 6, i >::value == 6, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 7, i >::value == 7, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 8, i >::value == 8, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 5, i{} ) == 5, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 6, i{} ) == 6, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 7, i{} ) == 7, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 8, i{} ) == 8, "Error: at(unsigned, integer_sequence)" );
+  }
+
+  {
+    using i = integer_sequence< int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 >;
+    using j = make_integer_sequence< int, 10 >;
+
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
+    static_assert( i::size() == 10u, "Error: integer_sequence.size()" );
+
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 5, i >::value == 5, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 6, i >::value == 6, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 7, i >::value == 7, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 8, i >::value == 8, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 9, i >::value == 9, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 5, i{} ) == 5, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 6, i{} ) == 6, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 7, i{} ) == 7, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 8, i{} ) == 8, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 9, i{} ) == 9, "Error: at(unsigned, integer_sequence)" );
+  }
+
+  {
+    using i = make_integer_sequence< int, 5 >;
+    using r = reverse_integer_sequence< i >;
+    using gr = integer_sequence< int, 4, 3, 2, 1, 0 >;
+
+    static_assert( std::is_same< r, gr >::value, "Error: reverse_integer_sequence" );
+  }
+
+  {
+    using s = make_integer_sequence< int, 10 >;
+    using e = exclusive_scan_integer_sequence< s >;
+    using i = inclusive_scan_integer_sequence< s >;
+
+    using ge = integer_sequence< int, 0, 0, 1, 3, 6, 10, 15, 21, 28, 36 >;
+    using gi = integer_sequence< int, 0, 1, 3, 6, 10, 15, 21, 28, 36, 45 >;
+
+    static_assert( e::value == 45, "Error: scan value" );
+    static_assert( i::value == 45, "Error: scan value" );
+
+    static_assert( std::is_same< e::type, ge >::value, "Error: exclusive_scan" );
+    static_assert( std::is_same< i::type, gi >::value, "Error: inclusive_scan" );
+  }
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestViewAPI.hpp b/packages/kokkos/core/unit_test/TestViewAPI.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8f624fab9321b4244b495f82831f534756e4635d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestViewAPI.hpp
@@ -0,0 +1,1543 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+namespace Test {
+
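+// Returns the view's span when it covers the view's size and required_allocation_size( 100 ) for int* reports 400 bytes (assuming 4-byte int); otherwise returns 0.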
+template< class T, class ... P >
+size_t allocation_count( const Kokkos::View< T, P... > & view )
+{
+  const size_t card  = view.size();
+  const size_t alloc = view.span();
+
+  const int memory_span = Kokkos::View< int* >::required_allocation_size( 100 );
+
+  return ( card <= alloc && memory_span == 400 ) ? alloc : 0;
+}
+
+/*--------------------------------------------------------------------------*/
+
+template< typename T, class DeviceType >
+struct TestViewOperator
+{
+  typedef typename DeviceType::execution_space  execution_space;
+
+  enum { N = 1000 };
+  enum { D = 3 };
+
+  typedef Kokkos::View< T*[D], execution_space > view_type;
+
+  const view_type v1;
+  const view_type v2;
+
+  TestViewOperator()
+    : v1( "v1", N )
+    , v2( "v2", N )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned i ) const
+  {
+    const unsigned X = 0;
+    const unsigned Y = 1;
+    const unsigned Z = 2;
+
+    v2( i, X ) = v1( i, X );
+    v2( i, Y ) = v1( i, Y );
+    v2( i, Z ) = v1( i, Z );
+  }
+};
+
+/*--------------------------------------------------------------------------*/
+
+template< class DataType,
+          class DeviceType,
+          unsigned Rank = Kokkos::ViewTraits< DataType >::rank >
+struct TestViewOperator_LeftAndRight;
+
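+// Each rank specialization walks the index space in layout order and sets error bits: 1 = LayoutLeft offsets not strictly increasing or outside the allocation, 2 = the same for LayoutRight, 4 / 8 = a LayoutStride alias of the left / right view maps an index to a different address.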
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 8 >
+{
+  typedef typename DeviceType::execution_space    execution_space;
+  typedef typename DeviceType::memory_space       memory_space;
+  typedef typename execution_space::size_type     size_type;
+
+  typedef int value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update,
+                    const volatile value_type & input )
+  { update |= input; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  { update = 0; }
+
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutStride, execution_space > stride_view;
+
+  left_view    left;
+  right_view   right;
+  stride_view  left_stride;
+  stride_view  right_stride;
+  long         left_alloc;
+  long         right_alloc;
+
+  TestViewOperator_LeftAndRight()
+    : left(  "left" )
+    , right( "right" )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  void testit()
+  {
+    int error_flag = 0;
+
+    Kokkos::parallel_reduce( 1, *this, error_flag );
+
+    ASSERT_EQ( error_flag, 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type, value_type & update ) const
+  {
+    long offset = -1;
+
+    for ( unsigned i7 = 0; i7 < unsigned( left.extent(7) ); ++i7 )
+    for ( unsigned i6 = 0; i6 < unsigned( left.extent(6) ); ++i6 )
+    for ( unsigned i5 = 0; i5 < unsigned( left.extent(5) ); ++i5 )
+    for ( unsigned i4 = 0; i4 < unsigned( left.extent(4) ); ++i4 )
+    for ( unsigned i3 = 0; i3 < unsigned( left.extent(3) ); ++i3 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.extent(2) ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.extent(1) ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.extent(0) ); ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3, i4, i5, i6, i7 ) -
+                     & left(  0,  0,  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
+
+      if ( & left( i0, i1, i2, i3, i4, i5, i6, i7 ) !=
+           & left_stride( i0, i1, i2, i3, i4, i5, i6, i7 ) ) {
+        update |= 4;
+      }
+    }
+
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.extent(0) ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.extent(1) ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.extent(2) ); ++i2 )
+    for ( unsigned i3 = 0; i3 < unsigned( right.extent(3) ); ++i3 )
+    for ( unsigned i4 = 0; i4 < unsigned( right.extent(4) ); ++i4 )
+    for ( unsigned i5 = 0; i5 < unsigned( right.extent(5) ); ++i5 )
+    for ( unsigned i6 = 0; i6 < unsigned( right.extent(6) ); ++i6 )
+    for ( unsigned i7 = 0; i7 < unsigned( right.extent(7) ); ++i7 )
+    {
+      const long j = & right( i0, i1, i2, i3, i4, i5, i6, i7 ) -
+                     & right(  0,  0,  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
+
+      if ( & right( i0, i1, i2, i3, i4, i5, i6, i7 ) !=
+           & right_stride( i0, i1, i2, i3, i4, i5, i6, i7 ) ) {
+        update |= 8;
+      }
+    }
+  }
+};
+
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 7 >
+{
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
+
+  typedef int value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update,
+                    const volatile value_type & input )
+  { update |= input; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  { update = 0; }
+
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+
+  left_view    left;
+  right_view   right;
+  long         left_alloc;
+  long         right_alloc;
+
+  TestViewOperator_LeftAndRight()
+    : left(  "left" )
+    , right( "right" )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  void testit()
+  {
+
+    int error_flag = 0;
+
+    Kokkos::parallel_reduce( 1, *this, error_flag );
+
+    ASSERT_EQ( error_flag, 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type, value_type & update ) const
+  {
+    long offset = -1;
+
+    for ( unsigned i6 = 0; i6 < unsigned( left.extent(6) ); ++i6 )
+    for ( unsigned i5 = 0; i5 < unsigned( left.extent(5) ); ++i5 )
+    for ( unsigned i4 = 0; i4 < unsigned( left.extent(4) ); ++i4 )
+    for ( unsigned i3 = 0; i3 < unsigned( left.extent(3) ); ++i3 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.extent(2) ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.extent(1) ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.extent(0) ); ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3, i4, i5, i6 ) -
+                     & left(  0,  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
+    }
+
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.extent(0) ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.extent(1) ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.extent(2) ); ++i2 )
+    for ( unsigned i3 = 0; i3 < unsigned( right.extent(3) ); ++i3 )
+    for ( unsigned i4 = 0; i4 < unsigned( right.extent(4) ); ++i4 )
+    for ( unsigned i5 = 0; i5 < unsigned( right.extent(5) ); ++i5 )
+    for ( unsigned i6 = 0; i6 < unsigned( right.extent(6) ); ++i6 )
+    {
+      const long j = & right( i0, i1, i2, i3, i4, i5, i6 ) -
+                     & right(  0,  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
+    }
+  }
+};
+
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 6 >
+{
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
+
+  typedef int value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update,
+                    const volatile value_type & input )
+  { update |= input; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  { update = 0; }
+
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+
+  left_view    left;
+  right_view   right;
+  long         left_alloc;
+  long         right_alloc;
+
+  TestViewOperator_LeftAndRight()
+    : left(  "left" )
+    , right( "right" )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  void testit()
+  {
+
+    int error_flag = 0;
+
+    Kokkos::parallel_reduce( 1, *this, error_flag );
+
+    ASSERT_EQ( error_flag, 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type, value_type & update ) const
+  {
+    long offset = -1;
+
+    for ( unsigned i5 = 0; i5 < unsigned( left.extent(5) ); ++i5 )
+    for ( unsigned i4 = 0; i4 < unsigned( left.extent(4) ); ++i4 )
+    for ( unsigned i3 = 0; i3 < unsigned( left.extent(3) ); ++i3 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.extent(2) ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.extent(1) ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.extent(0) ); ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3, i4, i5 ) -
+                     & left(  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
+    }
+
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.extent(0) ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.extent(1) ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.extent(2) ); ++i2 )
+    for ( unsigned i3 = 0; i3 < unsigned( right.extent(3) ); ++i3 )
+    for ( unsigned i4 = 0; i4 < unsigned( right.extent(4) ); ++i4 )
+    for ( unsigned i5 = 0; i5 < unsigned( right.extent(5) ); ++i5 )
+    {
+      const long j = & right( i0, i1, i2, i3, i4, i5 ) -
+                     & right(  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
+    }
+  }
+};
+
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 5 >
+{
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
+
+  typedef int value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update,
+                    const volatile value_type & input )
+  { update |= input; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  { update = 0; }
+
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutStride, execution_space > stride_view;
+
+  left_view    left;
+  right_view   right;
+  stride_view  left_stride;
+  stride_view  right_stride;
+  long         left_alloc;
+  long         right_alloc;
+
+  TestViewOperator_LeftAndRight()
+    : left(  "left" )
+    , right( "right" )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  void testit()
+  {
+
+    int error_flag = 0;
+
+    Kokkos::parallel_reduce( 1, *this, error_flag );
+
+    ASSERT_EQ( error_flag, 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type, value_type & update ) const
+  {
+    long offset = -1;
+
+    for ( unsigned i4 = 0; i4 < unsigned( left.extent(4) ); ++i4 )
+    for ( unsigned i3 = 0; i3 < unsigned( left.extent(3) ); ++i3 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.extent(2) ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.extent(1) ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.extent(0) ); ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3, i4 ) -
+                     & left(  0,  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
+
+      if ( & left( i0, i1, i2, i3, i4 ) !=
+           & left_stride( i0, i1, i2, i3, i4 ) ) { update |= 4; }
+    }
+
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.extent(0) ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.extent(1) ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.extent(2) ); ++i2 )
+    for ( unsigned i3 = 0; i3 < unsigned( right.extent(3) ); ++i3 )
+    for ( unsigned i4 = 0; i4 < unsigned( right.extent(4) ); ++i4 )
+    {
+      const long j = & right( i0, i1, i2, i3, i4 ) -
+                     & right(  0,  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
+
+      if ( & right( i0, i1, i2, i3, i4 ) !=
+           & right_stride( i0, i1, i2, i3, i4 ) ) { update |= 8; }
+    }
+  }
+};
+
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 4 >
+{
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
+
+  typedef int value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update,
+                    const volatile value_type & input )
+  { update |= input; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  { update = 0; }
+
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+
+  left_view    left;
+  right_view   right;
+  long         left_alloc;
+  long         right_alloc;
+
+  TestViewOperator_LeftAndRight()
+    : left(  "left" )
+    , right( "right" )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  void testit()
+  {
+
+    int error_flag = 0;
+
+    Kokkos::parallel_reduce( 1, *this, error_flag );
+
+    ASSERT_EQ( error_flag, 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type, value_type & update ) const
+  {
+    long offset = -1;
+
+    for ( unsigned i3 = 0; i3 < unsigned( left.extent(3) ); ++i3 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.extent(2) ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.extent(1) ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.extent(0) ); ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3 ) -
+                     & left(  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
+    }
+
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.extent(0) ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.extent(1) ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.extent(2) ); ++i2 )
+    for ( unsigned i3 = 0; i3 < unsigned( right.extent(3) ); ++i3 )
+    {
+      const long j = & right( i0, i1, i2, i3 ) -
+                     & right(  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
+    }
+  }
+};
+
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 3 >
+{
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
+
+  typedef int value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update,
+                    const volatile value_type & input )
+  { update |= input; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  { update = 0; }
+
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutStride, execution_space > stride_view;
+
+  left_view    left;
+  right_view   right;
+  stride_view  left_stride;
+  stride_view  right_stride;
+  long         left_alloc;
+  long         right_alloc;
+
+  TestViewOperator_LeftAndRight()
+    : left(  std::string( "left" ) )
+    , right( std::string( "right" ) )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  void testit()
+  {
+
+    int error_flag = 0;
+
+    Kokkos::parallel_reduce( 1, *this, error_flag );
+
+    ASSERT_EQ( error_flag, 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type, value_type & update ) const
+  {
+    long offset = -1;
+
+    for ( unsigned i2 = 0; i2 < unsigned( left.extent(2) ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.extent(1) ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.extent(0) ); ++i0 )
+    {
+      const long j = & left( i0, i1, i2 ) -
+                     & left(  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
+
+      if ( & left( i0, i1, i2 ) != & left_stride( i0, i1, i2 ) ) { update |= 4; }
+    }
+
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.extent(0) ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.extent(1) ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.extent(2) ); ++i2 )
+    {
+      const long j = & right( i0, i1, i2 ) -
+                     & right(  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
+
+      if ( & right( i0, i1, i2 ) != & right_stride( i0, i1, i2 ) ) { update |= 8; }
+    }
+
+    for ( unsigned i0 = 0; i0 < unsigned( left.extent(0) ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.extent(1) ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.extent(2) ); ++i2 )
+    {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+        if ( & left( i0, i1, i2 )  != & left( i0, i1, i2, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+        if ( & right( i0, i1, i2 ) != & right( i0, i1, i2, 0, 0, 0, 0, 0 ) ) { update |= 3; }
+#else
+        if ( & left( i0, i1, i2 )  != & left.access( i0, i1, i2, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+        if ( & right( i0, i1, i2 ) != & right.access( i0, i1, i2, 0, 0, 0, 0, 0 ) ) { update |= 3; }
+#endif
+    }
+  }
+};
+
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 2 >
+{
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
+
+  typedef int value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update,
+                    const volatile value_type & input )
+  { update |= input; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  { update = 0; }
+
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+
+  left_view    left;
+  right_view   right;
+  long         left_alloc;
+  long         right_alloc;
+
+  TestViewOperator_LeftAndRight()
+    : left(  "left" )
+    , right( "right" )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  void testit()
+  {
+
+    int error_flag = 0;
+
+    Kokkos::parallel_reduce( 1, *this, error_flag );
+
+    ASSERT_EQ( error_flag, 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type, value_type & update ) const
+  {
+    long offset = -1;
+
+    for ( unsigned i1 = 0; i1 < unsigned( left.extent(1) ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.extent(0) ); ++i0 )
+    {
+      const long j = & left( i0, i1 ) -
+                     & left(  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
+    }
+
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.extent(0) ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.extent(1) ); ++i1 )
+    {
+      const long j = & right( i0, i1 ) -
+                     & right(  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
+    }
+
+    for ( unsigned i0 = 0; i0 < unsigned( left.extent(0) ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.extent(1) ); ++i1 )
+    {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+      if ( & left( i0, i1 )  != & left( i0, i1, 0, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+      if ( & right( i0, i1 ) != & right( i0, i1, 0, 0, 0, 0, 0, 0 ) ) { update |= 3; }
+#else
+      if ( & left( i0, i1 )  != & left.access( i0, i1, 0, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+      if ( & right( i0, i1 ) != & right.access( i0, i1, 0, 0, 0, 0, 0, 0 ) ) { update |= 3; }
+#endif
+    }
+  }
+};
+
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 1 >
+{
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
+
+  typedef int value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update,
+                    const volatile value_type & input )
+  { update |= input; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  { update = 0; }
+
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutStride, execution_space > stride_view;
+
+  left_view    left;
+  right_view   right;
+  stride_view  left_stride;
+  stride_view  right_stride;
+  long         left_alloc;
+  long         right_alloc;
+
+  TestViewOperator_LeftAndRight()
+    : left(  "left" )
+    , right( "right" )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  void testit()
+  {
+
+    int error_flag = 0;
+
+    Kokkos::parallel_reduce( 1, *this, error_flag );
+
+    ASSERT_EQ( error_flag, 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type, value_type & update ) const
+  {
+    for ( unsigned i0 = 0; i0 < unsigned( left.extent(0) ); ++i0 )
+    {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+      if ( & left( i0 )  != & left( i0, 0, 0, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+      if ( & right( i0 ) != & right( i0, 0, 0, 0, 0, 0, 0, 0 ) ) { update |= 3; }
+#else
+      if ( & left( i0 )  != & left.access( i0, 0, 0, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+      if ( & right( i0 ) != & right.access( i0, 0, 0, 0, 0, 0, 0, 0 ) ) { update |= 3; }
+#endif
+      if ( & left( i0 )  != & left_stride( i0 ) ) { update |= 4; }
+      if ( & right( i0 ) != & right_stride( i0 ) ) { update |= 8; }
+    }
+  }
+};
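+// Editorial sketch (not part of the upstream test): the specializations above
+// encode each failed check as a distinct bit of the reduction value, so the
+// bitwise-OR join preserves every kind of failure across work items.  A
+// minimal analogue of the init/join contract used here:
+//
+//   int update = 0;      // init(): identity element of operator|
+//   update |= 1;         // a LayoutLeft ordering violation would set bit 0
+//   update |= 4;         // a left/stride pointer mismatch would set bit 2
+//   update |= other;     // join(): partial results combine with operator|
+//
+// so a final value of 0 means every check in every iteration passed.
+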
+
+template< class Layout, class DeviceType >
+struct TestViewMirror
+{
+  template< class MemoryTraits >
+  void static test_mirror() {
+    Kokkos::View< double*, Layout, Kokkos::HostSpace > a_org( "A", 1000 );
+    Kokkos::View< double*, Layout, Kokkos::HostSpace, MemoryTraits > a_h = a_org;
+    auto a_h2 = Kokkos::create_mirror( Kokkos::HostSpace(), a_h );
+    auto a_d = Kokkos::create_mirror( DeviceType(), a_h );
+
+    int equal_ptr_h_h2 = ( a_h.data()  == a_h2.data() ) ? 1 : 0;
+    int equal_ptr_h_d  = ( a_h.data()  ==  a_d.data() ) ? 1 : 0;
+    int equal_ptr_h2_d = ( a_h2.data() ==  a_d.data() ) ? 1 : 0;
+
+    ASSERT_EQ( equal_ptr_h_h2, 0 );
+    ASSERT_EQ( equal_ptr_h_d, 0 );
+    ASSERT_EQ( equal_ptr_h2_d, 0 );
+
+    ASSERT_EQ( a_h.extent(0), a_h2.extent(0) );
+    ASSERT_EQ( a_h.extent(0), a_d .extent(0) );
+  }
+
+  template< class MemoryTraits >
+  void static test_mirror_view() {
+    Kokkos::View< double*, Layout, Kokkos::HostSpace > a_org( "A", 1000 );
+    Kokkos::View< double*, Layout, Kokkos::HostSpace, MemoryTraits > a_h = a_org;
+    auto a_h2 = Kokkos::create_mirror_view( Kokkos::HostSpace(), a_h );
+    auto a_d = Kokkos::create_mirror_view( DeviceType(), a_h );
+
+    int equal_ptr_h_h2 = a_h.data()  == a_h2.data() ? 1 : 0;
+    int equal_ptr_h_d  = a_h.data()  ==  a_d.data() ? 1 : 0;
+    int equal_ptr_h2_d = a_h2.data() ==  a_d.data() ? 1 : 0;
+
+    int is_same_memspace = std::is_same< Kokkos::HostSpace, typename DeviceType::memory_space >::value ? 1 : 0;
+    ASSERT_EQ( equal_ptr_h_h2, 1 );
+    ASSERT_EQ( equal_ptr_h_d, is_same_memspace );
+    ASSERT_EQ( equal_ptr_h2_d, is_same_memspace );
+
+    ASSERT_EQ( a_h.extent(0), a_h2.extent(0) );
+    ASSERT_EQ( a_h.extent(0), a_d .extent(0) );
+  }
+
+  template< class MemoryTraits >
+  void static test_mirror_copy() {
+    Kokkos::View< double*, Layout, Kokkos::HostSpace > a_org( "A", 10 );
+    a_org(5) = 42.0;
+    Kokkos::View< double*, Layout, Kokkos::HostSpace, MemoryTraits > a_h = a_org;
+    auto a_h2 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_h );
+    auto a_d = Kokkos::create_mirror_view_and_copy( DeviceType(), a_h );
+    auto a_h3 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_d );
+
+    int equal_ptr_h_h2 = a_h.data()  == a_h2.data() ? 1 : 0;
+    int equal_ptr_h_d  = a_h.data()  ==  a_d.data() ? 1 : 0;
+    int equal_ptr_h2_d = a_h2.data() ==  a_d.data() ? 1 : 0;
+    int equal_ptr_h3_d = a_h3.data() ==  a_d.data() ? 1 : 0;
+
+    int is_same_memspace = std::is_same< Kokkos::HostSpace, typename DeviceType::memory_space >::value ? 1 : 0;
+    ASSERT_EQ( equal_ptr_h_h2, 1 );
+    ASSERT_EQ( equal_ptr_h_d, is_same_memspace );
+    ASSERT_EQ( equal_ptr_h2_d, is_same_memspace );
+    ASSERT_EQ( equal_ptr_h3_d, is_same_memspace );
+
+    ASSERT_EQ( a_h.extent(0), a_h3.extent(0) );
+    ASSERT_EQ( a_h.extent(0), a_h2.extent(0) );
+    ASSERT_EQ( a_h.extent(0), a_d .extent(0) );
+    ASSERT_EQ( a_org(5), a_h3(5) );
+  }
+
+
+  void static testit() {
+    test_mirror< Kokkos::MemoryTraits<0> >();
+    test_mirror< Kokkos::MemoryTraits<Kokkos::Unmanaged> >();
+    test_mirror_view< Kokkos::MemoryTraits<0> >();
+    test_mirror_view< Kokkos::MemoryTraits<Kokkos::Unmanaged> >();
+    test_mirror_copy< Kokkos::MemoryTraits<0> >();
+    test_mirror_copy< Kokkos::MemoryTraits<Kokkos::Unmanaged> >();
+  }
+};
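+// Editorial sketch (exposition only): the three cases above hinge on the
+// difference between create_mirror, which always allocates, and
+// create_mirror_view, which may simply alias its argument when the requested
+// memory space already matches.  Assuming a HostSpace view:
+//
+//   Kokkos::View< double*, Kokkos::HostSpace > a( "a", 10 );
+//   auto m  = Kokkos::create_mirror( a );       // new allocation: m.data() != a.data()
+//   auto mv = Kokkos::create_mirror_view( a );  // same space: mv.data() == a.data()
+//   auto mc = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a );
+//   // mc additionally deep_copies whenever a new allocation was required.
+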
+
+/*--------------------------------------------------------------------------*/
+
+template< typename T, class DeviceType >
+class TestViewAPI
+{
+public:
+  typedef DeviceType device;
+
+  enum { N0 = 1000,
+         N1 = 3,
+         N2 = 5,
+         N3 = 7 };
+
+  typedef Kokkos::View< T, device > dView0;
+  typedef Kokkos::View< T*, device > dView1;
+  typedef Kokkos::View< T*[N1], device > dView2;
+  typedef Kokkos::View< T*[N1][N2], device > dView3;
+  typedef Kokkos::View< T*[N1][N2][N3], device > dView4;
+  typedef Kokkos::View< const T*[N1][N2][N3], device > const_dView4;
+  typedef Kokkos::View< T****, device, Kokkos::MemoryUnmanaged > dView4_unmanaged;
+  typedef typename dView0::host_mirror_space host;
+
+  TestViewAPI()
+  {
+    run_test_mirror();
+    run_test();
+    run_test_scalar();
+    run_test_const();
+    run_test_subview();
+    run_test_subview_strided();
+    run_test_vector();
+
+    { TestViewOperator< T, device > f; Kokkos::parallel_for( int( N0 ), f ); }
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2][3], device > f8; f8.testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2], device > f7; f7.testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4], device > f6; f6.testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3], device > f5; f5.testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2], device > f4; f4.testit();
+    TestViewOperator_LeftAndRight< int[2][3][4], device > f3; f3.testit();
+    TestViewOperator_LeftAndRight< int[2][3], device > f2; f2.testit();
+    TestViewOperator_LeftAndRight< int[2], device > f1; f1.testit();
+#endif
+    TestViewMirror< Kokkos::LayoutLeft, device >::testit();
+    TestViewMirror< Kokkos::LayoutRight, device >::testit();
+  }
+
+  static void run_test_mirror()
+  {
+    typedef Kokkos::View< int, host > view_type;
+    typedef typename view_type::HostMirror mirror_type;
+
+    static_assert( std::is_same< typename view_type::memory_space, typename mirror_type::memory_space >::value, "" );
+
+    view_type a( "a" );
+    mirror_type am = Kokkos::create_mirror_view( a );
+    mirror_type ax = Kokkos::create_mirror( a );
+    ASSERT_EQ( & a(), & am() );
+  }
+
+  static void run_test_scalar()
+  {
+    typedef typename dView0::HostMirror  hView0;
+
+    dView0 dx, dy;
+    hView0 hx, hy;
+
+    dx = dView0( "dx" );
+    dy = dView0( "dy" );
+
+    hx = Kokkos::create_mirror( dx );
+    hy = Kokkos::create_mirror( dy );
+
+    hx() = 1;
+
+    Kokkos::deep_copy( dx, hx );
+    Kokkos::deep_copy( dy, dx );
+    Kokkos::deep_copy( hy, dy );
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
+    ASSERT_EQ( hx(), hy() );
+#endif
+  }
+
+  static void run_test()
+  {
+    // mfh 14 Feb 2014: This test doesn't actually use instances of these
+    // types.  To avoid "declared but unused typedef" warnings, we
+    // default-construct one instance of each type and cast it to void,
+    // the usual marker for intentionally unused variables.
+
+    typedef typename dView0::HostMirror  hView0;
+    typedef typename dView1::HostMirror  hView1;
+    typedef typename dView2::HostMirror  hView2;
+    typedef typename dView3::HostMirror  hView3;
+    typedef typename dView4::HostMirror  hView4;
+
+    {
+      hView0 thing;
+      (void) thing;
+    }
+    {
+      hView1 thing;
+      (void) thing;
+    }
+    {
+      hView2 thing;
+      (void) thing;
+    }
+    {
+      hView3 thing;
+      (void) thing;
+    }
+    {
+      hView4 thing;
+      (void) thing;
+    }
+
+    dView4 dx, dy, dz;
+    hView4 hx, hy, hz;
+
+    ASSERT_TRUE( dx.data() == 0 );
+    ASSERT_TRUE( dy.data() == 0 );
+    ASSERT_TRUE( dz.data() == 0 );
+    ASSERT_TRUE( hx.data() == 0 );
+    ASSERT_TRUE( hy.data() == 0 );
+    ASSERT_TRUE( hz.data() == 0 );
+    ASSERT_EQ( dx.extent(0), 0u );
+    ASSERT_EQ( dy.extent(0), 0u );
+    ASSERT_EQ( dz.extent(0), 0u );
+    ASSERT_EQ( hx.extent(0), 0u );
+    ASSERT_EQ( hy.extent(0), 0u );
+    ASSERT_EQ( hz.extent(0), 0u );
+    ASSERT_EQ( dx.extent(1), unsigned( N1 ) );
+    ASSERT_EQ( dy.extent(1), unsigned( N1 ) );
+    ASSERT_EQ( dz.extent(1), unsigned( N1 ) );
+    ASSERT_EQ( hx.extent(1), unsigned( N1 ) );
+    ASSERT_EQ( hy.extent(1), unsigned( N1 ) );
+    ASSERT_EQ( hz.extent(1), unsigned( N1 ) );
+
+    dx = dView4( "dx", N0 );
+    dy = dView4( "dy", N0 );
+
+    ASSERT_EQ( dx.use_count(), size_t( 1 ) );
+
+    dView4_unmanaged unmanaged_dx = dx;
+    ASSERT_EQ( dx.use_count(), size_t( 1 ) );
+
+    dView4_unmanaged unmanaged_from_ptr_dx = dView4_unmanaged( dx.data(),
+                                                               dx.extent(0),
+                                                               dx.extent(1),
+                                                               dx.extent(2),
+                                                               dx.extent(3) );
+
+    {
+      // Destruction of this view should be harmless.
+      const_dView4 unmanaged_from_ptr_const_dx( dx.data(),
+                                                dx.extent(0),
+                                                dx.extent(1),
+                                                dx.extent(2),
+                                                dx.extent(3) );
+    }
+
+    const_dView4 const_dx = dx;
+    ASSERT_EQ( dx.use_count(), size_t( 2 ) );
+
+    {
+      const_dView4 const_dx2;
+      const_dx2 = const_dx;
+      ASSERT_EQ( dx.use_count(), size_t( 3 ) );
+
+      const_dx2 = dy;
+      ASSERT_EQ( dx.use_count(), size_t( 2 ) );
+
+      const_dView4 const_dx3( dx );
+      ASSERT_EQ( dx.use_count(), size_t( 3 ) );
+
+      dView4_unmanaged dx4_unmanaged( dx );
+      ASSERT_EQ( dx.use_count(), size_t( 3 ) );
+    }
+
+    ASSERT_EQ( dx.use_count(), size_t( 2 ) );
+
+    ASSERT_FALSE( dx.data() == 0 );
+    ASSERT_FALSE( const_dx.data() == 0 );
+    ASSERT_FALSE( unmanaged_dx.data() == 0 );
+    ASSERT_FALSE( unmanaged_from_ptr_dx.data() == 0 );
+    ASSERT_FALSE( dy.data() == 0 );
+    ASSERT_NE( dx, dy );
+
+    ASSERT_EQ( dx.extent(0), unsigned( N0 ) );
+    ASSERT_EQ( dx.extent(1), unsigned( N1 ) );
+    ASSERT_EQ( dx.extent(2), unsigned( N2 ) );
+    ASSERT_EQ( dx.extent(3), unsigned( N3 ) );
+
+    ASSERT_EQ( dy.extent(0), unsigned( N0 ) );
+    ASSERT_EQ( dy.extent(1), unsigned( N1 ) );
+    ASSERT_EQ( dy.extent(2), unsigned( N2 ) );
+    ASSERT_EQ( dy.extent(3), unsigned( N3 ) );
+
+    ASSERT_EQ( unmanaged_from_ptr_dx.span(), unsigned( N0 ) * unsigned( N1 ) * unsigned( N2 ) * unsigned( N3 ) );
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+    return;
+#endif
+    hx = Kokkos::create_mirror( dx );
+    hy = Kokkos::create_mirror( dy );
+
+    // T v1 = hx();       // Generates compile error as intended.
+    // T v2 = hx( 0, 0 ); // Generates compile error as intended.
+    // hx( 0, 0 ) = v2;   // Generates compile error as intended.
+
+    // Testing with asynchronous deep copy with respect to device
+    {
+      size_t count = 0;
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < hx.extent(1); ++i1 )
+      for ( size_t i2 = 0; i2 < hx.extent(2); ++i2 )
+      for ( size_t i3 = 0; i3 < hx.extent(3); ++i3 )
+      {
+        hx( ip, i1, i2, i3 ) = ++count;
+      }
+
+      Kokkos::deep_copy( typename hView4::execution_space(), dx, hx );
+      Kokkos::deep_copy( typename hView4::execution_space(), dy, dx );
+      Kokkos::deep_copy( typename hView4::execution_space(), hy, dy );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), hy( ip, i1, i2, i3 ) );
+      }
+
+      Kokkos::deep_copy( typename hView4::execution_space(), dx, T( 0 ) );
+      Kokkos::deep_copy( typename hView4::execution_space(), hx, dx );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), T( 0 ) );
+      }
+    }
+
+    // Testing with asynchronous deep copy with respect to host.
+    {
+      size_t count = 0;
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < hx.extent(1); ++i1 )
+      for ( size_t i2 = 0; i2 < hx.extent(2); ++i2 )
+      for ( size_t i3 = 0; i3 < hx.extent(3); ++i3 )
+      {
+        hx( ip, i1, i2, i3 ) = ++count;
+      }
+
+      Kokkos::deep_copy( typename dView4::execution_space(), dx, hx );
+      Kokkos::deep_copy( typename dView4::execution_space(), dy, dx );
+      Kokkos::deep_copy( typename dView4::execution_space(), hy, dy );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), hy( ip, i1, i2, i3 ) );
+      }
+
+      Kokkos::deep_copy( typename dView4::execution_space(), dx, T( 0 ) );
+      Kokkos::deep_copy( typename dView4::execution_space(), hx, dx );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), T( 0 ) );
+      }
+    }
+
+    // Testing with synchronous deep copy.
+    {
+      size_t count = 0;
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < hx.extent(1); ++i1 )
+      for ( size_t i2 = 0; i2 < hx.extent(2); ++i2 )
+      for ( size_t i3 = 0; i3 < hx.extent(3); ++i3 )
+      {
+        hx( ip, i1, i2, i3 ) = ++count;
+      }
+
+      Kokkos::deep_copy( dx, hx );
+      Kokkos::deep_copy( dy, dx );
+      Kokkos::deep_copy( hy, dy );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), hy( ip, i1, i2, i3 ) );
+      }
+
+      Kokkos::deep_copy( dx, T( 0 ) );
+      Kokkos::deep_copy( hx, dx );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), T( 0 ) );
+      }
+    }
+
+    dz = dx;
+    ASSERT_EQ( dx, dz );
+    ASSERT_NE( dy, dz );
+
+    dz = dy;
+    ASSERT_EQ( dy, dz );
+    ASSERT_NE( dx, dz );
+
+    dx = dView4();
+    ASSERT_TRUE( dx.data() == 0 );
+    ASSERT_FALSE( dy.data() == 0 );
+    ASSERT_FALSE( dz.data() == 0 );
+
+    dy = dView4();
+    ASSERT_TRUE( dx.data() == 0 );
+    ASSERT_TRUE( dy.data() == 0 );
+    ASSERT_FALSE( dz.data() == 0 );
+
+    dz = dView4();
+    ASSERT_TRUE( dx.data() == 0 );
+    ASSERT_TRUE( dy.data() == 0 );
+    ASSERT_TRUE( dz.data() == 0 );
+
+    // Check Deep Copy of LayoutLeft to LayoutRight
+    {
+      Kokkos::View<double*,Kokkos::LayoutLeft> dll("dll",10);
+      Kokkos::View<double*,Kokkos::LayoutRight,Kokkos::HostSpace> hlr("hlr",10);
+      Kokkos::deep_copy(dll,hlr);
+      Kokkos::deep_copy(hlr,dll);
+    }
+
+    // Check Deep Copy of two empty 1D views
+    {
+      Kokkos::View<double*> d;
+      Kokkos::View<double*,Kokkos::HostSpace> h;
+      Kokkos::deep_copy(d,h);
+      Kokkos::deep_copy(h,d);
+    }
+
+    // Check Deep Copy of two empty 2D views
+    {
+      Kokkos::View<double*[3],Kokkos::LayoutRight> d;
+      Kokkos::View<double*[3],Kokkos::LayoutRight,Kokkos::HostSpace> h;
+      Kokkos::deep_copy(d,h);
+      Kokkos::deep_copy(h,d);
+    }
+
+  }
+
+  typedef T DataType[2];
+
+  static void
+  check_auto_conversion_to_const(
+     const Kokkos::View< const DataType, device > & arg_const,
+     const Kokkos::View< DataType, device > & arg )
+  {
+    ASSERT_TRUE( arg_const == arg );
+  }
+
+  static void run_test_const()
+  {
+    typedef Kokkos::View< DataType, device > typeX;
+    typedef Kokkos::View< const DataType, device > const_typeX;
+    typedef Kokkos::View< const DataType, device, Kokkos::MemoryRandomAccess > const_typeR;
+
+    typeX x( "X" );
+    const_typeX xc = x;
+    const_typeR xr = x;
+
+    ASSERT_TRUE( xc == x );
+    ASSERT_TRUE( x == xc );
+
+    // For CUDA, the const random-access View does not return an lvalue
+    // reference because values are fetched through the texture cache, so
+    // querying the underlying pointer is not allowed.
+#if defined( KOKKOS_ENABLE_CUDA )
+    if ( !std::is_same< typename device::execution_space, Kokkos::Cuda >::value )
+#endif
+    {
+      ASSERT_TRUE( x.data() == xr.data() );
+    }
+
+    // typeX xf = xc; // Setting non-const from const must not compile.
+
+    check_auto_conversion_to_const( x, x );
+  }
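+
+  // Editorial note (exposition only): assigning View< T, ... > to
+  // View< const T, ... > is an implicit, safe conversion, analogous to
+  // T* -> const T*; the reverse direction must not compile, which is why
+  // the "typeX xf = xc" line above is left commented out.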
+
+  static void run_test_subview()
+  {
+    typedef Kokkos::View< const T, device > sView;
+
+    dView0 d0( "d0" );
+    dView1 d1( "d1", N0 );
+    dView2 d2( "d2", N0 );
+    dView3 d3( "d3", N0 );
+    dView4 d4( "d4", N0 );
+
+    sView s0 = d0;
+    sView s1 = Kokkos::subview( d1, 1 );
+    sView s2 = Kokkos::subview( d2, 1, 1 );
+    sView s3 = Kokkos::subview( d3, 1, 1, 1 );
+    sView s4 = Kokkos::subview( d4, 1, 1, 1, 1 );
+  }
+
+  static void run_test_subview_strided()
+  {
+    typedef Kokkos::View< int ****, Kokkos::LayoutLeft , host >  view_left_4;
+    typedef Kokkos::View< int ****, Kokkos::LayoutRight, host >  view_right_4;
+    typedef Kokkos::View< int **  , Kokkos::LayoutLeft , host >  view_left_2;
+    typedef Kokkos::View< int **  , Kokkos::LayoutRight, host >  view_right_2;
+
+    typedef Kokkos::View< int * ,  Kokkos::LayoutStride, host >  view_stride_1;
+    typedef Kokkos::View< int **,  Kokkos::LayoutStride, host >  view_stride_2;
+
+    view_left_2  xl2( "xl2", 100, 200 );
+    view_right_2 xr2( "xr2", 100, 200 );
+    view_stride_1 yl1 = Kokkos::subview( xl2, 0, Kokkos::ALL() );
+    view_stride_1 yl2 = Kokkos::subview( xl2, 1, Kokkos::ALL() );
+    view_stride_1 yr1 = Kokkos::subview( xr2, 0, Kokkos::ALL() );
+    view_stride_1 yr2 = Kokkos::subview( xr2, 1, Kokkos::ALL() );
+
+    ASSERT_EQ( yl1.extent(0), xl2.extent(1) );
+    ASSERT_EQ( yl2.extent(0), xl2.extent(1) );
+    ASSERT_EQ( yr1.extent(0), xr2.extent(1) );
+    ASSERT_EQ( yr2.extent(0), xr2.extent(1) );
+
+    ASSERT_EQ( & yl1( 0 ) - & xl2( 0, 0 ), 0 );
+    ASSERT_EQ( & yl2( 0 ) - & xl2( 1, 0 ), 0 );
+    ASSERT_EQ( & yr1( 0 ) - & xr2( 0, 0 ), 0 );
+    ASSERT_EQ( & yr2( 0 ) - & xr2( 1, 0 ), 0 );
+
+    view_left_4 xl4( "xl4", 10, 20, 30, 40 );
+    view_right_4 xr4( "xr4", 10, 20, 30, 40 );
+
+    view_stride_2 yl4 = Kokkos::subview( xl4, 1, Kokkos::ALL(), 2, Kokkos::ALL() );
+    view_stride_2 yr4 = Kokkos::subview( xr4, 1, Kokkos::ALL(), 2, Kokkos::ALL() );
+
+    ASSERT_EQ( yl4.extent(0), xl4.extent(1) );
+    ASSERT_EQ( yl4.extent(1), xl4.extent(3) );
+    ASSERT_EQ( yr4.extent(0), xr4.extent(1) );
+    ASSERT_EQ( yr4.extent(1), xr4.extent(3) );
+
+    ASSERT_EQ( & yl4( 4, 4 ) - & xl4( 1, 4, 2, 4 ), 0 );
+    ASSERT_EQ( & yr4( 4, 4 ) - & xr4( 1, 4, 2, 4 ), 0 );
+  }
+
+  static void run_test_vector()
+  {
+    static const unsigned Length = 1000, Count = 8;
+
+    typedef Kokkos::View< T*,  Kokkos::LayoutLeft, host > vector_type;
+    typedef Kokkos::View< T**, Kokkos::LayoutLeft, host > multivector_type;
+
+    typedef Kokkos::View< T*,  Kokkos::LayoutRight, host > vector_right_type;
+    typedef Kokkos::View< T**, Kokkos::LayoutRight, host > multivector_right_type;
+
+    typedef Kokkos::View< const T*,  Kokkos::LayoutRight, host > const_vector_right_type;
+    typedef Kokkos::View< const T*,  Kokkos::LayoutLeft,  host > const_vector_type;
+    typedef Kokkos::View< const T**, Kokkos::LayoutLeft,  host > const_multivector_type;
+
+    multivector_type mv = multivector_type( "mv", Length, Count );
+    multivector_right_type mv_right = multivector_right_type( "mv", Length, Count );
+
+    vector_type v1 = Kokkos::subview( mv, Kokkos::ALL(), 0 );
+    vector_type v2 = Kokkos::subview( mv, Kokkos::ALL(), 1 );
+    vector_type v3 = Kokkos::subview( mv, Kokkos::ALL(), 2 );
+
+    vector_type rv1 = Kokkos::subview( mv_right, 0, Kokkos::ALL() );
+    vector_type rv2 = Kokkos::subview( mv_right, 1, Kokkos::ALL() );
+    vector_type rv3 = Kokkos::subview( mv_right, 2, Kokkos::ALL() );
+
+    multivector_type mv1 = Kokkos::subview( mv, std::make_pair( 1, 998 ),
+                                                std::make_pair( 2, 5 ) );
+
+    multivector_right_type mvr1 = Kokkos::subview( mv_right, std::make_pair( 1, 998 ),
+                                                             std::make_pair( 2, 5 ) );
+
+    const_vector_type cv1 = Kokkos::subview( mv, Kokkos::ALL(), 0 );
+    const_vector_type cv2 = Kokkos::subview( mv, Kokkos::ALL(), 1 );
+    const_vector_type cv3 = Kokkos::subview( mv, Kokkos::ALL(), 2 );
+
+    vector_right_type vr1 = Kokkos::subview( mv, Kokkos::ALL(), 0 );
+    vector_right_type vr2 = Kokkos::subview( mv, Kokkos::ALL(), 1 );
+    vector_right_type vr3 = Kokkos::subview( mv, Kokkos::ALL(), 2 );
+
+    const_vector_right_type cvr1 = Kokkos::subview( mv, Kokkos::ALL(), 0 );
+    const_vector_right_type cvr2 = Kokkos::subview( mv, Kokkos::ALL(), 1 );
+    const_vector_right_type cvr3 = Kokkos::subview( mv, Kokkos::ALL(), 2 );
+
+    ASSERT_TRUE( & v1[0] == & v1( 0 ) );
+    ASSERT_TRUE( & v1[0] == & mv( 0, 0 ) );
+    ASSERT_TRUE( & v2[0] == & mv( 0, 1 ) );
+    ASSERT_TRUE( & v3[0] == & mv( 0, 2 ) );
+
+    ASSERT_TRUE( & cv1[0] == & mv( 0, 0 ) );
+    ASSERT_TRUE( & cv2[0] == & mv( 0, 1 ) );
+    ASSERT_TRUE( & cv3[0] == & mv( 0, 2 ) );
+
+    ASSERT_TRUE( & vr1[0] == & mv( 0, 0 ) );
+    ASSERT_TRUE( & vr2[0] == & mv( 0, 1 ) );
+    ASSERT_TRUE( & vr3[0] == & mv( 0, 2 ) );
+
+    ASSERT_TRUE( & cvr1[0] == & mv( 0, 0 ) );
+    ASSERT_TRUE( & cvr2[0] == & mv( 0, 1 ) );
+    ASSERT_TRUE( & cvr3[0] == & mv( 0, 2 ) );
+
+    ASSERT_TRUE( & mv1( 0, 0 ) == & mv( 1, 2 ) );
+    ASSERT_TRUE( & mv1( 1, 1 ) == & mv( 2, 3 ) );
+    ASSERT_TRUE( & mv1( 3, 2 ) == & mv( 4, 4 ) );
+    ASSERT_TRUE( & mvr1( 0, 0 ) == & mv_right( 1, 2 ) );
+    ASSERT_TRUE( & mvr1( 1, 1 ) == & mv_right( 2, 3 ) );
+    ASSERT_TRUE( & mvr1( 3, 2 ) == & mv_right( 4, 4 ) );
+
+    const_vector_type c_cv1( v1 );
+    typename vector_type::const_type c_cv2( v2 );
+    typename const_vector_type::const_type c_ccv2( v2 );
+
+    const_multivector_type cmv( mv );
+    typename multivector_type::const_type cmvX( cmv );
+    typename const_multivector_type::const_type ccmvX( cmv );
+  }
+};
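+
+// Editorial sketch (not part of the upstream test): run_test_subview_strided
+// above relies on the rule that slicing one index out of a LayoutLeft or
+// LayoutRight view generally produces a view whose natural layout is
+// LayoutStride.  Ignoring padding, for example:
+//
+//   Kokkos::View< int**, Kokkos::LayoutLeft, Kokkos::HostSpace > x( "x", 100, 200 );
+//   auto row = Kokkos::subview( x, 0, Kokkos::ALL() );
+//   // row has extent 200 and stride 100 (the parent's extent(0)), so it is
+//   // assignable to Kokkos::View< int*, Kokkos::LayoutStride, Kokkos::HostSpace >.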
+
+#if !defined(KOKKOS_ENABLE_ROCM)
+TEST_F( TEST_CATEGORY, view_api )
+{
+  TestViewAPI< double, TEST_EXECSPACE >();
+}
+#endif
+
+TEST_F( TEST_CATEGORY, view_remap )
+{
+  enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
+
+  #ifdef KOKKOS_ENABLE_CUDA
+    #define EXECSPACE std::conditional<std::is_same<TEST_EXECSPACE,Kokkos::Cuda>::value,Kokkos::CudaHostPinnedSpace,TEST_EXECSPACE>::type
+  #else
+    #ifdef KOKKOS_ENABLE_ROCM
+      #define EXECSPACE std::conditional<std::is_same<TEST_EXECSPACE,Kokkos::Experimental::ROCm>::value,Kokkos::Experimental::ROCmHostPinnedSpace,TEST_EXECSPACE>::type
+    #else
+      #if defined(KOKKOS_ENABLE_OPENMPTARGET)
+        #define EXECSPACE Kokkos::HostSpace
+      #else
+        #define EXECSPACE TEST_EXECSPACE
+      #endif
+    #endif
+  #endif
+
+  typedef Kokkos::View< double*[N1][N2][N3],
+                        Kokkos::LayoutRight,
+                        EXECSPACE > output_type;
+
+  typedef Kokkos::View< int**[N2][N3],
+                        Kokkos::LayoutLeft,
+                        EXECSPACE > input_type;
+
+  typedef Kokkos::View< int*[N0][N2][N3],
+                        Kokkos::LayoutLeft,
+                        EXECSPACE > diff_type;
+
+  output_type output( "output", N0 );
+  input_type  input ( "input", N0, N1 );
+  diff_type   diff  ( "diff", N0 );
+
+  Kokkos::fence();
+  int value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    input( i0, i1, i2, i3 ) = ++value;
+  }
+
+  Kokkos::fence();
+  // Kokkos::deep_copy( diff, input ); // Throw with incompatible shape.
+  Kokkos::deep_copy( output, input );
+  Kokkos::fence();
+
+  value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    ++value;
+    ASSERT_EQ( value, ( (int) output( i0, i1, i2, i3 ) ) );
+  }
+}
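+
+// Editorial note (exposition only): deep_copy between views of different
+// layouts, as exercised by view_remap above, is a remap that requires the
+// extents of source and destination to match exactly; the EXECSPACE macro
+// above selects a host-accessible memory space (e.g. CudaHostPinnedSpace on
+// CUDA builds) so the element-by-element verification loop can read the
+// result directly from host code.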
+
+TEST_F( TEST_CATEGORY, view_mirror_nonconst )
+{
+  Kokkos::View<int*, TEST_EXECSPACE> d_view("d_view", 10);
+  Kokkos::View<const int*, TEST_EXECSPACE> d_view_const = d_view;
+  auto h_view = Kokkos::create_mirror(d_view_const);
+  Kokkos::deep_copy(h_view, d_view_const);
+  auto h_view2 = Kokkos::create_mirror(Kokkos::HostSpace(), d_view_const);
+  Kokkos::deep_copy(h_view2, d_view_const);
+}
+
+template <typename DataType, typename ... Extents>
+void test_left_stride(Extents ... extents) {
+  using view_type = Kokkos::View<DataType, Kokkos::LayoutLeft, Kokkos::HostSpace>;
+  view_type view("view", extents...);
+  size_t expected_stride = 1;
+  size_t all_strides[view_type::rank + 1];
+  view.stride(all_strides);
+  for (int i = 0; i < view_type::rank; ++i) {
+    ASSERT_EQ(view.stride(i), expected_stride);
+    ASSERT_EQ(all_strides[i], expected_stride);
+    expected_stride *= view.extent(i);
+  }
+}
+
+template <typename DataType, typename ... Extents>
+void test_right_stride(Extents ... extents) {
+  using view_type = Kokkos::View<DataType, Kokkos::LayoutRight, Kokkos::HostSpace>;
+  view_type view("view", extents...);
+  size_t expected_stride = 1;
+  size_t all_strides[view_type::rank + 1];
+  view.stride(all_strides);
+  for (int ri = 0; ri < view_type::rank; ++ri) {
+    auto i = view_type::rank - 1 - ri;
+    ASSERT_EQ(view.stride(i), expected_stride);
+    ASSERT_EQ(all_strides[i], expected_stride);
+    expected_stride *= view.extent(i);
+  }
+}
+
+template <typename DataType, typename ... Extents>
+void test_stride(Extents ... extents) {
+  test_right_stride<DataType>(extents...);
+  test_left_stride<DataType>(extents...);
+}
+
+TEST_F( TEST_CATEGORY, view_stride_method )
+{
+  test_stride<double[3]>();
+  test_stride<double*>(3);
+  test_stride<double[3][7][13]>();
+  test_stride<double***>(3, 7, 13);
+  // factorial(8) = 40320
+  test_stride<double[1][2][3][4][5][6][7][8]>();
+  test_stride<double********>(1, 2, 3, 4, 5, 6, 7, 8);
+}
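+
+// Editorial sketch (not part of the upstream test): for an unpadded view the
+// strides verified above follow directly from the layout.  Assuming extents
+// 3 x 7 x 13:
+//
+//   LayoutLeft  : stride(0) = 1,  stride(1) = 3,   stride(2) = 3 * 7
+//   LayoutRight : stride(2) = 1,  stride(1) = 13,  stride(0) = 13 * 7
+//
+// which is exactly the running product computed by test_left_stride and
+// test_right_stride.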
+
+inline void test_anonymous_space() {
+  /* TEST_EXECSPACE is sometimes a memory space, so extract its execution space explicitly. */
+  using ExecSpace = TEST_EXECSPACE::execution_space;
+  int host_array[10];
+  Kokkos::View<int[10], Kokkos::AnonymousSpace> host_anon_stat_view(host_array);
+  Kokkos::View<int*, Kokkos::AnonymousSpace> host_anon_dyn_view(host_array, 10);
+  Kokkos::View<int*, Kokkos::HostSpace> host_view("host_view", 10);
+  Kokkos::View<int*, Kokkos::AnonymousSpace> host_anon_assign_view = host_view;
+  for (int i = 0; i < 10; ++i) {
+    host_anon_stat_view(i) = host_anon_dyn_view(i) = 142;
+    host_anon_assign_view(i) = 142;
+  }
+  Kokkos::View<int**, Kokkos::LayoutRight, ExecSpace> d_view("d_view", 100, 10);
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, int>(0, 100), KOKKOS_LAMBDA(int i) {
+    int* ptr = &(d_view(i, 0));
+    Kokkos::View<int[10], Kokkos::AnonymousSpace> d_anon_stat_view(ptr);
+    Kokkos::View<int*, Kokkos::AnonymousSpace> d_anon_dyn_view(ptr, 10);
+    auto sub = Kokkos::subview(d_view, i, Kokkos::ALL());
+    Kokkos::View<int*, Kokkos::AnonymousSpace> d_anon_assign_view = sub;
+    for (int j = 0; j < 10; ++j) {
+      d_anon_stat_view(j) = 50;
+      d_anon_assign_view(j) += 50;
+      d_anon_dyn_view(j) += 42;
+    }
+  });
+#endif
+}
+
+TEST_F( TEST_CATEGORY, anonymous_space )
+{
+  test_anonymous_space();
+}
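+
+// Editorial note (exposition only): Kokkos::AnonymousSpace erases the memory
+// space from the view type, so the same view type can wrap a raw pointer to
+// host memory (host_array above) or to device memory (the pointer taken
+// inside the device lambda), provided it is only dereferenced where that
+// memory is actually accessible.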
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp b/packages/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0b880521293d47e465f6a898e08f4188e35b2f35
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp
@@ -0,0 +1,161 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <type_traits>
+#include <typeinfo>
+
+namespace Test {
+
+namespace {
+
+template <typename ExecSpace >
+struct TestViewCtorProp_EmbeddedDim {
+
+  using ViewIntType    = typename Kokkos::View< int**, ExecSpace >;
+  using ViewDoubleType = typename Kokkos::View< double*, ExecSpace >;
+
+  // Cuda 7.0 has issues with using a lambda in parallel_for to initialize the view, so use this functor instead.
+  template < class ViewType >
+  struct Functor {
+
+    ViewType v;
+
+    Functor( const ViewType & v_ ) : v(v_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const int i ) const {
+      v(i) = i;
+    }
+
+  };
+
+
+  static void test_vcpt( const int N0, const int N1 )
+  {
+
+    // Create views to test
+    {
+      using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType ;
+      using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType ;
+
+      VIT vi1("vi1", N0, N1);
+      VDT vd1("vd1", N0);
+
+      // Test the common type of two views, one with value type double and the
+      // other with value type int: deduce the common value_type and construct
+      // a view of that type.
+      {
+        // Two views
+        auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1);
+        typedef typename decltype( view_alloc_arg )::value_type                    CommonViewValueType;
+        typedef typename Kokkos::View< CommonViewValueType*, ExecSpace >  CVT;
+        typedef typename CVT::HostMirror                                           HostCVT;
+
+        // Construct a View using the common type; in the specialized case, an 'embedded_dim' would be stored by view_alloc_arg.
+        CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
+
+        Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), 
+          Functor<CVT>(cv1)
+        );
+
+        HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
+        Kokkos::deep_copy( hcv1, cv1 );
+
+        ASSERT_EQ( ( std::is_same< CommonViewValueType, double >::value ), true );
+        ASSERT_EQ( ( std::is_same< typename decltype( view_alloc_arg )::scalar_array_type, CommonViewValueType >::value ), true );
+        #if 0
+        // Debug output.
+        for ( int i = 0; i < N0*N1; ++i ) {
+          printf( " Output check: hcv1(%d) = %lf\n ", i, hcv1(i) );
+        }
+
+        printf( " Common value type view: %s \n", typeid( CVT() ).name() );
+        printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() );
+        if ( std::is_same< CommonViewValueType, double >::value == true ) {
+          printf( "Proper common value_type\n" );
+        }
+        else {
+          printf( "WRONG common value_type\n" );
+        }
+        // End debug output.
+        #endif
+      }
+
+      {
+        // Single view
+        auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1);
+        typedef typename decltype( view_alloc_arg )::value_type                    CommonViewValueType;
+        typedef typename Kokkos::View< CommonViewValueType*, ExecSpace >  CVT;
+        typedef typename CVT::HostMirror                                           HostCVT;
+
+        // Construct a View using the common type; in the specialized case, an 'embedded_dim' would be stored by view_alloc_arg.
+        CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
+
+        Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), 
+          Functor<CVT>(cv1)
+        );
+
+        HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
+        Kokkos::deep_copy( hcv1, cv1 );
+
+        ASSERT_EQ( ( std::is_same< CommonViewValueType, int >::value ), true );
+      }
+
+    }
+
+  } // end test_vcpt
+
+}; // end struct
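+
+// Editorial sketch (exposition only): common_view_alloc_prop deduces a common
+// value_type from its view arguments, so combining the int and double views
+// above yields double, in the spirit of std::common_type:
+static_assert( std::is_same< std::common_type< int, double >::type, double >::value,
+               "illustrative only: the common type of int and double is double" );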
+
+} // namespace
+
+TEST_F( TEST_CATEGORY , viewctorprop_embedded_dim ) {
+  TestViewCtorProp_EmbeddedDim< TEST_EXECSPACE >::test_vcpt( 2, 3 );
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ff3f3cd4284da9e0d4d888f31a465fe0f741c615
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
@@ -0,0 +1,1204 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+template< class Space >
+void test_view_mapping()
+{
+  typedef typename Space::execution_space ExecSpace;
+
+  typedef Kokkos::Impl::ViewDimension<>  dim_0;
+  typedef Kokkos::Impl::ViewDimension< 2 > dim_s2;
+  typedef Kokkos::Impl::ViewDimension< 2, 3 > dim_s2_s3;
+  typedef Kokkos::Impl::ViewDimension< 2, 3, 4 > dim_s2_s3_s4;
+
+  typedef Kokkos::Impl::ViewDimension< 0 > dim_s0;
+  typedef Kokkos::Impl::ViewDimension< 0, 3 > dim_s0_s3;
+  typedef Kokkos::Impl::ViewDimension< 0, 3, 4 > dim_s0_s3_s4;
+
+  typedef Kokkos::Impl::ViewDimension< 0, 0 > dim_s0_s0;
+  typedef Kokkos::Impl::ViewDimension< 0, 0, 4 > dim_s0_s0_s4;
+
+  typedef Kokkos::Impl::ViewDimension< 0, 0, 0 > dim_s0_s0_s0;
+  typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0 > dim_s0_s0_s0_s0;
+  typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0;
+  typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0;
+  typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0_s0;
+  typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0_s0_s0;
+
+  // Fully static dimensions should not be larger than an int.
+  ASSERT_LE( sizeof( dim_0 ), sizeof( int ) );
+  ASSERT_LE( sizeof( dim_s2 ), sizeof( int ) );
+  ASSERT_LE( sizeof( dim_s2_s3 ), sizeof( int ) );
+  ASSERT_LE( sizeof( dim_s2_s3_s4 ), sizeof( int ) );
+
+  // Rank 1 is size_t.
+  ASSERT_EQ( sizeof( dim_s0 ), sizeof( size_t ) );
+  ASSERT_EQ( sizeof( dim_s0_s3 ), sizeof( size_t ) );
+  ASSERT_EQ( sizeof( dim_s0_s3_s4 ), sizeof( size_t ) );
+
+  // Allow for padding.
+  ASSERT_LE( sizeof( dim_s0_s0 ), 2 * sizeof( size_t ) );
+  ASSERT_LE( sizeof( dim_s0_s0_s4 ), 2 * sizeof( size_t ) );
+
+  ASSERT_LE( sizeof( dim_s0_s0_s0 ), 4 * sizeof( size_t ) );
+  ASSERT_EQ( sizeof( dim_s0_s0_s0_s0 ), 4 * sizeof( unsigned ) );
+  ASSERT_LE( sizeof( dim_s0_s0_s0_s0_s0 ), 6 * sizeof( unsigned ) );
+  ASSERT_EQ( sizeof( dim_s0_s0_s0_s0_s0_s0 ), 6 * sizeof( unsigned ) );
+  ASSERT_LE( sizeof( dim_s0_s0_s0_s0_s0_s0_s0 ), 8 * sizeof( unsigned ) );
+  ASSERT_EQ( sizeof( dim_s0_s0_s0_s0_s0_s0_s0_s0 ), 8 * sizeof( unsigned ) );
+
+  static_assert( int( dim_0::rank ) == int( 0 ), "" );
+  static_assert( int( dim_0::rank_dynamic ) == int( 0 ), "" );
+  static_assert( int( dim_0::ArgN0 ) == 1, "" );
+  static_assert( int( dim_0::ArgN1 ) == 1, "" );
+  static_assert( int( dim_0::ArgN2 ) == 1, "" );
+
+  static_assert( int( dim_s2::rank ) == int( 1 ), "" );
+  static_assert( int( dim_s2::rank_dynamic ) == int( 0 ), "" );
+  static_assert( int( dim_s2::ArgN0 ) == 2, "" );
+  static_assert( int( dim_s2::ArgN1 ) == 1, "" );
+
+  static_assert( int( dim_s2_s3::rank ) == int( 2 ), "" );
+  static_assert( int( dim_s2_s3::rank_dynamic ) == int( 0 ), "" );
+  static_assert( int( dim_s2_s3::ArgN0 ) == 2, "" );
+  static_assert( int( dim_s2_s3::ArgN1 ) == 3, "" );
+  static_assert( int( dim_s2_s3::ArgN2 ) == 1, "" );
+
+  static_assert( int( dim_s2_s3_s4::rank ) == int( 3 ), "" );
+  static_assert( int( dim_s2_s3_s4::rank_dynamic ) == int( 0 ), "" );
+  static_assert( int( dim_s2_s3_s4::ArgN0 ) == 2, "" );
+  static_assert( int( dim_s2_s3_s4::ArgN1 ) == 3, "" );
+  static_assert( int( dim_s2_s3_s4::ArgN2 ) == 4, "" );
+  static_assert( int( dim_s2_s3_s4::ArgN3 ) == 1, "" );
+
+  static_assert( int( dim_s0::rank ) == int( 1 ), "" );
+  static_assert( int( dim_s0::rank_dynamic ) == int( 1 ), "" );
+
+  static_assert( int( dim_s0_s3::rank ) == int( 2 ), "" );
+  static_assert( int( dim_s0_s3::rank_dynamic ) == int( 1 ), "" );
+  static_assert( int( dim_s0_s3::ArgN0 ) == 0, "" );
+  static_assert( int( dim_s0_s3::ArgN1 ) == 3, "" );
+
+  static_assert( int( dim_s0_s3_s4::rank ) == int( 3 ), "" );
+  static_assert( int( dim_s0_s3_s4::rank_dynamic ) == int( 1 ), "" );
+  static_assert( int( dim_s0_s3_s4::ArgN0 ) == 0, "" );
+  static_assert( int( dim_s0_s3_s4::ArgN1 ) == 3, "" );
+  static_assert( int( dim_s0_s3_s4::ArgN2 ) == 4, "" );
+
+  static_assert( int( dim_s0_s0_s4::rank ) == int( 3 ), "" );
+  static_assert( int( dim_s0_s0_s4::rank_dynamic ) == int( 2 ), "" );
+  static_assert( int( dim_s0_s0_s4::ArgN0 ) == 0, "" );
+  static_assert( int( dim_s0_s0_s4::ArgN1 ) == 0, "" );
+  static_assert( int( dim_s0_s0_s4::ArgN2 ) == 4, "" );
+
+  static_assert( int( dim_s0_s0_s0::rank ) == int( 3 ), "" );
+  static_assert( int( dim_s0_s0_s0::rank_dynamic ) == int( 3 ), "" );
+
+  static_assert( int( dim_s0_s0_s0_s0::rank ) == int( 4 ), "" );
+  static_assert( int( dim_s0_s0_s0_s0::rank_dynamic ) == int( 4 ), "" );
+
+  static_assert( int( dim_s0_s0_s0_s0_s0::rank ) == int( 5 ), "" );
+  static_assert( int( dim_s0_s0_s0_s0_s0::rank_dynamic ) == int( 5 ), "" );
+
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0::rank ) == int( 6 ), "" );
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0::rank_dynamic ) == int( 6 ), "" );
+
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0_s0::rank ) == int( 7 ), "" );
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic ) == int( 7 ), "" );
+
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0_s0_s0::rank ) == int( 8 ), "" );
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic ) == int( 8 ), "" );
+
+  dim_s0          d1( 2, 3, 4, 5, 6, 7, 8, 9 );
+  dim_s0_s0       d2( 2, 3, 4, 5, 6, 7, 8, 9 );
+  dim_s0_s0_s0    d3( 2, 3, 4, 5, 6, 7, 8, 9 );
+  dim_s0_s0_s0_s0 d4( 2, 3, 4, 5, 6, 7, 8, 9 );
+
+  ASSERT_EQ( d1.N0, 2 );
+  ASSERT_EQ( d2.N0, 2 );
+  ASSERT_EQ( d3.N0, 2 );
+  ASSERT_EQ( d4.N0, 2 );
+
+  ASSERT_EQ( d1.N1, 1 );
+  ASSERT_EQ( d2.N1, 3 );
+  ASSERT_EQ( d3.N1, 3 );
+  ASSERT_EQ( d4.N1, 3 );
+
+  ASSERT_EQ( d1.N2, 1 );
+  ASSERT_EQ( d2.N2, 1 );
+  ASSERT_EQ( d3.N2, 4 );
+  ASSERT_EQ( d4.N2, 4 );
+
+  ASSERT_EQ( d1.N3, 1 );
+  ASSERT_EQ( d2.N3, 1 );
+  ASSERT_EQ( d3.N3, 1 );
+  ASSERT_EQ( d4.N3, 5 );
+
+  //----------------------------------------
+
+  typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s0, Kokkos::LayoutStride > stride_s0_s0_s0;
+
+  //----------------------------------------
+  // Static dimension.
+  {
+    typedef Kokkos::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutLeft > left_s2_s3_s4;
+
+    ASSERT_EQ( sizeof( left_s2_s3_s4 ), sizeof( dim_s2_s3_s4 ) );
+
+    left_s2_s3_s4 off3;
+
+    stride_s0_s0_s0 stride3( off3 );
+
+    ASSERT_EQ( off3.stride_0(), 1 );
+    ASSERT_EQ( off3.stride_1(), 2 );
+    ASSERT_EQ( off3.stride_2(), 6 );
+    ASSERT_EQ( off3.span(), 24 );
+
+    ASSERT_EQ( off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( off3.stride_2(), stride3.stride_2() );
+    ASSERT_EQ( off3.span(), stride3.span() );
+
+    int offset = 0;
+
+    for ( int k = 0; k < 4; ++k )
+    for ( int j = 0; j < 3; ++j )
+    for ( int i = 0; i < 2; ++i, ++offset )
+    {
+      ASSERT_EQ( off3( i, j, k ), offset );
+      ASSERT_EQ( stride3( i, j, k ), off3( i, j, k ) );
+    }
+  }
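+
+  // Editorial note (exposition only): for this unpadded LayoutLeft mapping the
+  // strides are running products of the leading extents, i.e.
+  // offset( i, j, k ) = i + 2 * ( j + 3 * k ) for the 2 x 3 x 4 case, which is
+  // why the innermost i loop above visits consecutive offsets.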
+
+  //----------------------------------------
+  // Small dimension is unpadded.
+  {
+    typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4;
+
+    left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
+                          , Kokkos::LayoutLeft( 2, 3, 0, 0, 0, 0, 0, 0 ) );
+
+    stride_s0_s0_s0  stride3( dyn_off3 );
+
+    ASSERT_EQ( dyn_off3.m_dim.rank, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0, 2 );
+    ASSERT_EQ( dyn_off3.m_dim.N1, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N2, 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3, 1 );
+    ASSERT_EQ( dyn_off3.size(), 2 * 3 * 4 );
+
+    const Kokkos::LayoutLeft layout = dyn_off3.layout();
+
+    ASSERT_EQ( layout.dimension[0], 2 );
+    ASSERT_EQ( layout.dimension[1], 3 );
+    ASSERT_EQ( layout.dimension[2], 4 );
+    ASSERT_EQ( layout.dimension[3], 1 );
+    ASSERT_EQ( layout.dimension[4], 1 );
+    ASSERT_EQ( layout.dimension[5], 1 );
+    ASSERT_EQ( layout.dimension[6], 1 );
+    ASSERT_EQ( layout.dimension[7], 1 );
+
+    ASSERT_EQ( stride3.m_dim.rank, 3 );
+    ASSERT_EQ( stride3.m_dim.N0, 2 );
+    ASSERT_EQ( stride3.m_dim.N1, 3 );
+    ASSERT_EQ( stride3.m_dim.N2, 4 );
+    ASSERT_EQ( stride3.m_dim.N3, 1 );
+    ASSERT_EQ( stride3.size(), 2 * 3 * 4 );
+
+    int offset = 0;
+
+    for ( int k = 0; k < 4; ++k )
+    for ( int j = 0; j < 3; ++j )
+    for ( int i = 0; i < 2; ++i, ++offset )
+    {
+      ASSERT_EQ( offset, dyn_off3( i, j, k ) );
+      ASSERT_EQ( stride3( i, j, k ), dyn_off3( i, j, k ) );
+    }
+
+    ASSERT_EQ( dyn_off3.span(), offset );
+    ASSERT_EQ( stride3.span(), dyn_off3.span() );
+  }
+
+  //----------------------------------------
+  // Large dimension is likely padded.
+  {
+    constexpr int N0 = 2000;
+    constexpr int N1 = 300;
+
+    typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4;
+
+    left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
+                          , Kokkos::LayoutLeft( N0, N1, 0, 0, 0, 0, 0, 0 ) );
+
+    stride_s0_s0_s0  stride3( dyn_off3 );
+
+    ASSERT_EQ( dyn_off3.m_dim.rank, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0, N0 );
+    ASSERT_EQ( dyn_off3.m_dim.N1, N1 );
+    ASSERT_EQ( dyn_off3.m_dim.N2, 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3, 1 );
+    ASSERT_EQ( dyn_off3.size(), N0 * N1 * 4 );
+
+    ASSERT_EQ( stride3.m_dim.rank, 3 );
+    ASSERT_EQ( stride3.m_dim.N0, N0 );
+    ASSERT_EQ( stride3.m_dim.N1, N1 );
+    ASSERT_EQ( stride3.m_dim.N2, 4 );
+    ASSERT_EQ( stride3.m_dim.N3, 1 );
+    ASSERT_EQ( stride3.size(), N0 * N1 * 4 );
+    ASSERT_EQ( stride3.span(), dyn_off3.span() );
+
+    int offset = 0;
+
+    for ( int k = 0; k < 4; ++k )
+    for ( int j = 0; j < N1; ++j )
+    for ( int i = 0; i < N0; ++i )
+    {
+      ASSERT_LE( offset, dyn_off3( i, j, k ) );
+      ASSERT_EQ( stride3( i, j, k ), dyn_off3( i, j, k ) );
+      offset = dyn_off3( i, j, k ) + 1;
+    }
+
+    ASSERT_LE( offset, dyn_off3.span() );
+  }
+
+  //----------------------------------------
+  // Static dimension.
+  {
+    typedef Kokkos::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutRight > right_s2_s3_s4;
+
+    ASSERT_EQ( sizeof( right_s2_s3_s4 ), sizeof( dim_s2_s3_s4 ) );
+
+    right_s2_s3_s4 off3;
+
+    stride_s0_s0_s0  stride3( off3 );
+
+    ASSERT_EQ( off3.stride_0(), 12 );
+    ASSERT_EQ( off3.stride_1(), 4 );
+    ASSERT_EQ( off3.stride_2(), 1 );
+
+    ASSERT_EQ( off3.dimension_0(), stride3.dimension_0() );
+    ASSERT_EQ( off3.dimension_1(), stride3.dimension_1() );
+    ASSERT_EQ( off3.dimension_2(), stride3.dimension_2() );
+    ASSERT_EQ( off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( off3.stride_2(), stride3.stride_2() );
+    ASSERT_EQ( off3.span(), stride3.span() );
+
+    int offset = 0;
+
+    for ( int i = 0; i < 2; ++i )
+    for ( int j = 0; j < 3; ++j )
+    for ( int k = 0; k < 4; ++k, ++offset )
+    {
+      ASSERT_EQ( off3( i, j, k ), offset );
+      ASSERT_EQ( off3( i, j, k ), stride3( i, j, k ) );
+    }
+
+    ASSERT_EQ( off3.span(), offset );
+  }
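+
+  // Editorial note (exposition only): LayoutRight reverses the role of the
+  // extents, so here offset( i, j, k ) = k + 4 * ( j + 3 * i ), giving
+  // stride_0 = 12, stride_1 = 4, stride_2 = 1 as asserted above; the innermost
+  // k loop is the contiguous one.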
+
+  //----------------------------------------
+  // Small dimension is unpadded.
+  {
+    typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4;
+
+    right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
+                           , Kokkos::LayoutRight( 2, 3, 0, 0, 0, 0, 0, 0 ) );
+
+    stride_s0_s0_s0  stride3( dyn_off3 );
+
+    ASSERT_EQ( dyn_off3.m_dim.rank, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0, 2 );
+    ASSERT_EQ( dyn_off3.m_dim.N1, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N2, 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3, 1 );
+    ASSERT_EQ( dyn_off3.size(), 2 * 3 * 4 );
+
+    ASSERT_EQ( dyn_off3.dimension_0(), stride3.dimension_0() );
+    ASSERT_EQ( dyn_off3.dimension_1(), stride3.dimension_1() );
+    ASSERT_EQ( dyn_off3.dimension_2(), stride3.dimension_2() );
+    ASSERT_EQ( dyn_off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2(), stride3.stride_2() );
+    ASSERT_EQ( dyn_off3.span(), stride3.span() );
+
+    int offset = 0;
+
+    for ( int i = 0; i < 2; ++i )
+    for ( int j = 0; j < 3; ++j )
+    for ( int k = 0; k < 4; ++k, ++offset )
+    {
+      ASSERT_EQ( offset, dyn_off3( i, j, k ) );
+      ASSERT_EQ( dyn_off3( i, j, k ), stride3( i, j, k ) );
+    }
+
+    ASSERT_EQ( dyn_off3.span(), offset );
+  }
+
+  //----------------------------------------
+  // Large dimension is likely padded.
+  {
+    constexpr int N0 = 2000;
+    constexpr int N1 = 300;
+
+    typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4;
+
+    right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
+                           , Kokkos::LayoutRight( N0, N1, 0, 0, 0, 0, 0, 0 ) );
+
+    stride_s0_s0_s0  stride3( dyn_off3 );
+
+    ASSERT_EQ( dyn_off3.m_dim.rank, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0, N0 );
+    ASSERT_EQ( dyn_off3.m_dim.N1, N1 );
+    ASSERT_EQ( dyn_off3.m_dim.N2, 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3, 1 );
+    ASSERT_EQ( dyn_off3.size(), N0 * N1 * 4 );
+
+    ASSERT_EQ( dyn_off3.dimension_0(), stride3.dimension_0() );
+    ASSERT_EQ( dyn_off3.dimension_1(), stride3.dimension_1() );
+    ASSERT_EQ( dyn_off3.dimension_2(), stride3.dimension_2() );
+    ASSERT_EQ( dyn_off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2(), stride3.stride_2() );
+    ASSERT_EQ( dyn_off3.span(), stride3.span() );
+
+    int offset = 0;
+
+    for ( int i = 0; i < N0; ++i )
+    for ( int j = 0; j < N1; ++j )
+    for ( int k = 0; k < 4; ++k )
+    {
+      ASSERT_LE( offset, dyn_off3( i, j, k ) );
+      ASSERT_EQ( dyn_off3( i, j, k ), stride3( i, j, k ) );
+      offset = dyn_off3( i, j, k ) + 1;
+    }
+
+    ASSERT_LE( offset, dyn_off3.span() );
+  }
+
+  //----------------------------------------
+  // Subview.
+  {
+    // Mapping rank 4 to rank 3
+    typedef Kokkos::Impl::SubviewExtents< 4, 3 > SubviewExtents;
+
+    constexpr int N0 = 1000;
+    constexpr int N1 = 2000;
+    constexpr int N2 = 3000;
+    constexpr int N3 = 4000;
+
+    Kokkos::Impl::ViewDimension< N0, N1, N2, N3 > dim;
+
+    SubviewExtents tmp( dim
+                      , N0 / 2
+                      , Kokkos::ALL
+                      , std::pair< int, int >( N2 / 4, 10 + N2 / 4 )
+                      , Kokkos::pair< int, int >( N3 / 4, 20 + N3 / 4 )
+                      );
+
+    ASSERT_EQ( tmp.domain_offset( 0 ), N0 / 2 );
+    ASSERT_EQ( tmp.domain_offset( 1 ), 0 );
+    ASSERT_EQ( tmp.domain_offset( 2 ), N2 / 4 );
+    ASSERT_EQ( tmp.domain_offset( 3 ), N3 / 4 );
+
+    ASSERT_EQ( tmp.range_index( 0 ), 1 );
+    ASSERT_EQ( tmp.range_index( 1 ), 2 );
+    ASSERT_EQ( tmp.range_index( 2 ), 3 );
+
+    ASSERT_EQ( tmp.range_extent( 0 ), N1 );
+    ASSERT_EQ( tmp.range_extent( 1 ), 10 );
+    ASSERT_EQ( tmp.range_extent( 2 ), 20 );
+  }
+
+  {
+    constexpr int N0 = 2000;
+    constexpr int N1 = 300;
+
+    constexpr int sub_N0 = 1000;
+    constexpr int sub_N1 = 200;
+    constexpr int sub_N2 = 4;
+
+    typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4;
+
+    left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
+                          , Kokkos::LayoutLeft( N0, N1, 0, 0, 0, 0, 0, 0 ) );
+
+    Kokkos::Impl::SubviewExtents< 3, 3 >
+      sub( dyn_off3.m_dim
+         , Kokkos::pair< int, int >( 0, sub_N0 )
+         , Kokkos::pair< int, int >( 0, sub_N1 )
+         , Kokkos::pair< int, int >( 0, sub_N2 )
+         );
+
+    stride_s0_s0_s0  stride3( dyn_off3, sub );
+
+    ASSERT_EQ( stride3.dimension_0(), sub_N0 );
+    ASSERT_EQ( stride3.dimension_1(), sub_N1 );
+    ASSERT_EQ( stride3.dimension_2(), sub_N2 );
+    ASSERT_EQ( stride3.size(), sub_N0 * sub_N1 * sub_N2 );
+
+    ASSERT_EQ( dyn_off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2(), stride3.stride_2() );
+    ASSERT_GE( dyn_off3.span()    , stride3.span() );
+
+    for ( int k = 0; k < sub_N2; ++k )
+    for ( int j = 0; j < sub_N1; ++j )
+    for ( int i = 0; i < sub_N0; ++i )
+    {
+      ASSERT_EQ( stride3( i, j, k ), dyn_off3( i, j, k ) );
+    }
+  }
+
+  {
+    constexpr int N0 = 2000;
+    constexpr int N1 = 300;
+
+    constexpr int sub_N0 = 1000;
+    constexpr int sub_N1 = 200;
+    constexpr int sub_N2 = 4;
+
+    typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4;
+
+    right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
+                           , Kokkos::LayoutRight( N0, N1, 0, 0, 0, 0, 0, 0 ) );
+
+    Kokkos::Impl::SubviewExtents< 3, 3 >
+      sub( dyn_off3.m_dim
+         , Kokkos::pair< int, int >( 0, sub_N0 )
+         , Kokkos::pair< int, int >( 0, sub_N1 )
+         , Kokkos::pair< int, int >( 0, sub_N2 )
+         );
+
+    stride_s0_s0_s0  stride3( dyn_off3, sub );
+
+    ASSERT_EQ( stride3.dimension_0(), sub_N0 );
+    ASSERT_EQ( stride3.dimension_1(), sub_N1 );
+    ASSERT_EQ( stride3.dimension_2(), sub_N2 );
+    ASSERT_EQ( stride3.size(), sub_N0 * sub_N1 * sub_N2 );
+
+    ASSERT_EQ( dyn_off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2(), stride3.stride_2() );
+    ASSERT_GE( dyn_off3.span()    , stride3.span() );
+
+    for ( int i = 0; i < sub_N0; ++i )
+    for ( int j = 0; j < sub_N1; ++j )
+    for ( int k = 0; k < sub_N2; ++k )
+    {
+      ASSERT_EQ( stride3( i, j, k ), dyn_off3( i, j, k ) );
+    }
+  }
+
+  //----------------------------------------
+  // View data analysis.
+  {
+    using namespace Kokkos::Impl;
+
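+    // A zero static extent marks a runtime-sized dimension, so rank_dynamic
+    // counts the leading zeros in the extent pack, as the asserts below show.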
+    static_assert( rank_dynamic<>::value == 0, "" );
+    static_assert( rank_dynamic< 1 >::value == 0, "" );
+    static_assert( rank_dynamic< 0 >::value == 1, "" );
+    static_assert( rank_dynamic< 0, 1 >::value == 1, "" );
+    static_assert( rank_dynamic< 0, 0, 1 >::value == 2, "" );
+  }
+
+  {
+    using namespace Kokkos::Impl;
+
+    typedef ViewArrayAnalysis< int[] >                 a_int_r1;
+    typedef ViewArrayAnalysis< int**[4][5][6] >        a_int_r5;
+    typedef ViewArrayAnalysis< const int[] >           a_const_int_r1;
+    typedef ViewArrayAnalysis< const int**[4][5][6] >  a_const_int_r5;
+
+    static_assert( a_int_r1::dimension::rank == 1, "" );
+    static_assert( a_int_r1::dimension::rank_dynamic == 1, "" );
+    static_assert( a_int_r5::dimension::ArgN0 == 0, "" );
+    static_assert( a_int_r5::dimension::ArgN1 == 0, "" );
+    static_assert( a_int_r5::dimension::ArgN2 == 4, "" );
+    static_assert( a_int_r5::dimension::ArgN3 == 5, "" );
+    static_assert( a_int_r5::dimension::ArgN4 == 6, "" );
+    static_assert( a_int_r5::dimension::ArgN5 == 1, "" );
+
+    static_assert( std::is_same< typename a_int_r1::dimension, ViewDimension<0> >::value, "" );
+    static_assert( std::is_same< typename a_int_r1::non_const_value_type, int >::value, "" );
+
+    static_assert( a_const_int_r1::dimension::rank == 1, "" );
+    static_assert( a_const_int_r1::dimension::rank_dynamic == 1, "" );
+    static_assert( std::is_same< typename a_const_int_r1::dimension, ViewDimension<0> >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::non_const_value_type, int >::value, "" );
+
+    static_assert( a_const_int_r5::dimension::rank == 5, "" );
+    static_assert( a_const_int_r5::dimension::rank_dynamic == 2, "" );
+
+    static_assert( a_const_int_r5::dimension::ArgN0 == 0, "" );
+    static_assert( a_const_int_r5::dimension::ArgN1 == 0, "" );
+    static_assert( a_const_int_r5::dimension::ArgN2 == 4, "" );
+    static_assert( a_const_int_r5::dimension::ArgN3 == 5, "" );
+    static_assert( a_const_int_r5::dimension::ArgN4 == 6, "" );
+    static_assert( a_const_int_r5::dimension::ArgN5 == 1, "" );
+
+    static_assert( std::is_same< typename a_const_int_r5::dimension, ViewDimension<0, 0, 4, 5, 6> >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r5::non_const_value_type, int >::value, "" );
+
+    static_assert( a_int_r5::dimension::rank == 5, "" );
+    static_assert( a_int_r5::dimension::rank_dynamic == 2, "" );
+    static_assert( std::is_same< typename a_int_r5::dimension, ViewDimension<0, 0, 4, 5, 6> >::value, "" );
+    static_assert( std::is_same< typename a_int_r5::non_const_value_type, int >::value, "" );
+  }
+
+  {
+    using namespace Kokkos::Impl;
+
+    typedef int t_i4[4];
+
+    // Dimensions of t_i4 are appended to the multidimensional array.
+    typedef ViewArrayAnalysis< t_i4 ***[3] > a_int_r5;
+
+    static_assert( a_int_r5::dimension::rank == 5, "" );
+    static_assert( a_int_r5::dimension::rank_dynamic == 3, "" );
+    static_assert( a_int_r5::dimension::ArgN0 == 0, "" );
+    static_assert( a_int_r5::dimension::ArgN1 == 0, "" );
+    static_assert( a_int_r5::dimension::ArgN2 == 0, "" );
+    static_assert( a_int_r5::dimension::ArgN3 == 3, "" );
+    static_assert( a_int_r5::dimension::ArgN4 == 4, "" );
+    static_assert( std::is_same< typename a_int_r5::non_const_value_type, int >::value, "" );
+  }
+
+  {
+    using namespace Kokkos::Impl;
+
+    typedef ViewDataAnalysis< const int[], void >  a_const_int_r1;
+
+    static_assert( std::is_same< typename a_const_int_r1::specialize, void >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::dimension, Kokkos::Impl::ViewDimension<0> >::value, "" );
+
+    static_assert( std::is_same< typename a_const_int_r1::type, const int * >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::value_type, const int >::value, "" );
+
+    static_assert( std::is_same< typename a_const_int_r1::scalar_array_type, const int * >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::const_type, const int * >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::const_value_type, const int >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::const_scalar_array_type, const int * >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::non_const_type, int * >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::non_const_value_type, int >::value, "" );
+
+    typedef ViewDataAnalysis< const int**[4], void >  a_const_int_r3;
+
+    static_assert( std::is_same< typename a_const_int_r3::specialize, void >::value, "" );
+
+    static_assert( std::is_same< typename a_const_int_r3::dimension, Kokkos::Impl::ViewDimension<0, 0, 4> >::value, "" );
+
+    static_assert( std::is_same< typename a_const_int_r3::type, const int**[4] >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::value_type, const int >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::scalar_array_type, const int**[4] >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::const_type, const int**[4] >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::const_value_type, const int >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::const_scalar_array_type, const int**[4] >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::non_const_type, int**[4] >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::non_const_value_type, int >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::non_const_scalar_array_type, int**[4] >::value, "" );
+
+    // std::cout << "typeid( const int**[4] ).name() = " << typeid( const int**[4] ).name() << std::endl;
+  }
+
+  //----------------------------------------
+
+  {
+    constexpr int N = 10;
+
+    typedef Kokkos::View< int*, Space >        T;
+    typedef Kokkos::View< const int*, Space >  C;
+
+    int data[N];
+
+    T vr1( data, N ); // View of non-const.
+    C cr1( vr1 );     // View of const from view of non-const.
+    C cr2( (const int *) data, N );
+
+    // Generate static_assert error:
+    // T tmp( cr1 );
+
+    ASSERT_EQ( vr1.span(), N );
+    ASSERT_EQ( cr1.span(), N );
+    ASSERT_EQ( vr1.data(), & data[0] );
+    ASSERT_EQ( cr1.data(), & data[0] );
+
+    ASSERT_TRUE( ( std::is_same< typename T::data_type          , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_data_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type, int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::scalar_array_type          , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_scalar_array_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_scalar_array_type, int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::value_type          , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_value_type    , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type, int >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::memory_space, typename Space::memory_space >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::reference_type, int & >::value ) );
+
+    ASSERT_EQ( T::Rank, 1 );
+
+    ASSERT_TRUE( ( std::is_same< typename C::data_type          , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::const_data_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::non_const_data_type, int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename C::scalar_array_type          , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::const_scalar_array_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::non_const_scalar_array_type, int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename C::value_type          , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::const_value_type    , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::non_const_value_type, int >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename C::memory_space, typename Space::memory_space >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::reference_type, const int & >::value ) );
+
+    ASSERT_EQ( C::Rank, 1 );
+
+    ASSERT_EQ( vr1.extent(0), N );
+
+    if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+      for ( int i = 0; i < N; ++i ) data[i] = i + 1;
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( vr1[i], i + 1 );
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( cr1[i], i + 1 );
+
+      {
+        T tmp( vr1 );
+
+        for ( int i = 0; i < N; ++i ) ASSERT_EQ( tmp[i], i + 1 );
+        for ( int i = 0; i < N; ++i ) vr1( i ) = i + 2;
+        for ( int i = 0; i < N; ++i ) ASSERT_EQ( tmp[i], i + 2 );
+      }
+
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( vr1[i], i + 2 );
+    }
+  }
+
+  {
+    constexpr int N = 10;
+    typedef Kokkos::View< int*, Space >        T;
+    typedef Kokkos::View< const int*, Space >  C;
+
+    T vr1( "vr1", N );
+    C cr1( vr1 );
+
+    ASSERT_TRUE( ( std::is_same< typename T::data_type          , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_data_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type, int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::scalar_array_type          , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_scalar_array_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_scalar_array_type, int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::value_type          , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_value_type    , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type, int >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::memory_space, typename Space::memory_space >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::reference_type, int & >::value ) );
+    ASSERT_EQ( T::Rank, 1 );
+
+    ASSERT_EQ( vr1.extent(0), N );
+
+    if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+      for ( int i = 0; i < N; ++i ) vr1( i ) = i + 1;
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( vr1[i], i + 1 );
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( cr1[i], i + 1 );
+
+      {
+        T tmp( vr1 );
+        for ( int i = 0; i < N; ++i ) ASSERT_EQ( tmp[i], i + 1 );
+        for ( int i = 0; i < N; ++i ) vr1( i ) = i + 2;
+        for ( int i = 0; i < N; ++i ) ASSERT_EQ( tmp[i], i + 2 );
+      }
+
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( vr1[i], i + 2 );
+    }
+  }
+
+  // Testing proper handling of zero-length allocations.
+  {
+    constexpr int N = 0;
+    typedef Kokkos::View< int*, Space >        T;
+    typedef Kokkos::View< const int*, Space >  C;
+
+    T vr1( "vr1", N );
+    C cr1( vr1 );
+
+    ASSERT_EQ( vr1.extent(0), 0 );
+    ASSERT_EQ( cr1.extent(0), 0 );
+  }
+
+  // Test allocation using a memory space instance.
+  // The execution space of the memory space must be available to initialize the view data.
+  if ( std::is_same< ExecSpace, typename ExecSpace::memory_space::execution_space >::value ) {
+
+    using namespace Kokkos;
+
+    typedef typename ExecSpace::memory_space  memory_space;
+    typedef View< int*, memory_space >        V;
+
+    constexpr int N = 10;
+
+    memory_space mem_space;
+
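+    // view_alloc() bundles the label, the allocation properties ( WithoutInitializing,
+    // AllowPadding ), and an optional memory space instance; the variants below
+    // exercise different argument orders and subsets.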
+    V v( "v", N );
+    V va( view_alloc(), N );
+    V vb( view_alloc( "vb" ), N );
+    V vc( view_alloc( "vc", AllowPadding ), N );
+    V vd( view_alloc( "vd", WithoutInitializing ), N );
+    V ve( view_alloc( "ve", WithoutInitializing, AllowPadding ), N );
+    V vf( view_alloc( "vf", mem_space, WithoutInitializing, AllowPadding ), N );
+    V vg( view_alloc( mem_space, "vg", WithoutInitializing, AllowPadding ), N );
+    V vh( view_alloc( WithoutInitializing, AllowPadding ), N );
+    V vi( view_alloc( WithoutInitializing ), N );
+    V vj( view_alloc( std::string( "vj" ), AllowPadding ), N );
+    V vk( view_alloc( mem_space, std::string( "vk" ), AllowPadding ), N );
+  }
+
+  {
+    typedef Kokkos::ViewTraits< int***, Kokkos::LayoutStride, ExecSpace >           traits_t;
+    typedef Kokkos::Impl::ViewDimension< 0, 0, 0 >                    dims_t;
+    typedef Kokkos::Impl::ViewOffset< dims_t, Kokkos::LayoutStride >  offset_t;
+
+    Kokkos::LayoutStride stride;
+
+    stride.dimension[0] = 3;
+    stride.dimension[1] = 4;
+    stride.dimension[2] = 5;
+    stride.stride[0] = 4;
+    stride.stride[1] = 1;
+    stride.stride[2] = 12;
+
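+    // Dimensions ( 3, 4, 5 ) with strides ( 4, 1, 12 ) describe a permuted but
+    // gap-free layout, so the span is 3 * 4 * 5 = 60 and remains contiguous.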
+    const offset_t offset( std::integral_constant< unsigned, 0 >(), stride );
+
+    ASSERT_EQ( offset.dimension_0(), 3 );
+    ASSERT_EQ( offset.dimension_1(), 4 );
+    ASSERT_EQ( offset.dimension_2(), 5 );
+
+    ASSERT_EQ( offset.stride_0(), 4 );
+    ASSERT_EQ( offset.stride_1(), 1 );
+    ASSERT_EQ( offset.stride_2(), 12 );
+
+    ASSERT_EQ( offset.span(), 60 );
+    ASSERT_TRUE( offset.span_is_contiguous() );
+
+    Kokkos::Impl::ViewMapping< traits_t, void >
+      v( Kokkos::Impl::ViewCtorProp< int* >( (int*) 0 ), stride );
+  }
+
+  {
+    typedef Kokkos::View< int**, Space > V;
+    typedef typename V::HostMirror M;
+    typedef typename Kokkos::View< int**, Space >::array_layout layout_type;
+
+    constexpr int N0 = 10;
+    constexpr int N1 = 11;
+
+    V a( "a", N0, N1 );
+    M b = Kokkos::create_mirror( a );
+    M c = Kokkos::create_mirror_view( a );
+    M d;
+
+    for ( int i0 = 0; i0 < N0; ++i0 )
+    for ( int i1 = 0; i1 < N1; ++i1 )
+    {
+      b( i0, i1 ) = 1 + i0 + i1 * N0;
+    }
+
+    Kokkos::deep_copy( a, b );
+    Kokkos::deep_copy( c, a );
+
+    for ( int i0 = 0; i0 < N0; ++i0 )
+    for ( int i1 = 0; i1 < N1; ++i1 )
+    {
+      ASSERT_EQ( b( i0, i1 ), c( i0, i1 ) );
+    }
+
+    Kokkos::resize( b, 5, 6 );
+
+    for ( int i0 = 0; i0 < 5; ++i0 )
+    for ( int i1 = 0; i1 < 6; ++i1 )
+    {
+      int val = 1 + i0 + i1 * N0;
+      ASSERT_EQ( b( i0, i1 ), c( i0, i1 ) );
+      ASSERT_EQ( b( i0, i1 ), val );
+    }
+
+    Kokkos::realloc( c, 5, 6 );
+    Kokkos::realloc( d, 5, 6 );
+
+    ASSERT_EQ( b.extent(0), 5 );
+    ASSERT_EQ( b.extent(1), 6 );
+    ASSERT_EQ( c.extent(0), 5 );
+    ASSERT_EQ( c.extent(1), 6 );
+    ASSERT_EQ( d.extent(0), 5 );
+    ASSERT_EQ( d.extent(1), 6 );
+
+    layout_type layout( 7, 8 );
+    Kokkos::resize( b, layout );
+    for ( int i0 = 0; i0 < 7; ++i0 )
+    for ( int i1 = 6; i1 < 8; ++i1 )
+    {
+      b( i0, i1 ) = 1 + i0 + i1 * N0;
+    }
+
+    for ( int i0 = 5; i0 < 7; ++i0 )
+    for ( int i1 = 0; i1 < 8; ++i1 )
+    {
+      b( i0, i1 ) = 1 + i0 + i1 * N0;
+    }
+
+    for ( int i0 = 0; i0 < 7; ++i0 )
+    for ( int i1 = 0; i1 < 8; ++i1 )
+    {
+       int val = 1 + i0 + i1 * N0;
+       ASSERT_EQ( b( i0, i1 ), val );
+    }
+
+    Kokkos::realloc( c, layout );
+    Kokkos::realloc( d, layout );
+
+    ASSERT_EQ( b.extent(0), 7 );
+    ASSERT_EQ( b.extent(1), 8 );
+    ASSERT_EQ( c.extent(0), 7 );
+    ASSERT_EQ( c.extent(1), 8 );
+    ASSERT_EQ( d.extent(0), 7 );
+    ASSERT_EQ( d.extent(1), 8 );
+  }
+
+  {
+    typedef Kokkos::View< int**, Kokkos::LayoutStride, Space > V;
+    typedef typename V::HostMirror M;
+    typedef typename Kokkos::View< int**, Kokkos::LayoutStride, Space >::array_layout layout_type;
+
+    constexpr int N0 = 10;
+    constexpr int N1 = 11;
+
+    const int dimensions[] = { N0, N1 };
+    const int order[] = { 1, 0 };
+
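+    // order_dimensions() gives dimension order[0] the unit stride; with
+    // order { 1, 0 } the second index varies fastest ( LayoutRight-like ).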
+    V a( "a", Kokkos::LayoutStride::order_dimensions( 2, order, dimensions ) );
+    M b = Kokkos::create_mirror( a );
+    M c = Kokkos::create_mirror_view( a );
+    M d;
+
+    for ( int i0 = 0; i0 < N0; ++i0 )
+    for ( int i1 = 0; i1 < N1; ++i1 )
+    {
+      b( i0, i1 ) = 1 + i0 + i1 * N0;
+    }
+
+    Kokkos::deep_copy( a, b );
+    Kokkos::deep_copy( c, a );
+
+    for ( int i0 = 0; i0 < N0; ++i0 )
+    for ( int i1 = 0; i1 < N1; ++i1 )
+    {
+      ASSERT_EQ( b( i0, i1 ), c( i0, i1 ) );
+    }
+
+    const int dimensions2[] = { 7, 8 };
+    const int order2[] = { 1, 0 };
+    layout_type layout = layout_type::order_dimensions( 2, order2, dimensions2 );
+    Kokkos::resize( b, layout );
+
+    for ( int i0 = 0; i0 < 7; ++i0 )
+    for ( int i1 = 0; i1 < 8; ++i1 )
+    {
+       int val = 1 + i0 + i1 * N0;
+       ASSERT_EQ( b( i0, i1 ), c( i0, i1 ) );
+       ASSERT_EQ( b( i0, i1 ), val );
+    }
+
+    Kokkos::realloc( c, layout );
+    Kokkos::realloc( d, layout );
+
+    ASSERT_EQ( b.extent(0), 7 );
+    ASSERT_EQ( b.extent(1), 8 );
+    ASSERT_EQ( c.extent(0), 7 );
+    ASSERT_EQ( c.extent(1), 8 );
+    ASSERT_EQ( d.extent(0), 7 );
+    ASSERT_EQ( d.extent(1), 8 );
+
+  }
+
+  {
+    typedef Kokkos::View< int*, Space > V;
+    typedef Kokkos::View< int*, Space, Kokkos::MemoryUnmanaged > U;
+
+    V a( "a", 10 );
+
+    ASSERT_EQ( a.use_count(), 1 );
+
+    V b = a;
+
+    ASSERT_EQ( a.use_count(), 2 );
+    ASSERT_EQ( b.use_count(), 2 );
+
+    {
+      U c = b; // 'c' is compile-time unmanaged.
+
+      ASSERT_EQ( a.use_count(), 2 );
+      ASSERT_EQ( b.use_count(), 2 );
+      ASSERT_EQ( c.use_count(), 2 );
+
+      V d = c; // 'd' is run-time unmanaged.
+
+      ASSERT_EQ( a.use_count(), 2 );
+      ASSERT_EQ( b.use_count(), 2 );
+      ASSERT_EQ( c.use_count(), 2 );
+      ASSERT_EQ( d.use_count(), 2 );
+    }
+
+    ASSERT_EQ( a.use_count(), 2 );
+    ASSERT_EQ( b.use_count(), 2 );
+
+    b = V();
+
+    ASSERT_EQ( a.use_count(), 1 );
+    ASSERT_EQ( b.use_count(), 0 );
+
+#if !defined( KOKKOS_ENABLE_CUDA_LAMBDA ) && !defined( KOKKOS_ENABLE_ROCM )
+    // Cannot launch host lambda when CUDA lambda is enabled.
+
+    typedef typename Kokkos::Impl::HostMirror< Space >::Space::execution_space host_exec_space;
+
+    Kokkos::parallel_for( Kokkos::RangePolicy< host_exec_space >( 0, 10 ), KOKKOS_LAMBDA ( int i ) {
+      // 'a' is captured by copy, and the capture mechanism converts 'a' to an
+      // unmanaged copy.  When the parallel dispatch accepts a move for the
+      // lambda, this count should become 1.
+ 
+      ASSERT_EQ( a.use_count(), 2 );
+      V x = a;
+      ASSERT_EQ( a.use_count(), 2 );
+      ASSERT_EQ( x.use_count(), 2 );
+    });
+#endif // !defined( KOKKOS_ENABLE_CUDA_LAMBDA ) && !defined( KOKKOS_ENABLE_ROCM )
+  }
+}
+
+TEST_F( TEST_CATEGORY , view_mapping )
+{
+   test_view_mapping< TEST_EXECSPACE >();
+}
+/*--------------------------------------------------------------------------*/
+
+template< class ViewType >
+struct TestViewMapOperator {
+
+  static_assert( ViewType::reference_type_is_lvalue_reference
+               , "Test only valid for lvalue reference type" );
+
+  const ViewType v;
+
+  KOKKOS_INLINE_FUNCTION
+  void test_left( size_t i0, long & error_count ) const
+  {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+    typename ViewType::value_type * const base_ptr = & v( 0, 0, 0, 0, 0, 0, 0, 0 );
+#else
+    typename ViewType::value_type * const base_ptr = & v.access( 0, 0, 0, 0, 0, 0, 0, 0 );
+#endif
+    const size_t n1 = v.extent(1);
+    const size_t n2 = v.extent(2);
+    const size_t n3 = v.extent(3);
+    const size_t n4 = v.extent(4);
+    const size_t n5 = v.extent(5);
+    const size_t n6 = v.extent(6);
+    const size_t n7 = v.extent(7);
+
+    long offset = 0;
+
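+    // Walking the remaining indices with i1 innermost visits elements in
+    // LayoutLeft order, so the offset from the base pointer must never decrease.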
+    for ( size_t i7 = 0; i7 < n7; ++i7 )
+    for ( size_t i6 = 0; i6 < n6; ++i6 )
+    for ( size_t i5 = 0; i5 < n5; ++i5 )
+    for ( size_t i4 = 0; i4 < n4; ++i4 )
+    for ( size_t i3 = 0; i3 < n3; ++i3 )
+    for ( size_t i2 = 0; i2 < n2; ++i2 )
+    for ( size_t i1 = 0; i1 < n1; ++i1 )
+    {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+      const long d = & v( i0, i1, i2, i3, i4, i5, i6, i7 ) - base_ptr;
+#else
+      const long d = & v.access( i0, i1, i2, i3, i4, i5, i6, i7 ) - base_ptr;
+#endif
+      if ( d < offset ) ++error_count;
+      offset = d;
+    }
+
+    if ( v.span() <= size_t( offset ) ) ++error_count;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void test_right( size_t i0, long & error_count ) const
+  {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+    typename ViewType::value_type * const base_ptr = & v( 0, 0, 0, 0, 0, 0, 0, 0 );
+#else
+    typename ViewType::value_type * const base_ptr = & v.access( 0, 0, 0, 0, 0, 0, 0, 0 );
+#endif
+    const size_t n1 = v.extent(1);
+    const size_t n2 = v.extent(2);
+    const size_t n3 = v.extent(3);
+    const size_t n4 = v.extent(4);
+    const size_t n5 = v.extent(5);
+    const size_t n6 = v.extent(6);
+    const size_t n7 = v.extent(7);
+
+    long offset = 0;
+
+    for ( size_t i1 = 0; i1 < n1; ++i1 )
+    for ( size_t i2 = 0; i2 < n2; ++i2 )
+    for ( size_t i3 = 0; i3 < n3; ++i3 )
+    for ( size_t i4 = 0; i4 < n4; ++i4 )
+    for ( size_t i5 = 0; i5 < n5; ++i5 )
+    for ( size_t i6 = 0; i6 < n6; ++i6 )
+    for ( size_t i7 = 0; i7 < n7; ++i7 )
+    {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+      const long d = & v( i0, i1, i2, i3, i4, i5, i6, i7 ) - base_ptr;
+#else
+      const long d = & v.access( i0, i1, i2, i3, i4, i5, i6, i7 ) - base_ptr;
+#endif
+      if ( d < offset ) ++error_count;
+      offset = d;
+    }
+
+    if ( v.span() <= size_t( offset ) ) ++error_count;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_t i, long & error_count ) const
+  {
+    if ( std::is_same< typename ViewType::array_layout, Kokkos::LayoutLeft >::value ) {
+      test_left( i, error_count );
+    }
+    else if ( std::is_same< typename ViewType::array_layout, Kokkos::LayoutRight >::value ) {
+      test_right( i, error_count );
+    }
+  }
+
+  enum { N0 = 10 };
+  enum { N1 = 9 };
+  enum { N2 = 8 };
+  enum { N3 = 7 };
+  enum { N4 = 6 };
+  enum { N5 = 5 };
+  enum { N6 = 4 };
+  enum { N7 = 3 };
+
+  TestViewMapOperator() : v( "Test", N0, N1, N2, N3, N4, N5, N6, N7 ) {}
+
+  void run()
+  {
+    ASSERT_EQ( v.extent(0), ( 0 < ViewType::rank ? TestViewMapOperator<ViewType>::N0 : 1 ) );
+    ASSERT_EQ( v.extent(1), ( 1 < ViewType::rank ? TestViewMapOperator<ViewType>::N1 : 1 ) );
+    ASSERT_EQ( v.extent(2), ( 2 < ViewType::rank ? TestViewMapOperator<ViewType>::N2 : 1 ) );
+    ASSERT_EQ( v.extent(3), ( 3 < ViewType::rank ? TestViewMapOperator<ViewType>::N3 : 1 ) );
+    ASSERT_EQ( v.extent(4), ( 4 < ViewType::rank ? TestViewMapOperator<ViewType>::N4 : 1 ) );
+    ASSERT_EQ( v.extent(5), ( 5 < ViewType::rank ? TestViewMapOperator<ViewType>::N5 : 1 ) );
+    ASSERT_EQ( v.extent(6), ( 6 < ViewType::rank ? TestViewMapOperator<ViewType>::N6 : 1 ) );
+    ASSERT_EQ( v.extent(7), ( 7 < ViewType::rank ? TestViewMapOperator<ViewType>::N7 : 1 ) );
+
+    ASSERT_LE( v.extent(0) *
+               v.extent(1) *
+               v.extent(2) *
+               v.extent(3) *
+               v.extent(4) *
+               v.extent(5) *
+               v.extent(6) *
+               v.extent(7)
+             , v.span() );
+
+    long error_count;
+    Kokkos::RangePolicy< typename ViewType::execution_space > range( 0, v.extent(0) );
+    Kokkos::parallel_reduce( range, *this, error_count );
+    ASSERT_EQ( 0, error_count );
+  }
+};
+
+template< class Space >
+void test_view_mapping_operator()
+{
+  typedef typename Space::execution_space ExecSpace;
+
+  { TestViewMapOperator< Kokkos::View<int, Kokkos::LayoutLeft, ExecSpace> > f; f.run(); }
+  { TestViewMapOperator< Kokkos::View<int*, Kokkos::LayoutLeft, ExecSpace> > f; f.run(); }
+  { TestViewMapOperator< Kokkos::View<int**, Kokkos::LayoutLeft, ExecSpace> > f; f.run(); }
+  { TestViewMapOperator< Kokkos::View<int***, Kokkos::LayoutLeft, ExecSpace> > f; f.run(); }
+  { TestViewMapOperator< Kokkos::View<int****, Kokkos::LayoutLeft, ExecSpace> > f; f.run(); }
+  { TestViewMapOperator< Kokkos::View<int*****, Kokkos::LayoutLeft, ExecSpace> > f; f.run(); }
+  { TestViewMapOperator< Kokkos::View<int******, Kokkos::LayoutLeft, ExecSpace> > f; f.run(); }
+  { TestViewMapOperator< Kokkos::View<int*******, Kokkos::LayoutLeft, ExecSpace> > f; f.run(); }
+
+  { TestViewMapOperator< Kokkos::View<int, Kokkos::LayoutRight, ExecSpace> > f; f.run(); }
+  { TestViewMapOperator< Kokkos::View<int*, Kokkos::LayoutRight, ExecSpace> > f; f.run(); }
+  { TestViewMapOperator< Kokkos::View<int**, Kokkos::LayoutRight, ExecSpace> > f; f.run(); }
+  { TestViewMapOperator< Kokkos::View<int***, Kokkos::LayoutRight, ExecSpace> > f; f.run(); }
+  { TestViewMapOperator< Kokkos::View<int****, Kokkos::LayoutRight, ExecSpace> > f; f.run(); }
+  { TestViewMapOperator< Kokkos::View<int*****, Kokkos::LayoutRight, ExecSpace> > f; f.run(); }
+  { TestViewMapOperator< Kokkos::View<int******, Kokkos::LayoutRight, ExecSpace> > f; f.run(); }
+  { TestViewMapOperator< Kokkos::View<int*******, Kokkos::LayoutRight, ExecSpace> > f; f.run(); }
+}
+
+TEST_F( TEST_CATEGORY , view_mapping_operator )
+{
+  test_view_mapping_operator< TEST_EXECSPACE >();
+}
+
+}
+
diff --git a/packages/kokkos/core/unit_test/TestViewMapping_b.hpp b/packages/kokkos/core/unit_test/TestViewMapping_b.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7c7807f60d5dccf30b0eb90e79015a2a42eccaab
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestViewMapping_b.hpp
@@ -0,0 +1,260 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+/*--------------------------------------------------------------------------*/
+
+template< class Space >
+struct TestViewMappingAtomic {
+  typedef typename Space::execution_space ExecSpace;
+  typedef typename Space::memory_space    MemSpace;
+
+  typedef Kokkos::MemoryTraits< Kokkos::Atomic >  mem_trait;
+
+  typedef Kokkos::View< int *, ExecSpace > T;
+  typedef Kokkos::View< int *, ExecSpace, mem_trait >  T_atom;
+
+  T      x;
+  T_atom x_atom;
+
+  enum { N = 100000 };
+
+  struct TagInit {};
+  struct TagUpdate {};
+  struct TagVerify {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagInit &, const int i ) const
+  { x( i ) = i; }
+
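+  // Every iteration adds to element 0 or 1 through the Atomic memory trait,
+  // so the concurrent updates are safe and each of the two elements receives
+  // N / 2 increments, which TagVerify checks below.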
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagUpdate &, const int i ) const
+  { x_atom( i % 2 ) += 1; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagVerify &, const int i, long & error_count ) const
+  {
+     if ( i < 2 ) { if ( x( i ) != int( i + N / 2 ) ) ++error_count; }
+     else         { if ( x( i ) != int( i ) ) ++error_count; }
+  }
+
+  TestViewMappingAtomic()
+    : x( "x", N )
+    , x_atom( x )
+    {}
+
+  void run() {
+
+    ASSERT_TRUE( T::reference_type_is_lvalue_reference );
+    ASSERT_FALSE( T_atom::reference_type_is_lvalue_reference );
+
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, TagInit >  ( 0, N ), *this );
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, TagUpdate >( 0, N ), *this );
+
+    long error_count = -1;
+
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, TagVerify >( 0, N ), *this, error_count );
+
+    ASSERT_EQ( 0, error_count );
+
+    typename T_atom::HostMirror x_host = Kokkos::create_mirror_view( x );
+    Kokkos::deep_copy( x_host, x );
+
+    error_count = -1;
+
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::DefaultHostExecutionSpace, TagVerify >( 0, N ), 
+      [=] ( const TagVerify &, const int i, long & tmp_error_count )
+    {
+      if ( i < 2 ) {
+        if ( x_host( i ) != int( i + N / 2 ) ) ++tmp_error_count ;
+      }
+      else {
+        if ( x_host( i ) != int( i ) ) ++tmp_error_count ;
+      }
+    }, error_count);
+
+    ASSERT_EQ( 0 , error_count );
+    Kokkos::deep_copy( x, x_host );
+  }
+};
+
+TEST_F( TEST_CATEGORY , view_mapping_atomic )
+{
+  TestViewMappingAtomic< TEST_EXECSPACE > f;
+  f.run();
+}
+
+}
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+struct MappingClassValueType {
+    KOKKOS_INLINE_FUNCTION
+    MappingClassValueType() 
+    {
+#if 0
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
+      printf( "TestViewMappingClassValue construct on Cuda\n" );
+#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      printf( "TestViewMappingClassValue construct on Host\n" );
+#else
+      printf( "TestViewMappingClassValue construct unknown\n" );
+#endif
+#endif
+    }
+    KOKKOS_INLINE_FUNCTION
+    ~MappingClassValueType()
+    {
+#if 0
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
+      printf( "TestViewMappingClassValue destruct on Cuda\n" );
+#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      printf( "TestViewMappingClassValue destruct on Host\n" );
+#else
+      printf( "TestViewMappingClassValue destruct unknown\n" );
+#endif
+#endif
+    }
+  };
+
+template< class Space >
+void test_view_mapping_class_value()
+{
+  typedef typename Space::execution_space ExecSpace;
+
+  ExecSpace::fence();
+  {
+    Kokkos::View< MappingClassValueType, ExecSpace > a( "a" );
+    ExecSpace::fence();
+  }
+  ExecSpace::fence();
+}
+
+TEST_F( TEST_CATEGORY , view_mapping_class_value )
+{
+  test_view_mapping_class_value< TEST_EXECSPACE >();
+}
+
+}
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY , view_mapping_assignable )
+{
+  typedef TEST_EXECSPACE exec_space ;
+
+  { // Assignment of rank-0 Left = Right
+    typedef Kokkos::ViewTraits<int,Kokkos::LayoutLeft, exec_space> dst_traits ;
+    typedef Kokkos::ViewTraits<int,Kokkos::LayoutRight,exec_space> src_traits ;
+    typedef Kokkos::Impl::ViewMapping<dst_traits,src_traits,void> mapping ;
+    static_assert( mapping::is_assignable , "" );
+
+    Kokkos::View<int,Kokkos::LayoutRight,exec_space> src ;
+    Kokkos::View<int,Kokkos::LayoutLeft,exec_space> dst( src );
+    dst = src ;
+  }
+
+  { // Assignment of rank-0 Right = Left
+    typedef Kokkos::ViewTraits<int,Kokkos::LayoutRight,exec_space> dst_traits ;
+    typedef Kokkos::ViewTraits<int,Kokkos::LayoutLeft, exec_space> src_traits ;
+    typedef Kokkos::Impl::ViewMapping<dst_traits,src_traits,void> mapping ;
+    static_assert( mapping::is_assignable , "" );
+
+    Kokkos::View<int,Kokkos::LayoutLeft,exec_space> src ;
+    Kokkos::View<int,Kokkos::LayoutRight,exec_space> dst( src );
+    dst = src ;
+  }
+
+  { // Assignment of rank-1 Left = Right
+    typedef Kokkos::ViewTraits<int*,Kokkos::LayoutLeft, exec_space> dst_traits ;
+    typedef Kokkos::ViewTraits<int*,Kokkos::LayoutRight,exec_space> src_traits ;
+    typedef Kokkos::Impl::ViewMapping<dst_traits,src_traits,void> mapping ;
+    static_assert( mapping::is_assignable , "" );
+
+    Kokkos::View<int*,Kokkos::LayoutRight,exec_space> src ;
+    Kokkos::View<int*,Kokkos::LayoutLeft,exec_space> dst( src );
+    dst = src ;
+  }
+
+  { // Assignment of rank-1 Right = Left
+    typedef Kokkos::ViewTraits<int*,Kokkos::LayoutRight,exec_space> dst_traits ;
+    typedef Kokkos::ViewTraits<int*,Kokkos::LayoutLeft, exec_space> src_traits ;
+    typedef Kokkos::Impl::ViewMapping<dst_traits,src_traits,void> mapping ;
+    static_assert( mapping::is_assignable , "" );
+
+    Kokkos::View<int*,Kokkos::LayoutLeft,exec_space> src ;
+    Kokkos::View<int*,Kokkos::LayoutRight,exec_space> dst( src );
+    dst = src ;
+  }
+
+  { // Assignment of rank-2 Left = Right
+    typedef Kokkos::ViewTraits<int**,Kokkos::LayoutLeft, exec_space> dst_traits ;
+    typedef Kokkos::ViewTraits<int**,Kokkos::LayoutRight,exec_space> src_traits ;
+    typedef Kokkos::Impl::ViewMapping<dst_traits,src_traits,void> mapping ;
+    static_assert( ! mapping::is_assignable , "" );
+  }
+
+  { // Assignment of rank-2 Right = Left
+    typedef Kokkos::ViewTraits<int**,Kokkos::LayoutRight,exec_space> dst_traits ;
+    typedef Kokkos::ViewTraits<int**,Kokkos::LayoutLeft, exec_space> src_traits ;
+    typedef Kokkos::Impl::ViewMapping<dst_traits,src_traits,void> mapping ;
+    static_assert( ! mapping::is_assignable , "" );
+  }
+
+}
+
+}
+
diff --git a/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp b/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2414276161cdb789210fd01947b6cae42f1e2aa3
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp
@@ -0,0 +1,215 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+template< class Space >
+struct TestViewMappingSubview
+{
+  typedef typename Space::execution_space ExecSpace;
+  typedef typename Space::memory_space    MemSpace;
+
+  typedef Kokkos::pair< int, int > range;
+
+  enum { AN = 10 };
+  typedef Kokkos::View< int*, ExecSpace >  AT;
+  typedef Kokkos::View< const int*, ExecSpace >  ACT;
+  typedef Kokkos::Subview< AT, range >  AS;
+
+  enum { BN0 = 10, BN1 = 11, BN2 = 12 };
+  typedef Kokkos::View< int***, ExecSpace >  BT;
+  typedef Kokkos::Subview< BT, range, range, range >  BS;
+
+  enum { CN0 = 10, CN1 = 11, CN2 = 12 };
+  typedef Kokkos::View< int***[13][14], ExecSpace >  CT;
+  typedef Kokkos::Subview< CT, range, range, range, int, int >  CS;
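+  // The two trailing integer arguments fix those indices and drop the
+  // corresponding dimensions, so CS is a rank-3 subview of the rank-5 view CT.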
+
+  enum { DN0 = 10, DN1 = 11, DN2 = 12, DN3 = 13, DN4 = 14 };
+  typedef Kokkos::View< int***[DN3][DN4], ExecSpace >  DT;
+  typedef Kokkos::Subview< DT, int, range, range, range, int >  DS;
+
+  typedef Kokkos::View< int***[13][14], Kokkos::LayoutLeft, ExecSpace >  DLT;
+  typedef Kokkos::Subview< DLT, range, int, int, int, int >  DLS1;
+
+  #if !defined(KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND)
+  static_assert( DLS1::rank == 1 && std::is_same< typename DLS1::array_layout, Kokkos::LayoutLeft >::value
+               , "Subview layout error for rank 1 subview of left-most range of LayoutLeft" );
+  #endif
+
+  typedef Kokkos::View< int***[13][14], Kokkos::LayoutRight, ExecSpace >  DRT;
+  typedef Kokkos::Subview< DRT, int, int, int, int, range >  DRS1;
+
+  #if !defined(KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND)
+  static_assert( DRS1::rank == 1 && std::is_same< typename DRS1::array_layout, Kokkos::LayoutRight >::value
+               , "Subview layout error for rank 1 subview of right-most range of LayoutRight" );
+  #endif
+
+  AT Aa;
+  AS Ab;
+  ACT Ac;
+  BT Ba;
+  BS Bb;
+  CT Ca;
+  CS Cb;
+  DT Da;
+  DS Db;
+
+  TestViewMappingSubview()
+    : Aa( "Aa", AN )
+    , Ab( Kokkos::subview( Aa, std::pair< int, int >( 1, AN - 1 ) ) )
+    , Ac( Aa, std::pair< int, int >( 1, AN - 1 ) )
+    , Ba( "Ba", BN0, BN1, BN2 )
+    , Bb( Kokkos::subview( Ba
+                                        , std::pair< int, int >( 1, BN0 - 1 )
+                                        , std::pair< int, int >( 1, BN1 - 1 )
+                                        , std::pair< int, int >( 1, BN2 - 1 )
+                                        ) )
+    , Ca( "Ca", CN0, CN1, CN2 )
+    , Cb( Kokkos::subview( Ca
+                                        , std::pair< int, int >( 1, CN0 - 1 )
+                                        , std::pair< int, int >( 1, CN1 - 1 )
+                                        , std::pair< int, int >( 1, CN2 - 1 )
+                                        , 1
+                                        , 2
+                                        ) )
+    , Da( "Da", DN0, DN1, DN2 )
+    , Db( Kokkos::subview( Da
+                                        , 1
+                                        , std::pair< int, int >( 1, DN1 - 1 )
+                                        , std::pair< int, int >( 1, DN2 - 1 )
+                                        , std::pair< int, int >( 1, DN3 - 1 )
+                                        , 2
+                                        ) )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int, long & error_count ) const
+  {
+    auto Ad = Kokkos::subview< Kokkos::MemoryUnmanaged >( Aa, Kokkos::pair< int, int >( 1, AN - 1 ) );
+
+    for ( int i = 1; i < AN - 1; ++i ) if( & Aa[i] != & Ab[i - 1] ) ++error_count;
+    for ( int i = 1; i < AN - 1; ++i ) if( & Aa[i] != & Ac[i - 1] ) ++error_count;
+    for ( int i = 1; i < AN - 1; ++i ) if( & Aa[i] != & Ad[i - 1] ) ++error_count;
+
+    for ( int i2 = 1; i2 < BN2 - 1; ++i2 )
+    for ( int i1 = 1; i1 < BN1 - 1; ++i1 )
+    for ( int i0 = 1; i0 < BN0 - 1; ++i0 )
+    {
+      if ( & Ba( i0, i1, i2 ) != & Bb( i0 - 1, i1 - 1, i2 - 1 ) ) ++error_count;
+    }
+
+    for ( int i2 = 1; i2 < CN2 - 1; ++i2 )
+    for ( int i1 = 1; i1 < CN1 - 1; ++i1 )
+    for ( int i0 = 1; i0 < CN0 - 1; ++i0 )
+    {
+      if ( & Ca( i0, i1, i2, 1, 2 ) != & Cb( i0 - 1, i1 - 1, i2 - 1 ) ) ++error_count;
+    }
+
+    for ( int i2 = 1; i2 < DN3 - 1; ++i2 )
+    for ( int i1 = 1; i1 < DN2 - 1; ++i1 )
+    for ( int i0 = 1; i0 < DN1 - 1; ++i0 )
+    {
+      if ( & Da( 1, i0, i1, i2, 2 ) != & Db( i0 - 1, i1 - 1, i2 - 1 ) ) ++error_count;
+    }
+  }
+
+  void run()
+  {
+    typedef typename Space::execution_space ExecSpace;
+
+    TestViewMappingSubview< ExecSpace > self;
+
+    ASSERT_EQ( Aa.extent(0), AN );
+    ASSERT_EQ( Ab.extent(0), AN - 2 );
+    ASSERT_EQ( Ac.extent(0), AN - 2 );
+    ASSERT_EQ( Ba.extent(0), BN0 );
+    ASSERT_EQ( Ba.extent(1), BN1 );
+    ASSERT_EQ( Ba.extent(2), BN2 );
+    ASSERT_EQ( Bb.extent(0), BN0 - 2 );
+    ASSERT_EQ( Bb.extent(1), BN1 - 2 );
+    ASSERT_EQ( Bb.extent(2), BN2 - 2 );
+
+    ASSERT_EQ( Ca.extent(0), CN0 );
+    ASSERT_EQ( Ca.extent(1), CN1 );
+    ASSERT_EQ( Ca.extent(2), CN2 );
+    ASSERT_EQ( Ca.extent(3), 13 ); 
+    ASSERT_EQ( Ca.extent(4), 14 );
+    ASSERT_EQ( Cb.extent(0), CN0 - 2 );
+    ASSERT_EQ( Cb.extent(1), CN1 - 2 );
+    ASSERT_EQ( Cb.extent(2), CN2 - 2 );
+
+    ASSERT_EQ( Da.extent(0), DN0 );
+    ASSERT_EQ( Da.extent(1), DN1 );
+    ASSERT_EQ( Da.extent(2), DN2 );
+    ASSERT_EQ( Da.extent(3), DN3 );
+    ASSERT_EQ( Da.extent(4), DN4 );
+
+    ASSERT_EQ( Db.extent(0), DN1 - 2 );
+    ASSERT_EQ( Db.extent(1), DN2 - 2 );
+    ASSERT_EQ( Db.extent(2), DN3 - 2 );
+
+    ASSERT_EQ( Da.stride_1(), Db.stride_0() );
+    ASSERT_EQ( Da.stride_2(), Db.stride_1() );
+    ASSERT_EQ( Da.stride_3(), Db.stride_2() );
+
+    long error_count = -1;
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, 1 ), *this, error_count );
+    ASSERT_EQ( error_count, 0 );
+  }
+};
+
+TEST_F( TEST_CATEGORY , view_mapping_subview )
+{
+  TestViewMappingSubview< TEST_EXECSPACE > f;
+  f.run();
+}
+
+}
diff --git a/packages/kokkos/core/unit_test/TestViewOfClass.hpp b/packages/kokkos/core/unit_test/TestViewOfClass.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7198b4c1400a0a9792192ea6b0642d89b456ee17
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestViewOfClass.hpp
@@ -0,0 +1,126 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+namespace Test {
+
+template< class Space >
+struct NestedView {
+  Kokkos::View< int*, Space > member;
+
+public:
+  KOKKOS_INLINE_FUNCTION
+  NestedView() : member() {}
+
+  KOKKOS_INLINE_FUNCTION
+  NestedView & operator=( const Kokkos::View< int*, Space > & lhs )
+  {
+    member = lhs;
+    if ( member.extent(0) ) Kokkos::atomic_add( & member( 0 ), 1 );
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  ~NestedView()
+  {
+    if ( member.extent(0) ) {
+      Kokkos::atomic_add( & member( 0 ), -1 );
+    }
+  }
+};
+
+template< class Space >
+struct NestedViewFunctor {
+
+  Kokkos::View< NestedView<Space> *, Space > nested;
+  Kokkos::View< int*, Space >                array;
+
+  NestedViewFunctor(
+    const Kokkos::View< NestedView<Space> *, Space > & arg_nested,
+    const Kokkos::View< int*, Space >                & arg_array )
+  : nested( arg_nested )
+  , array(  arg_array )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i ) const { nested[i] = array; }
+};
+
+template< class Space >
+void view_nested_view()
+{
+  Kokkos::View< int*, Space > tracking( "tracking", 1 );
+
+  typename Kokkos::View< int*, Space >::HostMirror host_tracking = Kokkos::create_mirror( tracking );
+
+  {
+    Kokkos::View< NestedView<Space> *, Space > a( "a_nested_view", 2 );
+
+    Kokkos::parallel_for( Kokkos::RangePolicy< Space >( 0, 2 ), NestedViewFunctor< Space >( a, tracking ) );
+    Kokkos::deep_copy( host_tracking, tracking );
+    ASSERT_EQ( 2, host_tracking( 0 ) );
+
+    Kokkos::View< NestedView<Space> *, Space > b( "b_nested_view", 2 );
+    Kokkos::parallel_for( Kokkos::RangePolicy< Space >( 0, 2 ), NestedViewFunctor< Space >( b, tracking ) );
+    Kokkos::deep_copy( host_tracking, tracking );
+    ASSERT_EQ( 4, host_tracking( 0 ) );
+
+  }
+
+  Kokkos::deep_copy( host_tracking, tracking );
+
+  ASSERT_EQ( 0, host_tracking( 0 ) );
+}
+
+TEST_F( TEST_CATEGORY, view_nested_view )
+{
+  view_nested_view< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestViewSpaceAssign.hpp b/packages/kokkos/core/unit_test/TestViewSpaceAssign.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ecec47d7ba179d5434aaa8dd4b2a9512de3c480e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestViewSpaceAssign.hpp
@@ -0,0 +1,76 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+namespace Test {
+
+template< typename SpaceDst, typename SpaceSrc >
+void view_space_assign()
+{
+  Kokkos::View< double*, SpaceDst > a =
+    Kokkos::View< double*, SpaceSrc >( "a", 1 );
+
+  Kokkos::View< double*, Kokkos::LayoutLeft, SpaceDst > b =
+    Kokkos::View< double*, Kokkos::LayoutLeft, SpaceSrc >( "b", 1 );
+
+  Kokkos::View< double*, Kokkos::LayoutRight, SpaceDst > c =
+    Kokkos::View< double*, Kokkos::LayoutRight, SpaceSrc >( "c", 1 );
+
+  Kokkos::View< double*, SpaceDst, Kokkos::MemoryRandomAccess > d =
+    Kokkos::View< double*, SpaceSrc >( "d", 1 );
+
+  Kokkos::View< double*, Kokkos::LayoutLeft, SpaceDst, Kokkos::MemoryRandomAccess > e =
+    Kokkos::View< double*, Kokkos::LayoutLeft, SpaceSrc >( "e", 1 );
+
+  // Rank-one views can be assigned across layouts:
+  Kokkos::View< double*, Kokkos::LayoutRight, SpaceDst > f =
+  Kokkos::View< double*, Kokkos::LayoutLeft, SpaceSrc >( "f", 1 );
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestViewSubview.hpp b/packages/kokkos/core/unit_test/TestViewSubview.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..207fbb148d14ba720d3f77d4d9d6718b424a355b
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestViewSubview.hpp
@@ -0,0 +1,1334 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef TESTVIEWSUBVIEW_HPP_
+#define TESTVIEWSUBVIEW_HPP_
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+namespace TestViewSubview {
+
+template< class Layout, class Space >
+struct getView {
+  static
+    Kokkos::View< double**, Layout, Space > get( int n, int m ) {
+      return Kokkos::View< double**, Layout, Space >( "G", n, m );
+  }
+};
+
+template< class Space >
+struct getView< Kokkos::LayoutStride, Space > {
+  static
+    Kokkos::View< double**, Kokkos::LayoutStride, Space > get( int n, int m ) {
+      const int rank = 2;
+      const int order[] = { 0, 1 };
+      const unsigned dim[] = { unsigned( n ), unsigned( m ) };
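+      // order_dimensions() assigns the unit stride to dimension order[0];
+      // with order { 0, 1 } this builds a LayoutLeft-like LayoutStride.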
+      Kokkos::LayoutStride stride = Kokkos::LayoutStride::order_dimensions( rank, order, dim );
+
+      return Kokkos::View< double**, Kokkos::LayoutStride, Space >( "G", stride );
+  }
+};
+
+template< class ViewType, class Space >
+struct fill_1D {
+  typedef typename Space::execution_space execution_space;
+  typedef typename ViewType::size_type size_type;
+
+  ViewType a;
+  double val;
+
+  fill_1D( ViewType a_, double val_ ) : a( a_ ), val( val_ ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i ) const { a( i ) = val; }
+};
+
+template< class ViewType, class Space >
+struct fill_2D {
+  typedef typename Space::execution_space execution_space;
+  typedef typename ViewType::size_type size_type;
+
+  ViewType a;
+  double val;
+
+  fill_2D( ViewType a_, double val_ ) : a( a_ ), val( val_ ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i ) const
+  {
+    for ( int j = 0; j < static_cast< int >( a.extent(1) ); j++ ) {
+      a( i, j ) = val;
+    }
+  }
+};
+
+template< class Layout, class Space >
+void test_auto_1d ()
+{
+  typedef Kokkos::View< double**, Layout, Space > mv_type;
+  typedef typename mv_type::size_type size_type;
+
+  const double ZERO = 0.0;
+  const double ONE = 1.0;
+  const double TWO = 2.0;
+
+  const size_type numRows = 10;
+  const size_type numCols = 3;
+
+  mv_type X = getView< Layout, Space >::get( numRows, numCols );
+  typename mv_type::HostMirror X_h = Kokkos::create_mirror_view( X );
+
+  fill_2D< mv_type, Space > f1( X, ONE );
+  Kokkos::parallel_for( X.extent(0), f1 );
+  Kokkos::fence();
+  Kokkos::deep_copy( X_h, X );
+  for ( size_type j = 0; j < numCols; ++j ) {
+    for ( size_type i = 0; i < numRows; ++i ) {
+      ASSERT_TRUE( X_h( i, j ) == ONE );
+    }
+  }
+
+  fill_2D< mv_type, Space > f2( X, 0.0 );
+  Kokkos::parallel_for( X.extent(0), f2 );
+  Kokkos::fence();
+  Kokkos::deep_copy( X_h, X );
+  for ( size_type j = 0; j < numCols; ++j ) {
+    for ( size_type i = 0; i < numRows; ++i ) {
+      ASSERT_TRUE( X_h( i, j ) == ZERO );
+    }
+  }
+
+  fill_2D< mv_type, Space > f3( X, TWO );
+  Kokkos::parallel_for( X.extent(0), f3 );
+  Kokkos::fence();
+  Kokkos::deep_copy( X_h, X );
+  for ( size_type j = 0; j < numCols; ++j ) {
+    for ( size_type i = 0; i < numRows; ++i ) {
+      ASSERT_TRUE( X_h( i, j ) == TWO );
+    }
+  }
+
+  for ( size_type j = 0; j < numCols; ++j ) {
+    auto X_j = Kokkos::subview( X, Kokkos::ALL, j );
+
+    fill_1D< decltype( X_j ), Space > f4( X_j, ZERO );
+    Kokkos::parallel_for( X_j.extent(0), f4 );
+    Kokkos::fence();
+    Kokkos::deep_copy( X_h, X );
+    for ( size_type i = 0; i < numRows; ++i ) {
+      ASSERT_TRUE( X_h( i, j ) == ZERO );
+    }
+
+    for ( size_type jj = 0; jj < numCols; ++jj ) {
+      auto X_jj = Kokkos::subview ( X, Kokkos::ALL, jj );
+      fill_1D< decltype( X_jj ), Space > f5( X_jj, ONE );
+      Kokkos::parallel_for( X_jj.extent(0), f5 );
+      Kokkos::fence();
+      Kokkos::deep_copy( X_h, X );
+      for ( size_type i = 0; i < numRows; ++i ) {
+        ASSERT_TRUE( X_h( i, jj ) == ONE );
+      }
+    }
+  }
+}
+
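+// Check that 1D subviews of an n x m rank-2 view (source layout LS, destination
+// layout LD) alias the expected elements.  The boolean flags a-d select which
+// full-column, ranged-column, full-row, and ranged-row cases are exercised.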
+template< class LD, class LS, class Space >
+void test_1d_strided_assignment_impl( bool a, bool b, bool c, bool d, int n, int m ) {
+  Kokkos::View< double**, LS, Space > l2d( "l2d", n, m );
+
+  int col = n > 2 ? 2 : 0;
+  int row = m > 2 ? 2 : 0;
+
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    if ( a ) {
+      Kokkos::View< double*, LD, Space > l1da = Kokkos::subview( l2d, Kokkos::ALL, row );
+      ASSERT_TRUE( & l1da( 0 ) == & l2d( 0, row ) );
+      if ( n > 1 ) {
+        ASSERT_TRUE( & l1da( 1 ) == & l2d( 1, row ) );
+      }
+    }
+
+    if ( b && n > 13 ) {
+      Kokkos::View< double*, LD, Space > l1db = Kokkos::subview( l2d, std::pair< unsigned, unsigned >( 2, 13 ), row );
+      ASSERT_TRUE( & l1db( 0 ) == & l2d( 2, row ) );
+      ASSERT_TRUE( & l1db( 1 ) == & l2d( 3, row ) );
+    }
+
+    if ( c ) {
+      Kokkos::View< double*, LD, Space > l1dc = Kokkos::subview( l2d, col, Kokkos::ALL );
+      ASSERT_TRUE( & l1dc( 0 ) == & l2d( col, 0 ) );
+      if( m > 1 ) {
+        ASSERT_TRUE( & l1dc( 1 ) == & l2d( col, 1 ) );
+      }
+    }
+
+    if ( d && m > 13 ) {
+      Kokkos::View< double*, LD, Space > l1dd = Kokkos::subview( l2d, col, std::pair< unsigned, unsigned >( 2, 13 ) );
+      ASSERT_TRUE( & l1dd( 0 ) == & l2d( col, 2 ) );
+      ASSERT_TRUE( & l1dd( 1 ) == & l2d( col, 3 ) );
+    }
+  }
+}

+
+template< class Space >
+void test_1d_strided_assignment() {
+  test_1d_strided_assignment_impl< Kokkos::LayoutStride, Kokkos::LayoutLeft, Space >( true, true, true, true, 17, 3 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutStride, Kokkos::LayoutRight, Space >( true, true, true, true, 17, 3 );
+
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutLeft, Space >( true, true, false, false, 17, 3 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutLeft, Space >( true, true, false, false, 17, 3 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutRight, Space >( false, false, true, true, 17, 3 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutRight, Space >( false, false, true, true, 17, 3 );
+
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutLeft, Space >( true, true, false, false, 17, 1 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutLeft, Space >( true, true, true, true, 1, 17 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutLeft, Space >( true, true, true, true, 1, 17 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutLeft, Space >( true, true, false, false, 17, 1 );
+
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutRight, Space >( true, true, true, true, 17, 1 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutRight, Space >( false, false, true, true, 1, 17 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutRight, Space >( false, false, true, true, 1, 17 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutRight, Space >( true, true, true, true, 17, 1 );
+}
+
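+// test_left_0 .. test_left_3 exercise subviews of LayoutLeft views of
+// decreasing rank (a static rank-8 view down to a dynamic rank-2 view),
+// taking scalar, rank-1, rank-2, and strided slices and checking both span
+// contiguity and element addresses against the parent view.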
+template< class Space >
+void test_left_0()
+{
+  typedef Kokkos::View< int [2][3][4][5][2][3][4][5], Kokkos::LayoutLeft, Space > view_static_8_type;
+
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    view_static_8_type x_static_8( "x_static_left_8" );
+
+    ASSERT_TRUE( x_static_8.span_is_contiguous() );
+
+    Kokkos::View< int, Kokkos::LayoutLeft, Space > x0 = Kokkos::subview( x_static_8, 0, 0, 0, 0, 0, 0, 0, 0 );
+
+    ASSERT_TRUE( x0.span_is_contiguous() );
+    ASSERT_TRUE( & x0() == & x_static_8( 0, 0, 0, 0, 0, 0, 0, 0 ) );
+
+    Kokkos::View< int*, Kokkos::LayoutLeft, Space > x1 =
+      Kokkos::subview( x_static_8, Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3, 0, 1, 2, 3 );
+
+    ASSERT_TRUE( x1.span_is_contiguous() );
+    ASSERT_TRUE( & x1( 0 ) == & x_static_8( 0, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x1( 1 ) == & x_static_8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2 =
+      Kokkos::subview( x_static_8, Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3
+                                 , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
+
+    ASSERT_TRUE( ! x2.span_is_contiguous() );
+    ASSERT_TRUE( & x2( 0, 0 ) == & x_static_8( 0, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 1, 0 ) == & x_static_8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 0, 1 ) == & x_static_8( 0, 1, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 1, 1 ) == & x_static_8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
+
+    // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 =
+    Kokkos::View< int**, Kokkos::LayoutStride, Space > sx2 =
+      Kokkos::subview( x_static_8, 1, Kokkos::pair< int, int >( 0, 2 ), 2, 3
+                                    , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
+
+    ASSERT_TRUE( ! sx2.span_is_contiguous() );
+    ASSERT_TRUE( & sx2( 0, 0 ) == & x_static_8( 1, 0, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 0 ) == & x_static_8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 0, 1 ) == & x_static_8( 1, 0, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 1 ) == & x_static_8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
+
+    Kokkos::View< int****, Kokkos::LayoutStride, Space > sx4 =
+      Kokkos::subview( x_static_8, 0, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                                 , 1, Kokkos::pair< int, int >( 1, 3 ) /* of [5] */
+                                 , 1, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                                 , 2, Kokkos::pair< int, int >( 2, 4 ) /* of [5] */
+                     );
+
+    ASSERT_TRUE( ! sx4.span_is_contiguous() );
+
+    for ( int i0 = 0; i0 < (int) sx4.extent(0); ++i0 )
+    for ( int i1 = 0; i1 < (int) sx4.extent(1); ++i1 )
+    for ( int i2 = 0; i2 < (int) sx4.extent(2); ++i2 )
+    for ( int i3 = 0; i3 < (int) sx4.extent(3); ++i3 )
+    {
+      ASSERT_TRUE( & sx4( i0, i1, i2, i3 ) == & x_static_8( 0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3 ) );
+    }
+  }
+}
+
+template< class Space >
+void test_left_1()
+{
+  typedef Kokkos::View< int ****[2][3][4][5], Kokkos::LayoutLeft, Space > view_type;
+
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    view_type x8( "x_left_8", 2, 3, 4, 5 );
+
+    ASSERT_TRUE( x8.span_is_contiguous() );
+
+    Kokkos::View< int, Kokkos::LayoutLeft, Space > x0 = Kokkos::subview( x8, 0, 0, 0, 0, 0, 0, 0, 0 );
+
+    ASSERT_TRUE( x0.span_is_contiguous() );
+    ASSERT_TRUE( & x0() == & x8( 0, 0, 0, 0, 0, 0, 0, 0 ) );
+
+    Kokkos::View< int*, Kokkos::LayoutLeft, Space > x1 =
+      Kokkos::subview( x8, Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3, 0, 1, 2, 3 );
+
+    ASSERT_TRUE( x1.span_is_contiguous() );
+    ASSERT_TRUE( & x1( 0 ) == & x8( 0, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x1( 1 ) == & x8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2 =
+      Kokkos::subview( x8, Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3
+                         , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
+
+    ASSERT_TRUE( ! x2.span_is_contiguous() );
+    ASSERT_TRUE( & x2( 0, 0 ) == & x8( 0, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 1, 0 ) == & x8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 0, 1 ) == & x8( 0, 1, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 1, 1 ) == & x8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
+
+    // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 =
+    Kokkos::View< int**, Kokkos::LayoutStride, Space > sx2 =
+      Kokkos::subview( x8, 1, Kokkos::pair< int, int >( 0, 2 ), 2, 3
+                            , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
+
+    ASSERT_TRUE( ! sx2.span_is_contiguous() );
+    ASSERT_TRUE( & sx2( 0, 0 ) == & x8( 1, 0, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 0 ) == & x8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 0, 1 ) == & x8( 1, 0, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 1 ) == & x8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
+
+    Kokkos::View< int****, Kokkos::LayoutStride, Space > sx4 =
+      Kokkos::subview( x8, 0, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                         , 1, Kokkos::pair< int, int >( 1, 3 ) /* of [5] */
+                         , 1, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                         , 2, Kokkos::pair< int, int >( 2, 4 ) /* of [5] */
+                     );
+
+    ASSERT_TRUE( ! sx4.span_is_contiguous() );
+
+    for ( int i0 = 0; i0 < (int) sx4.extent(0); ++i0 )
+    for ( int i1 = 0; i1 < (int) sx4.extent(1); ++i1 )
+    for ( int i2 = 0; i2 < (int) sx4.extent(2); ++i2 )
+    for ( int i3 = 0; i3 < (int) sx4.extent(3); ++i3 )
+    {
+      ASSERT_TRUE( & sx4( i0, i1, i2, i3 ) == & x8( 0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3 ) );
+    }
+  }
+}
+
+template< class Space >
+void test_left_2()
+{
+  typedef Kokkos::View< int ****, Kokkos::LayoutLeft, Space > view_type;
+
+  if ( Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace, typename Space::memory_space>::accessible ) {
+    view_type x4( "x4", 2, 3, 4, 5 );
+
+    ASSERT_TRUE( x4.span_is_contiguous() );
+
+    Kokkos::View< int, Kokkos::LayoutLeft, Space > x0 = Kokkos::subview( x4, 0, 0, 0, 0 );
+
+    ASSERT_TRUE( x0.span_is_contiguous() );
+    ASSERT_TRUE( & x0() == & x4( 0, 0, 0, 0 ) );
+
+    Kokkos::View< int*, Kokkos::LayoutLeft, Space > x1 =
+      Kokkos::subview( x4, Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
+
+    ASSERT_TRUE( x1.span_is_contiguous() );
+    ASSERT_TRUE( & x1( 0 ) == & x4( 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x1( 1 ) == & x4( 1, 1, 2, 3 ) );
+
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2 =
+      Kokkos::subview( x4, Kokkos::pair< int, int >( 0, 2 ), 1
+                         , Kokkos::pair< int, int >( 1, 3 ), 2 );
+
+    ASSERT_TRUE( ! x2.span_is_contiguous() );
+    ASSERT_TRUE( & x2( 0, 0 ) == & x4( 0, 1, 1, 2 ) );
+    ASSERT_TRUE( & x2( 1, 0 ) == & x4( 1, 1, 1, 2 ) );
+    ASSERT_TRUE( & x2( 0, 1 ) == & x4( 0, 1, 2, 2 ) );
+    ASSERT_TRUE( & x2( 1, 1 ) == & x4( 1, 1, 2, 2 ) );
+
+    // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 =
+    Kokkos::View< int**, Kokkos::LayoutStride, Space > sx2 =
+      Kokkos::subview( x4, 1, Kokkos::pair< int, int >( 0, 2 )
+                         , 2, Kokkos::pair< int, int >( 1, 4 ) );
+
+    ASSERT_TRUE( ! sx2.span_is_contiguous() );
+    ASSERT_TRUE( & sx2( 0, 0 ) == & x4( 1, 0, 2, 1 ) );
+    ASSERT_TRUE( & sx2( 1, 0 ) == & x4( 1, 1, 2, 1 ) );
+    ASSERT_TRUE( & sx2( 0, 1 ) == & x4( 1, 0, 2, 2 ) );
+    ASSERT_TRUE( & sx2( 1, 1 ) == & x4( 1, 1, 2, 2 ) );
+    ASSERT_TRUE( & sx2( 0, 2 ) == & x4( 1, 0, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 2 ) == & x4( 1, 1, 2, 3 ) );
+
+    Kokkos::View< int****, Kokkos::LayoutStride, Space > sx4 =
+      Kokkos::subview( x4, Kokkos::pair< int, int >( 1, 2 ) /* of [2] */
+                         , Kokkos::pair< int, int >( 1, 3 ) /* of [3] */
+                         , Kokkos::pair< int, int >( 0, 4 ) /* of [4] */
+                         , Kokkos::pair< int, int >( 2, 4 ) /* of [5] */
+                     );
+
+    ASSERT_TRUE( ! sx4.span_is_contiguous() );
+
+    for ( int i0 = 0; i0 < (int) sx4.extent(0); ++i0 )
+    for ( int i1 = 0; i1 < (int) sx4.extent(1); ++i1 )
+    for ( int i2 = 0; i2 < (int) sx4.extent(2); ++i2 )
+    for ( int i3 = 0; i3 < (int) sx4.extent(3); ++i3 )
+    {
+      ASSERT_TRUE( & sx4( i0, i1, i2, i3 ) == & x4( 1 + i0, 1 + i1, 0 + i2, 2 + i3 ) );
+    }
+  }
+}
+
+template< class Space >
+void test_left_3()
+{
+  typedef Kokkos::View< int **, Kokkos::LayoutLeft, Space > view_type;
+
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    view_type xm( "x4", 10, 5 );
+
+    ASSERT_TRUE( xm.span_is_contiguous() );
+
+    Kokkos::View< int, Kokkos::LayoutLeft, Space > x0 = Kokkos::subview( xm, 5, 3 );
+
+    ASSERT_TRUE( x0.span_is_contiguous() );
+    ASSERT_TRUE( & x0() == & xm( 5, 3 ) );
+
+    Kokkos::View< int*, Kokkos::LayoutLeft, Space > x1 = Kokkos::subview( xm, Kokkos::ALL, 3 );
+
+    ASSERT_TRUE( x1.span_is_contiguous() );
+    for ( int i = 0; i < int( xm.extent(0) ); ++i ) {
+      ASSERT_TRUE( & x1( i ) == & xm( i, 3 ) );
+    }
+
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2 =
+      Kokkos::subview( xm, Kokkos::pair< int, int >( 1, 9 ), Kokkos::ALL );
+
+    ASSERT_TRUE( ! x2.span_is_contiguous() );
+    for ( int j = 0; j < int( x2.extent(1) ); ++j )
+    for ( int i = 0; i < int( x2.extent(0) ); ++i )
+    {
+      ASSERT_TRUE( & x2( i, j ) == & xm( 1 + i, j ) );
+    }
+
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2c =
+      Kokkos::subview( xm, Kokkos::ALL, std::pair< int, int >( 2, 4 ) );
+
+    ASSERT_TRUE( x2c.span_is_contiguous() );
+    for ( int j = 0; j < int( x2c.extent(1) ); ++j )
+    for ( int i = 0; i < int( x2c.extent(0) ); ++i )
+    {
+      ASSERT_TRUE( & x2c( i, j ) == & xm( i, 2 + j ) );
+    }
+
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2_n1 =
+      Kokkos::subview( xm, std::pair< int, int >( 1, 1 ), Kokkos::ALL );
+
+    ASSERT_TRUE( x2_n1.extent(0) == 0 );
+    ASSERT_TRUE( x2_n1.extent(1) == xm.extent(1) );
+
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2_n2 =
+      Kokkos::subview( xm, Kokkos::ALL, std::pair< int, int >( 1, 1 ) );
+
+    ASSERT_TRUE( x2_n2.extent(0) == xm.extent(0) );
+    ASSERT_TRUE( x2_n2.extent(1) == 0 );
+  }
+}
+
+//----------------------------------------------------------------------------
+
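+// The test_right_* functions mirror the LayoutLeft cases above for LayoutRight
+// views; the contiguous slices are now taken from the trailing dimensions.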
+template< class Space >
+void test_right_0()
+{
+  typedef Kokkos::View< int [2][3][4][5][2][3][4][5], Kokkos::LayoutRight, Space > view_static_8_type;
+
+  if ( Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace, typename Space::memory_space>::accessible ) {
+    view_static_8_type x_static_8( "x_static_right_8" );
+
+    Kokkos::View< int, Kokkos::LayoutRight, Space > x0 = Kokkos::subview( x_static_8, 0, 0, 0, 0, 0, 0, 0, 0 );
+
+    ASSERT_TRUE( & x0() == & x_static_8( 0, 0, 0, 0, 0, 0, 0, 0 ) );
+
+    Kokkos::View< int*, Kokkos::LayoutRight, Space > x1 =
+      Kokkos::subview( x_static_8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair< int, int >( 1, 3 ) );
+
+    ASSERT_TRUE( x1.extent(0) == 2 );
+    ASSERT_TRUE( & x1( 0 ) == & x_static_8( 0, 1, 2, 3, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x1( 1 ) == & x_static_8( 0, 1, 2, 3, 0, 1, 2, 2 ) );
+
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2 =
+      Kokkos::subview( x_static_8, 0, 1, 2, Kokkos::pair< int, int >( 1, 3 )
+                                 , 0, 1, 2, Kokkos::pair< int, int >( 1, 3 ) );
+
+    ASSERT_TRUE( x2.extent(0) == 2 );
+    ASSERT_TRUE( x2.extent(1) == 2 );
+    ASSERT_TRUE( & x2( 0, 0 ) == & x_static_8( 0, 1, 2, 1, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x2( 1, 0 ) == & x_static_8( 0, 1, 2, 2, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x2( 0, 1 ) == & x_static_8( 0, 1, 2, 1, 0, 1, 2, 2 ) );
+    ASSERT_TRUE( & x2( 1, 1 ) == & x_static_8( 0, 1, 2, 2, 0, 1, 2, 2 ) );
+
+    // Kokkos::View< int**, Kokkos::LayoutRight, Space > error_2 =
+    Kokkos::View< int**, Kokkos::LayoutStride, Space > sx2 =
+      Kokkos::subview( x_static_8, 1, Kokkos::pair< int, int >( 0, 2 ), 2, 3
+                                    , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
+
+    ASSERT_TRUE( sx2.extent(0) == 2 );
+    ASSERT_TRUE( sx2.extent(1) == 2 );
+    ASSERT_TRUE( & sx2( 0, 0 ) == & x_static_8( 1, 0, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 0 ) == & x_static_8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 0, 1 ) == & x_static_8( 1, 0, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 1 ) == & x_static_8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
+
+    Kokkos::View< int****, Kokkos::LayoutStride, Space > sx4 =
+      Kokkos::subview( x_static_8, 0, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                                 , 1, Kokkos::pair< int, int >( 1, 3 ) /* of [5] */
+                                 , 1, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                                 , 2, Kokkos::pair< int, int >( 2, 4 ) /* of [5] */
+                     );
+
+    ASSERT_TRUE( sx4.extent(0) == 2 );
+    ASSERT_TRUE( sx4.extent(1) == 2 );
+    ASSERT_TRUE( sx4.extent(2) == 2 );
+    ASSERT_TRUE( sx4.extent(3) == 2 );
+    for ( int i0 = 0; i0 < (int) sx4.extent(0); ++i0 )
+    for ( int i1 = 0; i1 < (int) sx4.extent(1); ++i1 )
+    for ( int i2 = 0; i2 < (int) sx4.extent(2); ++i2 )
+    for ( int i3 = 0; i3 < (int) sx4.extent(3); ++i3 )
+    {
+      ASSERT_TRUE( & sx4( i0, i1, i2, i3 ) == & x_static_8( 0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3 ) );
+    }
+  }
+}
+
+template< class Space >
+void test_right_1()
+{
+  typedef Kokkos::View< int ****[2][3][4][5], Kokkos::LayoutRight, Space > view_type;
+
+  if ( Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace, typename Space::memory_space>::accessible ) {
+    view_type x8( "x_right_8", 2, 3, 4, 5 );
+
+    Kokkos::View< int, Kokkos::LayoutRight, Space > x0 = Kokkos::subview( x8, 0, 0, 0, 0, 0, 0, 0, 0 );
+
+    ASSERT_TRUE( & x0() == & x8( 0, 0, 0, 0, 0, 0, 0, 0 ) );
+
+    Kokkos::View< int*, Kokkos::LayoutRight, Space > x1 =
+      Kokkos::subview( x8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair< int, int >( 1, 3 ) );
+
+    ASSERT_TRUE( & x1( 0 ) == & x8( 0, 1, 2, 3, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x1( 1 ) == & x8( 0, 1, 2, 3, 0, 1, 2, 2 ) );
+
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2 =
+      Kokkos::subview( x8, 0, 1, 2, Kokkos::pair< int, int >( 1, 3 )
+                         , 0, 1, 2, Kokkos::pair< int, int >( 1, 3 ) );
+
+    ASSERT_TRUE( & x2( 0, 0 ) == & x8( 0, 1, 2, 1, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x2( 1, 0 ) == & x8( 0, 1, 2, 2, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x2( 0, 1 ) == & x8( 0, 1, 2, 1, 0, 1, 2, 2 ) );
+    ASSERT_TRUE( & x2( 1, 1 ) == & x8( 0, 1, 2, 2, 0, 1, 2, 2 ) );
+
+    // Kokkos::View< int**, Kokkos::LayoutRight, Space > error_2 =
+    Kokkos::View< int**, Kokkos::LayoutStride, Space > sx2 =
+      Kokkos::subview( x8, 1, Kokkos::pair< int, int >( 0, 2 ), 2, 3
+                            , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
+
+    ASSERT_TRUE( & sx2( 0, 0 ) == & x8( 1, 0, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 0 ) == & x8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 0, 1 ) == & x8( 1, 0, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 1 ) == & x8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
+
+    Kokkos::View< int****, Kokkos::LayoutStride, Space > sx4 =
+      Kokkos::subview( x8, 0, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                         , 1, Kokkos::pair< int, int >( 1, 3 ) /* of [5] */
+                         , 1, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                         , 2, Kokkos::pair< int, int >( 2, 4 ) /* of [5] */
+                     );
+
+    for ( int i0 = 0; i0 < (int) sx4.extent(0); ++i0 )
+    for ( int i1 = 0; i1 < (int) sx4.extent(1); ++i1 )
+    for ( int i2 = 0; i2 < (int) sx4.extent(2); ++i2 )
+    for ( int i3 = 0; i3 < (int) sx4.extent(3); ++i3 )
+    {
+      ASSERT_TRUE( & sx4( i0, i1, i2, i3 ) == & x8( 0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3 ) );
+    }
+  }
+}
+
+template< class Space >
+void test_right_3()
+{
+  typedef Kokkos::View< int **, Kokkos::LayoutRight, Space > view_type;
+
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    view_type xm( "x4", 10, 5 );
+
+    ASSERT_TRUE( xm.span_is_contiguous() );
+
+    Kokkos::View< int, Kokkos::LayoutRight, Space > x0 = Kokkos::subview( xm, 5, 3 );
+
+    ASSERT_TRUE( x0.span_is_contiguous() );
+    ASSERT_TRUE( & x0() == & xm( 5, 3 ) );
+
+    Kokkos::View< int*, Kokkos::LayoutRight, Space > x1 = Kokkos::subview( xm, 3, Kokkos::ALL );
+
+    ASSERT_TRUE( x1.span_is_contiguous() );
+    for ( int i = 0; i < int( xm.extent(1) ); ++i ) {
+      ASSERT_TRUE( & x1( i ) == & xm( 3, i ) );
+    }
+
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2c =
+      Kokkos::subview( xm, Kokkos::pair< int, int >( 1, 9 ), Kokkos::ALL );
+
+    ASSERT_TRUE( x2c.span_is_contiguous() );
+    for ( int j = 0; j < int( x2c.extent(1) ); ++j )
+    for ( int i = 0; i < int( x2c.extent(0) ); ++i ) {
+      ASSERT_TRUE( & x2c( i, j ) == & xm( 1 + i, j ) );
+    }
+
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2 =
+      Kokkos::subview( xm, Kokkos::ALL, std::pair< int, int >( 2, 4 ) );
+
+    ASSERT_TRUE( ! x2.span_is_contiguous() );
+    for ( int j = 0; j < int( x2.extent(1) ); ++j )
+    for ( int i = 0; i < int( x2.extent(0) ); ++i )
+    {
+      ASSERT_TRUE( & x2( i, j ) == & xm( i, 2 + j ) );
+    }
+
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2_n1 =
+      Kokkos::subview( xm, std::pair< int, int >( 1, 1 ), Kokkos::ALL );
+
+    ASSERT_TRUE( x2_n1.extent(0) == 0 );
+    ASSERT_TRUE( x2_n1.extent(1) == xm.extent(1) );
+
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2_n2 =
+      Kokkos::subview( xm, Kokkos::ALL, std::pair< int, int >( 1, 1 ) );
+
+    ASSERT_TRUE( x2_n2.extent(0) == xm.extent(0) );
+    ASSERT_TRUE( x2_n2.extent(1) == 0 );
+  }
+}
+
+namespace Impl {
+
+constexpr int N0 = 113;
+constexpr int N1 = 11;
+constexpr int N2 = 17;
+constexpr int N3 = 5;
+constexpr int N4 = 7;
+
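+// The test_Check* helpers compare a subview element-by-element against the
+// corresponding window of its source view and assert that no entries differ.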
+template< class SubView, class View >
+void test_Check1D( SubView a, View b, std::pair< int, int > range ) {
+  int errors = 0;
+
+  for ( int i = 0; i < range.second - range.first; i++ ) {
+    if ( a( i ) != b( i + range.first ) ) errors++;
+  }
+
+  if ( errors > 0 ) {
+    std::cout << "Error Suviews test_Check1D: " << errors << std::endl;
+  }
+
+  ASSERT_TRUE( errors == 0 );
+}
+
+template< class SubView, class View >
+void test_Check1D2D( SubView a, View b, int i0, std::pair< int, int > range ) {
+  int errors = 0;
+
+  for ( int i1 = 0; i1 < range.second - range.first; i1++ ) {
+    if ( a( i1 ) != b( i0, i1 + range.first ) ) errors++;
+  }
+
+  if ( errors > 0 ) {
+    std::cout << "Error Suviews test_Check1D2D: " << errors << std::endl;
+  }
+
+  ASSERT_TRUE( errors == 0 );
+}
+
+template< class SubView, class View >
+void test_Check2D3D( SubView a, View b, int i0, std::pair< int, int > range1
+                   , std::pair< int, int > range2 )
+{
+  int errors = 0;
+
+  for ( int i1 = 0; i1 < range1.second - range1.first; i1++ ) {
+    for ( int i2 = 0; i2 < range2.second - range2.first; i2++ ) {
+      if ( a( i1, i2 ) != b( i0, i1 + range1.first, i2 + range2.first ) ) errors++;
+    }
+  }
+
+  if ( errors > 0 ) {
+    std::cout << "Error Suviews test_Check2D3D: " << errors << std::endl;
+  }
+
+  ASSERT_TRUE( errors == 0 );
+}
+
+template<class SubView, class View>
+void test_Check3D5D( SubView a, View b, int i0, int i1, std::pair< int, int > range2
+                   , std::pair< int, int > range3, std::pair< int, int > range4 )
+{
+  int errors = 0;
+
+  for ( int i2 = 0; i2 < range2.second - range2.first; i2++ ) {
+    for ( int i3 = 0; i3 < range3.second - range3.first; i3++ ) {
+      for ( int i4 = 0; i4 < range4.second - range4.first; i4++ ) {
+        if ( a( i2, i3, i4 ) != b( i0, i1, i2 + range2.first, i3 + range3.first, i4 + range4.first ) ) {
+          errors++;
+        }
+      }
+    }
+  }
+
+  if ( errors > 0 ) {
+    std::cout << "Error Suviews test_Check3D5D: " << errors << std::endl;
+  }
+
+  ASSERT_TRUE( errors == 0 );
+}
+
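+// Exercise construction and assignment between 1D views with differing layouts
+// and static/dynamic extents, verifying element-wise equality after each step.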
+template< class Space, class LayoutSub, class Layout, class LayoutOrg, class MemTraits >
+void test_1d_assign_impl() {
+  { // Breaks.
+    Kokkos::View< int*, LayoutOrg, Space > a_org( "A", N0 );
+    Kokkos::View< int*, LayoutOrg, Space, MemTraits > a( a_org );
+    Kokkos::fence();
+    for ( int i = 0; i < N0; i++ ) a_org( i ) = i;
+
+    Kokkos::View< int[N0], Layout, Space, MemTraits > a1( a );
+    Kokkos::fence();
+    test_Check1D( a1, a, std::pair< int, int >( 0, N0 ) );
+
+    Kokkos::View< int[N0], LayoutSub, Space, MemTraits > a2( a1 );
+    Kokkos::fence();
+    test_Check1D( a2, a, std::pair< int, int >( 0, N0 ) );
+    a1 = a;
+    test_Check1D( a1, a, std::pair< int, int >( 0, N0 ) );
+
+    // Runtime Fail expected.
+    //Kokkos::View< int[N1] > afail1( a );
+
+    // Compile Time Fail expected.
+    //Kokkos::View< int[N1] > afail2( a1 );
+  }
+
+  { // Works.
+    Kokkos::View< int[N0], LayoutOrg, Space, MemTraits > a( "A" );
+    Kokkos::View< int*, Layout, Space, MemTraits > a1( a );
+    Kokkos::fence();
+    test_Check1D( a1, a, std::pair< int, int >( 0, N0 ) );
+    a1 = a;
+    Kokkos::fence();
+    test_Check1D( a1, a, std::pair< int, int >( 0, N0 ) );
+  }
+}
+
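+// Fill a rank-3 view with index-encoded values, take a rank-2 subview fixing
+// the first index (by assignment and by constructor), and verify its contents
+// for the given source/destination type combination.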
+template< class Space, class Type, class TypeSub, class LayoutSub, class Layout, class LayoutOrg, class MemTraits >
+void test_2d_subview_3d_impl_type() {
+  Kokkos::View< int***, LayoutOrg, Space > a_org( "A", N0, N1, N2 );
+  Kokkos::View< Type, Layout, Space, MemTraits > a( a_org );
+
+  for ( int i0 = 0; i0 < N0; i0++ )
+  for ( int i1 = 0; i1 < N1; i1++ )
+  for ( int i2 = 0; i2 < N2; i2++ )
+  {
+    a_org( i0, i1, i2 ) = i0 * 1000000 + i1 * 1000 + i2;
+  }
+
+  Kokkos::View< TypeSub, LayoutSub, Space, MemTraits > a1;
+  a1 = Kokkos::subview( a, 3, Kokkos::ALL, Kokkos::ALL );
+  Kokkos::fence();
+  test_Check2D3D( a1, a, 3, std::pair< int, int >( 0, N1 ), std::pair< int, int >( 0, N2 ) );
+
+  Kokkos::View< TypeSub, LayoutSub, Space, MemTraits > a2( a, 3, Kokkos::ALL, Kokkos::ALL );
+  Kokkos::fence();
+  test_Check2D3D( a2, a, 3, std::pair< int, int >( 0, N1 ), std::pair< int, int >( 0, N2 ) );
+}
+
+template< class Space, class LayoutSub, class Layout, class LayoutOrg, class MemTraits >
+void test_2d_subview_3d_impl_layout() {
+  test_2d_subview_3d_impl_type< Space, int[N0][N1][N2], int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int[N0][N1][N2], int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int[N0][N1][N2], int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_2d_subview_3d_impl_type< Space, int*   [N1][N2], int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int*   [N1][N2], int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int*   [N1][N2], int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_2d_subview_3d_impl_type< Space, int**      [N2], int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int**      [N2], int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int**      [N2], int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_2d_subview_3d_impl_type< Space, int***         , int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int***         , int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int***         , int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_2d_subview_3d_impl_type< Space, const int[N0][N1][N2], const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int[N0][N1][N2], const int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int[N0][N1][N2], const int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_2d_subview_3d_impl_type< Space, const int*   [N1][N2], const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int*   [N1][N2], const int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int*   [N1][N2], const int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_2d_subview_3d_impl_type< Space, const int**      [N2], const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int**      [N2], const int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int**      [N2], const int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_2d_subview_3d_impl_type< Space, const int***         , const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int***         , const int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int***         , const int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
+}
+
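+// Same pattern for rank 5: take a rank-3 subview fixing the first two indices
+// and verify its contents for the given type combination.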
+template< class Space, class Type, class TypeSub, class LayoutSub, class Layout, class LayoutOrg, class MemTraits >
+void test_3d_subview_5d_impl_type() {
+  Kokkos::View< int*****, LayoutOrg, Space > a_org( "A", N0, N1, N2, N3, N4 );
+  Kokkos::View< Type, Layout, Space, MemTraits > a( a_org );
+
+  for ( int i0 = 0; i0 < N0; i0++ )
+  for ( int i1 = 0; i1 < N1; i1++ )
+  for ( int i2 = 0; i2 < N2; i2++ )
+  for ( int i3 = 0; i3 < N3; i3++ )
+  for ( int i4 = 0; i4 < N4; i4++ )
+  {
+    a_org( i0, i1, i2, i3, i4 ) = i0 * 1000000 + i1 * 10000 + i2 * 100 + i3 * 10 + i4;
+  }
+
+  Kokkos::View< TypeSub, LayoutSub, Space, MemTraits > a1;
+  a1 = Kokkos::subview( a, 3, 5, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL );
+  Kokkos::fence();
+  test_Check3D5D( a1, a, 3, 5, std::pair< int, int >( 0, N2 ), std::pair< int, int >( 0, N3 ), std::pair< int, int >( 0, N4 ) );
+
+  Kokkos::View< TypeSub, LayoutSub, Space, MemTraits > a2( a, 3, 5, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL );
+  Kokkos::fence();
+  test_Check3D5D( a2, a, 3, 5, std::pair< int, int >( 0, N2 ), std::pair< int, int >( 0, N3 ), std::pair< int, int >( 0, N4 ) );
+}
+
+template< class Space, class LayoutSub, class Layout, class LayoutOrg, class MemTraits >
+void test_3d_subview_5d_impl_layout() {
+  test_3d_subview_5d_impl_type< Space, int[N0][N1][N2][N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int[N0][N1][N2][N3][N4], int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int[N0][N1][N2][N3][N4], int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int[N0][N1][N2][N3][N4], int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, int*   [N1][N2][N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*   [N1][N2][N3][N4], int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*   [N1][N2][N3][N4], int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*   [N1][N2][N3][N4], int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, int**      [N2][N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int**      [N2][N3][N4], int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int**      [N2][N3][N4], int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int**      [N2][N3][N4], int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, int***         [N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int***         [N3][N4], int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int***         [N3][N4], int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int***         [N3][N4], int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, int****            [N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int****            [N4], int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int****            [N4], int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int****            [N4], int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, int*****               , int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*****               , int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*****               , int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*****               , int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int[N0][N1][N2][N3][N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int[N0][N1][N2][N3][N4], const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int[N0][N1][N2][N3][N4], const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int[N0][N1][N2][N3][N4], const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int*   [N1][N2][N3][N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*   [N1][N2][N3][N4], const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*   [N1][N2][N3][N4], const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*   [N1][N2][N3][N4], const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int**      [N2][N3][N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int**      [N2][N3][N4], const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int**      [N2][N3][N4], const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int**      [N2][N3][N4], const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int***         [N3][N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int***         [N3][N4], const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int***         [N3][N4], const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int***         [N3][N4], const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int****            [N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int****            [N4], const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int****            [N4], const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int****            [N4], const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int*****               , const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*****               , const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*****               , const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*****               , const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+}
+
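+// Static checks of Kokkos::Impl::SubviewLegalArgsCompileTime: each assertion
+// records whether the listed argument pattern is accepted for producing a
+// LayoutRight subview from a LayoutRight source.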
+inline
+void test_subview_legal_args_right() {
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+
+  ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+}
+
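+// Corresponding compile-time legality checks for LayoutLeft subviews of
+// LayoutLeft sources.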
+inline
+void test_subview_legal_args_left() {
+  ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+
+  ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+}
+
+} // namespace Impl
+
+template< class Space, class MemTraits = void >
+void test_1d_assign() {
+  Impl::test_1d_assign_impl< Space, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, MemTraits >();
+  //Impl::test_1d_assign_impl< Space, Kokkos::LayoutRight, Kokkos::LayoutLeft, Kokkos::LayoutLeft >();
+  Impl::test_1d_assign_impl< Space, Kokkos::LayoutStride, Kokkos::LayoutLeft, Kokkos::LayoutLeft, MemTraits >();
+  //Impl::test_1d_assign_impl< Space, Kokkos::LayoutLeft, Kokkos::LayoutRight, Kokkos::LayoutLeft >();
+  Impl::test_1d_assign_impl< Space, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits >();
+  Impl::test_1d_assign_impl< Space, Kokkos::LayoutStride, Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits >();
+  //Impl::test_1d_assign_impl< Space, Kokkos::LayoutLeft, Kokkos::LayoutStride, Kokkos::LayoutLeft >();
+  //Impl::test_1d_assign_impl< Space, Kokkos::LayoutRight, Kokkos::LayoutStride, Kokkos::LayoutLeft >();
+  Impl::test_1d_assign_impl< Space, Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::LayoutLeft, MemTraits >();
+}
+
+template< class Space, class MemTraits = void >
+void test_2d_subview_3d() {
+  Impl::test_2d_subview_3d_impl_layout< Space, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits >();
+  Impl::test_2d_subview_3d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits >();
+  Impl::test_2d_subview_3d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::LayoutRight, MemTraits >();
+  Impl::test_2d_subview_3d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutLeft,  Kokkos::LayoutLeft,  MemTraits >();
+  Impl::test_2d_subview_3d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::LayoutLeft,  MemTraits >();
+}
+
+template< class Space, class MemTraits = void >
+void test_3d_subview_5d_right() {
+  Impl::test_3d_subview_5d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits >();
+  Impl::test_3d_subview_5d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::LayoutRight, MemTraits >();
+}
+
+template< class Space, class MemTraits = void >
+void test_3d_subview_5d_left() {
+  Impl::test_3d_subview_5d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutLeft,  Kokkos::LayoutLeft,  MemTraits >();
+  Impl::test_3d_subview_5d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::LayoutLeft,  MemTraits >();
+}
+
+namespace Impl {
+
+template< class Layout, class Space >
+struct FillView_3D {
+  Kokkos::View< int***, Layout, Space > a;
+
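+  // Fills each cell of a with a signature value (1000000*i + 1000*j + k), deriving
+  // (i,j,k) from the flat iteration index ii according to the View's layout.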
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & ii ) const
+  {
+    const int i = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ii % a.extent(0)
+                : ii / ( a.extent(1) * a.extent(2) );
+
+    const int j = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ( ii / a.extent(0) ) % a.extent(1)
+                : ( ii / a.extent(2) ) % a.extent(1);
+
+    const int k = std::is_same< Layout, Kokkos::LayoutRight >::value
+                ? ii / ( a.extent(0) * a.extent(1) )
+                : ii % a.extent(2);
+
+    a( i, j, k ) = 1000000 * i + 1000 * j + k;
+  }
+};
+
+template< class Layout, class Space >
+struct FillView_4D {
+  Kokkos::View< int****, Layout, Space > a;
+
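+  // Rank-4 analogue of FillView_3D: writes 1000000*i + 10000*j + 100*k + l into each
+  // cell, with (i,j,k,l) derived from the flat index ii based on the layout.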
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & ii ) const {
+    const int i = std::is_same< Layout, Kokkos::LayoutLeft >::value
+              ? ii % a.extent(0)
+              : ii / ( a.extent(1) * a.extent(2) * a.extent(3) );
+
+    const int j = std::is_same< Layout, Kokkos::LayoutLeft >::value
+              ? ( ii / a.extent(0) ) % a.extent(1)
+              : ( ii / ( a.extent(2) * a.extent(3) ) % a.extent(1) );
+
+    const int k = std::is_same< Layout, Kokkos::LayoutRight >::value
+              ? ( ii / ( a.extent(0) * a.extent(1) ) ) % a.extent(2)
+              : ( ii / a.extent(3) ) % a.extent(2);
+
+    const int l = std::is_same< Layout, Kokkos::LayoutRight >::value
+                ? ii / ( a.extent(0) * a.extent(1) * a.extent(2) )
+                : ii % a.extent(3);
+
+    a( i, j, k, l ) = 1000000 * i + 10000 * j + 100 * k + l;
+  }
+};
+
+template< class Layout, class Space, class MemTraits >
+struct CheckSubviewCorrectness_3D_3D {
+  Kokkos::View< const int***, Layout, Space, MemTraits > a;
+  Kokkos::View< const int***, Layout, Space, MemTraits > b;
+  int offset_0, offset_2;
+
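+  // Compares elements of the subview b against the corresponding elements of the
+  // parent view a (shifted by offset_0 and offset_2) and aborts on any mismatch.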
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & ii ) const
+  {
+    const int i = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ii % b.extent(0)
+                : ii / ( b.extent(1) * b.extent(2) );
+
+    const int j = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ( ii / b.extent(0) ) % b.extent(1)
+                : ( ii / b.extent(2) ) % b.extent(1);
+
+    const int k = std::is_same< Layout, Kokkos::LayoutRight >::value
+                ? ii / ( b.extent(0) * b.extent(1) )
+                : ii % b.extent(2);
+
+    if ( a( i + offset_0, j, k + offset_2 ) != b( i, j, k ) ) {
+      Kokkos::abort( "Error: check_subview_correctness 3D-3D (LayoutLeft -> LayoutLeft or LayoutRight -> LayoutRight)" );
+    }
+  }
+};
+
+template< class Layout, class Space, class MemTraits >
+struct CheckSubviewCorrectness_3D_4D {
+  Kokkos::View< const int****, Layout, Space, MemTraits > a;
+  Kokkos::View< const int***, Layout, Space, MemTraits > b;
+  int offset_0, offset_2, index;
+
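+  // Compares the rank-3 subview b against the rank-4 parent a; the fixed index goes
+  // in the last dimension for LayoutLeft and the first dimension for LayoutRight,
+  // matching how the subviews are taken in the tests below.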
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & ii ) const {
+    const int i = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ii % b.extent(0)
+                : ii / ( b.extent(1) * b.extent(2) );
+
+    const int j = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ( ii / b.extent(0) ) % b.extent(1)
+                : ( ii / b.extent(2) ) % b.extent(1);
+
+    const int k = std::is_same< Layout, Kokkos::LayoutRight >::value
+                ? ii / ( b.extent(0) * b.extent(1) )
+                : ii % b.extent(2);
+
+    int i0, i1, i2, i3;
+
+    if ( std::is_same< Layout, Kokkos::LayoutLeft >::value ) {
+      i0 = i + offset_0;
+      i1 = j;
+      i2 = k + offset_2;
+      i3 = index;
+    }
+    else {
+      i0 = index;
+      i1 = i + offset_0;
+      i2 = j;
+      i3 = k + offset_2;
+    }
+
+    if ( a( i0, i1, i2, i3 ) != b( i, j, k ) ) {
+      Kokkos::abort( "Error: check_subview_correctness 3D-4D (LayoutLeft -> LayoutLeft or LayoutRight -> LayoutRight)" );
+    }
+  }
+};
+
+} // namespace Impl
+
+template< class Space, class MemTraits = void >
+void test_layoutleft_to_layoutleft() {
+  Impl::test_subview_legal_args_left();
+
+  {
+    Kokkos::View< int***, Kokkos::LayoutLeft, Space > a( "A", 100, 4, 3 );
+    Kokkos::View< int***, Kokkos::LayoutLeft, Space > b( a, Kokkos::pair< int, int >( 16, 32 ), Kokkos::ALL, Kokkos::ALL );
+
+    Impl::FillView_3D< Kokkos::LayoutLeft, Space > fill;
+    fill.a = a;
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, a.extent( 0 ) * a.extent( 1 ) * a.extent( 2 ) ), fill );
+
+    Impl::CheckSubviewCorrectness_3D_3D< Kokkos::LayoutLeft, Space, MemTraits > check;
+    check.a = a;
+    check.b = b;
+    check.offset_0 = 16;
+    check.offset_2 = 0;
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, b.extent( 0 ) * b.extent( 1 ) * b.extent( 2 ) ), check );
+  }
+
+  {
+    Kokkos::View< int***, Kokkos::LayoutLeft, Space > a( "A", 100, 4, 5 );
+    Kokkos::View< int***, Kokkos::LayoutLeft, Space > b( a, Kokkos::pair< int, int >( 16, 32 ), Kokkos::ALL, Kokkos::pair< int, int >( 1, 3 ) );
+
+    Impl::FillView_3D<Kokkos::LayoutLeft, Space> fill;
+    fill.a = a;
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, a.extent( 0 ) * a.extent( 1 ) * a.extent( 2 ) ), fill );
+
+    Impl::CheckSubviewCorrectness_3D_3D< Kokkos::LayoutLeft, Space, MemTraits > check;
+    check.a = a;
+    check.b = b;
+    check.offset_0 = 16;
+    check.offset_2 = 1;
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, b.extent( 0 ) * b.extent( 1 ) * b.extent( 2 ) ), check );
+  }
+
+  {
+    Kokkos::View< int****, Kokkos::LayoutLeft, Space > a( "A", 100, 4, 5, 3 );
+    Kokkos::View< int***, Kokkos::LayoutLeft, Space > b( a, Kokkos::pair< int, int >( 16, 32 ), Kokkos::ALL, Kokkos::pair< int, int >( 1, 3 ), 1 );
+
+    Impl::FillView_4D< Kokkos::LayoutLeft, Space > fill;
+    fill.a = a;
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, a.extent( 0 ) * a.extent( 1 ) * a.extent( 2 ) * a.extent( 3 ) ), fill );
+
+    Impl::CheckSubviewCorrectness_3D_4D< Kokkos::LayoutLeft, Space, MemTraits > check;
+    check.a = a;
+    check.b = b;
+    check.offset_0 = 16;
+    check.offset_2 = 1;
+    check.index = 1;
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, b.extent( 0 ) * b.extent( 1 ) * b.extent( 2 ) ), check );
+  }
+}
+
+template< class Space, class MemTraits = void >
+void test_layoutright_to_layoutright() {
+  Impl::test_subview_legal_args_right();
+
+  {
+    Kokkos::View< int***, Kokkos::LayoutRight, Space > a( "A", 100, 4, 3 );
+    Kokkos::View< int***, Kokkos::LayoutRight, Space > b( a, Kokkos::pair< int, int >( 16, 32 ), Kokkos::ALL, Kokkos::ALL );
+
+    Impl::FillView_3D<Kokkos::LayoutRight, Space> fill;
+    fill.a = a;
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, a.extent( 0 ) * a.extent( 1 ) * a.extent( 2 ) ), fill );
+
+    Impl::CheckSubviewCorrectness_3D_3D< Kokkos::LayoutRight, Space, MemTraits > check;
+    check.a = a;
+    check.b = b;
+    check.offset_0 = 16;
+    check.offset_2 = 0;
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, b.extent( 0 ) * b.extent( 1 ) * b.extent( 2 ) ), check );
+  }
+
+  {
+    Kokkos::View< int****, Kokkos::LayoutRight, Space > a( "A", 3, 4, 5, 100 );
+    Kokkos::View< int***, Kokkos::LayoutRight, Space > b( a, 1, Kokkos::pair< int, int >( 1, 3 ), Kokkos::ALL, Kokkos::ALL );
+
+    Impl::FillView_4D< Kokkos::LayoutRight, Space > fill;
+    fill.a = a;
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, a.extent( 0 ) * a.extent( 1 ) * a.extent( 2 ) * a.extent( 3 ) ), fill );
+
+    Impl::CheckSubviewCorrectness_3D_4D< Kokkos::LayoutRight, Space, MemTraits > check;
+    check.a = a;
+    check.b = b;
+    check.offset_0 = 1;
+    check.offset_2 = 0;
+    check.index = 1;
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, b.extent( 0 ) * b.extent( 1 ) * b.extent( 2 ) ), check );
+  }
+}
+
+//----------------------------------------------------------------------------
+
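+// Verifies that an unmanaged subview can be re-pointed at successive slices of its
+// parent view via assign_data(), without constructing a new subview each time.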
+template< class Space >
+struct TestUnmanagedSubviewReset
+{
+  Kokkos::View<int****,Space> a ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ) const noexcept
+    {
+      auto sub_a = Kokkos::subview(a,0,Kokkos::ALL,Kokkos::ALL,Kokkos::ALL);
+
+      for ( int i = 0 ; i < int(a.extent(0)) ; ++i ) {
+        sub_a.assign_data( & a(i,0,0,0) );
+        if ( & sub_a(1,1,1) != & a(i,1,1,1) ) {
+          Kokkos::abort("TestUnmanagedSubviewReset");
+        }
+      }
+    }
+
+  TestUnmanagedSubviewReset()
+    : a( Kokkos::view_alloc() , 20 , 10 , 5 , 2 )
+    {}
+};
+
+template< class Space >
+void test_unmanaged_subview_reset()
+{
+  Kokkos::parallel_for
+    ( Kokkos::RangePolicy< typename Space::execution_space >(0,1)
+    , TestUnmanagedSubviewReset<Space>()
+    );
+}
+
+} // namespace TestViewSubview
+
+#endif
+
diff --git a/packages/kokkos/core/unit_test/TestWorkGraph.hpp b/packages/kokkos/core/unit_test/TestWorkGraph.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b7bbb2759fa9c77bf80d9548e2fe4cb7b0e98386
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestWorkGraph.hpp
@@ -0,0 +1,171 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <vector>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+namespace {
+
+/* This test is meant to be the WorkGraph equivalent of the Task DAG Scheduler test;
+   see TestTaskScheduler.hpp for that test.
+   The algorithm computes the N-th Fibonacci number as follows:
+    - Each "task" or "work item" computes the i-th Fibonacci number.
+    - If a task has (i < 2), it will record the known answer ahead of time.
+    - If a task has (i >= 2), it will "spawn" two more tasks to compute
+      the (i - 1) and (i - 2) Fibonacci numbers.
+      We do NOT do any de-duplication of these tasks.
+      De-duplication would result in only (N - 2) tasks, which would have to run in serial.
+      We allow duplicates both to increase the number of tasks and to increase the
+      amount of available parallelism.
+ */
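+
+/* Illustrative sketch (not executed by the test): for m_input = 3, form_host_graph()
+   below produces the work items
+     index : 0   1   2   3   4
+     input : 3   2   1   1   0
+     parent: -1  0   0   1   1
+   Items 2-4 are base cases whose values are recorded up front; running the graph
+   then yields values(1) = values(3) + values(4) = 1 and
+   values(0) = values(1) + values(2) = 2 = fib(3). */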
+
+template< class ExecSpace >
+struct TestWorkGraph {
+
+  using MemorySpace = typename ExecSpace::memory_space;
+  using Policy = Kokkos::WorkGraphPolicy<std::int32_t, ExecSpace>;
+  using Graph = typename Policy::graph_type;
+  using RowMap = typename Graph::row_map_type;
+  using Entries = typename Graph::entries_type;
+  using Values = Kokkos::View<long*, MemorySpace>;
+
+  long m_input;
+  Graph m_graph;
+  Graph m_transpose;
+  Values m_values;
+
+  TestWorkGraph(long arg_input):m_input(arg_input) {
+    form_graph();
+    transpose_crs(m_transpose, m_graph);
+  }
+
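+  // Sequential reference implementation: computes fib(n) iteratively using a small
+  // rolling buffer indexed by (i & mask).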
+  inline
+  long full_fibonacci( long n ) {
+    constexpr long mask = 0x03;
+    long fib[4] = { 0, 1, 1, 2 };
+    for ( long i = 2; i <= n; ++i ) {
+      fib[ i & mask ] = fib[ ( i - 1 ) & mask ] + fib[ ( i - 2 ) & mask ];
+    }
+    return fib[ n & mask ];
+  }
+
+  struct HostEntry {
+    long input;
+    std::int32_t parent;
+  };
+  std::vector<HostEntry> form_host_graph() {
+    std::vector<HostEntry> g;
+    g.push_back({ m_input , -1 });
+    for (std::int32_t i = 0; i < std::int32_t(g.size()); ++i) {
+      auto e = g.at(std::size_t(i));
+      if (e.input < 2) continue;
+      /* This part of the host graph formation is the equivalent of task spawning
+         in the Task DAG system. Notice how each task that is not a base case
+         spawns two more tasks, without any de-duplication. */
+      g.push_back({ e.input - 1, i });
+      g.push_back({ e.input - 2, i });
+    }
+    return g;
+  }
+
+  void form_graph() {
+    auto hg = form_host_graph();
+    m_graph.row_map = RowMap("row_map", hg.size() + 1); // row map always has one more
+    m_graph.entries = Entries("entries", hg.size() - 1); // all but the first have a parent
+    m_values = Values("values", hg.size());
+    //printf("%zu work items\n", hg.size());
+    auto h_row_map = Kokkos::create_mirror_view(m_graph.row_map);
+    auto h_entries = Kokkos::create_mirror_view(m_graph.entries);
+    auto h_values = Kokkos::create_mirror_view(m_values);
+    h_row_map(0) = 0;
+    for (std::int32_t i = 0; i < std::int32_t(hg.size()); ++i) {
+      auto& e = hg.at(std::size_t(i));
+      h_row_map(i + 1) = i;
+      if (e.input < 2) {
+        h_values(i) = e.input;
+      }
+      if (e.parent == -1) continue;
+      h_entries(i - 1) = e.parent;
+    }
+    Kokkos::deep_copy(m_graph.row_map, h_row_map);
+    Kokkos::deep_copy(m_graph.entries, h_entries);
+    Kokkos::deep_copy(m_values, h_values);
+  }
+
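+  // Work item i runs only after the items it depends on; it sums the values of its
+  // children (found through the transposed graph) into m_values(i).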
+  KOKKOS_INLINE_FUNCTION
+  void operator()(std::int32_t i) const {
+    auto begin = m_transpose.row_map(i);
+    auto end = m_transpose.row_map(i + 1);
+    for (auto j = begin; j < end; ++j) {
+      auto k = m_transpose.entries(j);
+      m_values(i) += m_values( k );
+    }
+  }
+
+  void test_for() {
+    Kokkos::parallel_for(Policy(m_graph), *this);
+    auto h_values = Kokkos::create_mirror_view(m_values);
+    Kokkos::deep_copy(h_values, m_values);
+    ASSERT_EQ( h_values(0), full_fibonacci(m_input) );
+  }
+
+};
+
+} // anonymous namespace
+
+TEST_F( TEST_CATEGORY, workgraph_fib )
+{
+  int limit = 27;
+  for ( int i = 0; i < limit; ++i) {
+    TestWorkGraph< TEST_EXECSPACE > f(i);
+    f.test_for();
+  }
+  //TestWorkGraph< TEST_EXECSPACE > f(2);
+  //f.test_for();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/UnitTestConfig.make b/packages/kokkos/core/unit_test/UnitTestConfig.make
new file mode 100644
index 0000000000000000000000000000000000000000..97f4af5a8b7d9eeb7568363eeca17b0cca2f8533
--- /dev/null
+++ b/packages/kokkos/core/unit_test/UnitTestConfig.make
@@ -0,0 +1,52 @@
+KOKKOS_PATH = ../..
+
+# See $(KOKKOS_PATH)/Makefile.kokkos and $(KOKKOS_PATH)/generate_makefile.bash
+KOKKOS_ARCH_OPTIONS="None AMDAVX ARMv80 ARMv81 ARMv8-ThunderX \
+	 BGQ Power7 Power8 Power9 \
+	 WSM SNB HSW BDW SKX KNC KNL \
+	 Kepler Kepler30 Kepler32 Kepler35 Kepler37 \
+	 Maxwell Maxwell50 Maxwell52 Maxwell53 Pascal60 Pascal61"
+#KOKKOS_ARCH_OPTIONS="AMDAVX"
+
+KOKKOS_DEVICE_OPTIONS="Cuda ROCm OpenMP Pthread Serial Qthreads"
+#KOKKOS_DEVICE_OPTIONS="Cuda"
+
+# Configure paths to enable environment query in Makefile.kokkos to work
+ROCM_HCC_PATH="config"
+CXX="./config/cxx"
+ipath=env CXX=$(CXX) env PATH=./config:$$PATH env ROCM_HCC_PATH=$(ROCM_HCC_PATH)
+
+# Defined in core/src/Makefile -- this should be consistent
+KOKKOS_MAKEFILE=Makefile.kokkos
+KOKKOS_CMAKEFILE=kokkos_generated_settings.cmake
+
+# Defined in Makefile.kokkos -- this should be consistent
+KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp
+KOKKOS_CONFIG_HEADER=KokkosCore_config.h
+
+d='\#'
+
+# diff => 0 is no difference.  if => 0 is false
+testmake=if test "`testmake.sh $1 $2 $3`" = 'Passed'; then echo OK $d $1; else echo not OK $d $1; fi
+testconf=if test "`diffconfig.sh $1`" = 'Passed'; then echo OK $d $1; else echo not OK $d $1; fi
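+# Example invocation (for illustration): with karch=AMDAVX and device=Serial the test
+# target runs $(call testmake,AMDAVX_Serial_Makefile.kokkos,AMDAVX,Serial), which
+# echoes "OK # AMDAVX_Serial_Makefile.kokkos" when testmake.sh reports 'Passed'.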
+
+# testing tmp and cmakefile files is unnecessary here
+test:
+	@for karch in "$(KOKKOS_ARCH_OPTIONS)"; do \
+	  for device in "$(KOKKOS_DEVICE_OPTIONS)"; do \
+	     $(ipath) KOKKOS_DEVICES=$$device KOKKOS_ARCH=$$karch make -e -f ../src/Makefile build-makefile-cmake-kokkos; \
+		 rm -f $(KOKKOS_INTERNAL_CONFIG_TMP) $(KOKKOS_CMAKEFILE); \
+		 prfx="$$karch"_"$$device"_; \
+		 newmake="$$prfx"$(KOKKOS_MAKEFILE);  \
+		 newconf="$$prfx"$(KOKKOS_CONFIG_HEADER); \
+		 mv $(KOKKOS_MAKEFILE)      config/tmpstore/$$newmake; \
+		 mv $(KOKKOS_CONFIG_HEADER) config/tmpstore/$$newconf; \
+		 $(call testmake,$$newmake,$$karch,$$device); \
+		 $(call testconf,$$newconf); \
+	  done; \
+	done
+
+test-cmake:
+	@cd config/cmaketest; \
+     cmake . ; \
+     make test
diff --git a/packages/kokkos/core/unit_test/UnitTestMain.cpp b/packages/kokkos/core/unit_test/UnitTestMain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5f9091eced47c420c1055c4826dbc0025bd23ca2
--- /dev/null
+++ b/packages/kokkos/core/unit_test/UnitTestMain.cpp
@@ -0,0 +1,50 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+#include <cstdlib>
+
+int main( int argc, char *argv[] ) {
+  ::testing::InitGoogleTest( &argc, argv );
+  return RUN_ALL_TESTS();
+}
diff --git a/packages/kokkos/core/unit_test/UnitTestMainInit.cpp b/packages/kokkos/core/unit_test/UnitTestMainInit.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..82c8ee89958d1a320a66e6e9ad31163e359ed919
--- /dev/null
+++ b/packages/kokkos/core/unit_test/UnitTestMainInit.cpp
@@ -0,0 +1,56 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+#include <cstdlib>
+
+#include <Kokkos_Core.hpp>
+
+int main( int argc, char *argv[] ) {
+  Kokkos::initialize(argc,argv);
+  ::testing::InitGoogleTest( &argc, argv );
+
+  int result =  RUN_ALL_TESTS();
+  Kokkos::finalize();
+  return result;
+}
diff --git a/packages/kokkos/core/unit_test/UnitTest_PushFinalizeHook.cpp b/packages/kokkos/core/unit_test/UnitTest_PushFinalizeHook.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..97a7f9df26261e4a6bebfc421aa53ef98c683ed7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/UnitTest_PushFinalizeHook.cpp
@@ -0,0 +1,139 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+#include <sstream>
+#include <Kokkos_Core.hpp>
+
+namespace { // (anonymous)
+
+// Output for the finalize hooks.  Use this to make sure that all the
+// hooks ran, and that they ran in the correct order.
+std::ostringstream hookOutput;
+
+const char hook1str[] = "Behold, I am Hook 1; first pushed, last to be called.";
+const char hook2str[] = "Yea verily, I am Hook 2.";
+const char hook3str[] = "Indeed, I am Hook 3.";
+const char hook4str[] = "Last but not least, I am Hook 4.";
+
+} // namespace (anonymous)
+
+// Don't just have all the hooks print the same thing except for a
+// number.  Have them print different things, so we can detect
+// interleaving.  The hooks need to run sequentially, in LIFO order.
+// Also, make sure that the function accepts at least the following
+// kinds of hooks:
+//
+// 1. A plain old function that takes no arguments and returns nothing.
+// 2. A lambda that can be assigned to std::function<void()>
+// 3. An actual std::function<void()>
+// 4. A named object with operator().  This is what C++ programmers
+//    unfortunately like to call "functor," even though this word
+//    means something different in other languages.
+
+void hook1 () {
+  hookOutput << hook1str << std::endl;
+}
+
+struct Hook4 {
+  void operator () () const {
+    hookOutput << hook4str << std::endl;
+  }
+};
+
+int main( int argc, char *argv[] ) {
+  using std::cout;
+  using std::endl;
+
+  const std::string expectedOutput ([] {
+      std::ostringstream os;
+      os << hook4str << endl
+         << hook3str << endl
+         << hook2str << endl
+         << hook1str << endl;
+      return os.str();
+    }());
+
+  Kokkos::initialize(argc, argv);
+
+  Kokkos::push_finalize_hook(hook1); // plain old function
+  Kokkos::push_finalize_hook ([] {
+      hookOutput << hook2str << endl;
+    }); // lambda
+  std::function<void()> hook3 = [] {
+    hookOutput << hook3str << endl;
+  };
+  Kokkos::push_finalize_hook(hook3); // actual std::function
+  Hook4 hook4;
+  Kokkos::push_finalize_hook(hook4); // function object instance
+
+  // This should invoke the finalize hooks in reverse order.
+  // Furthermore, it should not throw an exception.
+  try {
+    Kokkos::finalize();
+  }
+  catch (std::exception& e) {
+    cout << "FAILED: Kokkos::finalize threw an exception: " << e.what() << endl;
+    return EXIT_FAILURE;
+  }
+  catch (...) {
+    cout << "FAILED: Kokkos::finalize threw an exception whose base class "
+      "is not std::exception." << endl;
+    return EXIT_FAILURE;
+  }
+
+  const bool success = (hookOutput.str() == expectedOutput);
+  if (success) {
+    cout << "SUCCESS" << endl;
+  }
+  else {
+    cout << "FAILED:" << endl
+         << "  Expected output:" << endl
+         << expectedOutput << endl
+         << "  Actual output:" << endl
+         << hookOutput.str() << endl;
+  }
+  return success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/packages/kokkos/core/unit_test/UnitTest_PushFinalizeHook_terminate.cpp b/packages/kokkos/core/unit_test/UnitTest_PushFinalizeHook_terminate.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..50b7a3f2850a1d8518c22aad064e5203d02c6a52
--- /dev/null
+++ b/packages/kokkos/core/unit_test/UnitTest_PushFinalizeHook_terminate.cpp
@@ -0,0 +1,86 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdlib>
+#include <iostream>
+#include <exception>
+#include <Kokkos_Core.hpp>
+
+// If any of the finalize hooks given to Kokkos::push_finalize_hook
+// throws but does not catch an exception, make sure that
+// Kokkos::finalize calls std::terminate.
+
+namespace { // (anonymous)
+
+// If you change this, change CMakeLists.txt in this directory too!
+// I verified that changing this string makes the test fail.
+const char my_terminate_str[] = "PASSED: I am the custom std::terminate handler.";
+
+// Tell compilers not to complain that this function doesn't return.
+[[ noreturn ]] void my_terminate_handler ()
+{
+  std::cerr << my_terminate_str << std::endl;
+  std::abort(); // terminate handlers normally would end by calling this
+}
+
+} // namespace (anonymous)
+
+int main(int argc, char *argv[])
+{
+  // If std::terminate is called, it will call my_terminate_handler.
+  std::set_terminate (my_terminate_handler);
+
+  Kokkos::initialize(argc, argv);
+  Kokkos::push_finalize_hook([] {
+      throw std::runtime_error ("I am an uncaught exception!");
+    });
+
+  // This should call std::terminate, which in turn will call
+  // my_terminate_handler above.  That will print the message that
+  // makes this test count as passed.
+  Kokkos::finalize();
+
+  // The test actually failed if we got to this point.
+  std::cerr << "FAILED to call std::terminate!" << std::endl;
+  return EXIT_FAILURE;
+}
diff --git a/packages/kokkos/core/unit_test/config/bin/hcc-config b/packages/kokkos/core/unit_test/config/bin/hcc-config
new file mode 100755
index 0000000000000000000000000000000000000000..fc09138bcc607eb0b750ecd5c6f20c591885f57e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/bin/hcc-config
@@ -0,0 +1,2 @@
+#!/bin/sh
+echo "--foo --bar"
diff --git a/packages/kokkos/core/unit_test/config/clang b/packages/kokkos/core/unit_test/config/clang
new file mode 100755
index 0000000000000000000000000000000000000000..34c69194106abcee50cfcbc69a0e789544dd6396
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/clang
@@ -0,0 +1,5 @@
+#!/bin/sh
+echo="Apple LLVM version 8.1.0 (clang-802.0.42)"
+echo="Target: x86_64-apple-darwin16.7.0"
+echo="Thread model: posix"
+echo="InstalledDir: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin"
diff --git a/packages/kokkos/core/unit_test/config/cmaketest/CMakeLists.txt b/packages/kokkos/core/unit_test/config/cmaketest/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..54a4c4a74a19d0699f406f602a9930cc46674ab8
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/cmaketest/CMakeLists.txt
@@ -0,0 +1,80 @@
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+project(Kokkos CXX)
+
+enable_testing()
+
+# Initialization
+get_filename_component(KOKKOS_TESTDIR ${CMAKE_SOURCE_DIR}/../.. REALPATH)
+get_filename_component(KOKKOS_SRCDIR ${CMAKE_SOURCE_DIR}/../../../.. REALPATH)
+set(KOKKOS_SRC_PATH ${KOKKOS_SRCDIR})
+set(KOKKOS_PATH ${KOKKOS_SRC_PATH})
+
+set(CXX ${KOKKOS_TESTDIR}/config/cxx)
+
+# Defined in core/src/Makefile -- this should be consistent
+set(KOKKOS_MAKEFILE Makefile.kokkos)
+set(KOKKOS_CMAKEFILE kokkos_generated_settings.cmake)
+
+# Defined in Makefile.kokkos -- this should be consistent
+set(KOKKOS_INTERNAL_CONFIG_TMP KokkosCore_config.tmp)
+set(KOKKOS_CONFIG_HEADER KokkosCore_config.h)
+
+set(KOKKOS_CMAKE_VERBOSE False)
+include(${KOKKOS_SRCDIR}/cmake/kokkos_options.cmake)
+foreach(KOKKOS_DEV ${KOKKOS_DEVICES_LIST})
+# Do some initialization: Want to turn everything off for testing
+  string(TOUPPER ${KOKKOS_DEV} KOKKOS_DEVUC)
+  set(KOKKOS_ENABLE_${KOKKOS_DEVUC} OFF)
+endforeach()
+
+
+#TEST set(KOKKOS_HOST_ARCH_LIST ARMv80)
+#TEST set(KOKKOS_DEVICES_LIST Cuda)
+#set(KOKKOS_HOST_ARCH_LIST AMDAVX)
+#set(KOKKOS_DEVICES_LIST Cuda)
+
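+# For every (host arch, device) combination, generate the Makefile and config header
+# through the core Makefile, then register CTest cases that run testmake.sh and
+# diffconfig.sh on the renamed outputs and check their output for "Passed".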
+foreach(KOKKOS_HOST_ARCH ${KOKKOS_HOST_ARCH_LIST})
+  foreach(KOKKOS_DEV ${KOKKOS_DEVICES_LIST})
+    string(TOUPPER ${KOKKOS_DEV} KOKKOS_DEVUC)
+    set(KOKKOS_ENABLE_${KOKKOS_DEVUC} On)
+
+    set(KOKKOS_CMAKE_VERBOSE True)
+    include(${KOKKOS_SRCDIR}/cmake/kokkos_options.cmake)
+    set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} ROCM_HCC_PATH=${KOKKOS_TESTDIR}/config)
+
+    #message(STATUS "${KOKKOS_SETTINGS} make -f ${KOKKOS_SRCDIR}/core/src/Makefile build-makefile-cmake-kokkos")
+    execute_process(
+          COMMAND ${KOKKOS_SETTINGS} make -f ${KOKKOS_SRCDIR}/core/src/Makefile build-makefile-cmake-kokkos
+          WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+          OUTPUT_FILE ${CMAKE_BINARY_DIR}/core_src_make.out
+          RESULT_VARIABLE res
+        )
+    #message(STATUS "RESULT ${res}")
+
+    file(REMOVE ${KOKKOS_INTERNAL_CONFIG_TMP} ${KOKKOS_MAKEFILE})
+    set(PREFIX "${KOKKOS_HOST_ARCH}_${KOKKOS_DEV}_")
+    set(NEWCMAKE ${PREFIX}${KOKKOS_CMAKEFILE})
+    set(NEWCONFH ${PREFIX}${KOKKOS_CONFIG_HEADER})
+    file(RENAME ${KOKKOS_CMAKEFILE} ${NEWCMAKE})
+    file(RENAME ${KOKKOS_CONFIG_HEADER} ${NEWCONFH})
+      
+    add_test(NAME ${NEWCMAKE}-test 
+        COMMAND ${KOKKOS_TESTDIR}/testmake.sh ${NEWCMAKE}  ${KOKKOS_HOST_ARCH} ${KOKKOS_DEV}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+      )
+    set_tests_properties(${NEWCMAKE}-test 
+        PROPERTIES PASS_REGULAR_EXPRESSION Passed
+        TIMEOUT 15
+      )
+   add_test(NAME ${NEWCONFH}-test 
+        COMMAND ${KOKKOS_TESTDIR}/diffconfig.sh ${NEWCONFH} 
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+      )
+   set_tests_properties(${NEWCONFH}-test 
+        PROPERTIES PASS_REGULAR_EXPRESSION Passed
+        TIMEOUT 15
+      )
+    set(KOKKOS_ENABLE_${KOKKOS_DEVUC} Off)
+
+  endforeach()
+endforeach()
diff --git a/packages/kokkos/core/unit_test/config/cxx b/packages/kokkos/core/unit_test/config/cxx
new file mode 100755
index 0000000000000000000000000000000000000000..f25d7714a5aab6debd20424661d2bc92add9b571
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/cxx
@@ -0,0 +1,5 @@
+#!/bin/sh
+echo "g++ (GCC) 6.3.1 20161221 (Red Hat 6.3.1-1)"
+echo "Copyright (C) 2016 Free Software Foundation, Inc."
+echo "This is free software; see the source for copying conditions.  There is NO"
+echo "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
diff --git a/packages/kokkos/core/unit_test/config/mpic++ b/packages/kokkos/core/unit_test/config/mpic++
new file mode 100755
index 0000000000000000000000000000000000000000..f25d7714a5aab6debd20424661d2bc92add9b571
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/mpic++
@@ -0,0 +1,5 @@
+#!/bin/sh
+echo "g++ (GCC) 6.3.1 20161221 (Red Hat 6.3.1-1)"
+echo "Copyright (C) 2016 Free Software Foundation, Inc."
+echo "This is free software; see the source for copying conditions.  There is NO"
+echo "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
diff --git a/packages/kokkos/core/unit_test/config/nvcc b/packages/kokkos/core/unit_test/config/nvcc
new file mode 100755
index 0000000000000000000000000000000000000000..b5bcbf234cf08d351f63aaa176a25a98308131d2
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/nvcc
@@ -0,0 +1,5 @@
+#!/bin/sh
+echo "nvcc: NVIDIA (R) Cuda compiler driver"
+echo "Copyright (c) 2005-2016 NVIDIA Corporation"
+echo "Built on Tue_Jan_10_13:22:03_CST_2017"
+echo "Cuda compilation tools, release 8.0, V8.0.61"
diff --git a/packages/kokkos/core/unit_test/config/results/AMDAVX_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/AMDAVX_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a737a3b2fec1339cc9907b28206e4ee2c0ab0e9
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/AMDAVX_Cuda_KokkosCore_config.h
@@ -0,0 +1,18 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:09 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX 1
diff --git a/packages/kokkos/core/unit_test/config/results/AMDAVX_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/AMDAVX_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a704e41852fa7f57b255ab97ae320f51e9936af
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/AMDAVX_OpenMP_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:10 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX 1
diff --git a/packages/kokkos/core/unit_test/config/results/AMDAVX_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/AMDAVX_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..c478a5c252ae406cae5e69dd6e1f7d06f83b1055
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/AMDAVX_Pthread_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:10 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX 1
diff --git a/packages/kokkos/core/unit_test/config/results/AMDAVX_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/AMDAVX_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb5d2146300a1c256f054d74f369af05a89c9f5e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/AMDAVX_Qthreads_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:11 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX 1
diff --git a/packages/kokkos/core/unit_test/config/results/AMDAVX_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/AMDAVX_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b7e2b815344eb455bc66c27cdbe0d0c457b8817
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/AMDAVX_ROCm_KokkosCore_config.h
@@ -0,0 +1,18 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:09 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX 1
diff --git a/packages/kokkos/core/unit_test/config/results/AMDAVX_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/AMDAVX_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..9930bacc47ddd91c4bd468b8d7d3cd5a3f918e51
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/AMDAVX_Serial_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:11 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f172c00e42f0c767fdba70b05930617cf42ce27
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Cuda_KokkosCore_config.h
@@ -0,0 +1,19 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:17 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV80 1
+#define KOKKOS_ARCH_ARMV8_THUNDERX 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..d25b832ca2ba038018dadd1e1a93e26cb7cf5614
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_OpenMP_KokkosCore_config.h
@@ -0,0 +1,18 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:18 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV80 1
+#define KOKKOS_ARCH_ARMV8_THUNDERX 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd3a603092381bdcf67f8153e7cea7558a5ffc65
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Pthread_KokkosCore_config.h
@@ -0,0 +1,18 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:19 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV80 1
+#define KOKKOS_ARCH_ARMV8_THUNDERX 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..3865bc4a9a78f8de925bf1a9f12e64677cb6a4c3
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Qthreads_KokkosCore_config.h
@@ -0,0 +1,18 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:20 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV80 1
+#define KOKKOS_ARCH_ARMV8_THUNDERX 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..86b9f845859ad4239613299707c1e75bb1d0c3ab
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_ROCm_KokkosCore_config.h
@@ -0,0 +1,19 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:18 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV80 1
+#define KOKKOS_ARCH_ARMV8_THUNDERX 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..75ada8c01ffcef1e8aa732ecbddf278cd65885a6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Serial_KokkosCore_config.h
@@ -0,0 +1,18 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:19 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV80 1
+#define KOKKOS_ARCH_ARMV8_THUNDERX 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv80_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv80_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..796c0aab6537565a054fe41d3e6a0276914ea456
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv80_Cuda_KokkosCore_config.h
@@ -0,0 +1,18 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:12 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV80 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv80_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv80_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..dcf7ff7ea29c7ca1179ddc4098b592a97343842b
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv80_OpenMP_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:13 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV80 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv80_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv80_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..298966b6d4d4f101a0f9547c95bcecc5e6e16998
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv80_Pthread_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:14 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV80 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv80_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv80_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..7259a9e9641c999ca267fe68c2b1103db3c0f7b3
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv80_Qthreads_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:14 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV80 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv80_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv80_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2b4f146cbb526d91853208ec54039a850ec8702
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv80_ROCm_KokkosCore_config.h
@@ -0,0 +1,18 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:12 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV80 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv80_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv80_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe5fe6644513bdfc65b2f58de4ac32f4b136a033
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv80_Serial_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:14 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV80 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv81_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv81_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d02142438bd717fc2317d9b3f65102b408ff821
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv81_Cuda_KokkosCore_config.h
@@ -0,0 +1,18 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:15 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV81 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv81_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv81_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa194c77be747dab77c87a3a723eeaf6b3faec28
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv81_OpenMP_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:16 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV81 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv81_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv81_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d2dbeeef4869197888c1e0a3d3aa1274ce308bd
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv81_Pthread_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:16 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV81 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv81_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv81_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9fc71ad9b6e3bc87104ed9e01ff77962c6454d7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv81_Qthreads_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:17 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV81 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv81_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv81_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..28a56596b4bffaabc1ca827e53638dc79ecfe6f7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv81_ROCm_KokkosCore_config.h
@@ -0,0 +1,18 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:15 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV81 1
diff --git a/packages/kokkos/core/unit_test/config/results/ARMv81_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/ARMv81_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d29fd139050e792af20b7b575fa03cbe473886a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/ARMv81_Serial_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:16 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_ARMV81 1
diff --git a/packages/kokkos/core/unit_test/config/results/BDW_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/BDW_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce2582b23ff3971072f78595d4407116c2f2acd9
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/BDW_Cuda_KokkosCore_config.h
@@ -0,0 +1,24 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:37 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_ENABLE_TM
+#endif
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX2 1
diff --git a/packages/kokkos/core/unit_test/config/results/BDW_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/BDW_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..118d1b225f4c223e2efb353738599eecd2c20f34
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/BDW_OpenMP_KokkosCore_config.h
@@ -0,0 +1,23 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:38 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_ENABLE_TM
+#endif
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX2 1
diff --git a/packages/kokkos/core/unit_test/config/results/BDW_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/BDW_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d0215baf6de357cb7a338d34beb0c62fb2744d2
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/BDW_Pthread_KokkosCore_config.h
@@ -0,0 +1,23 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:38 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_ENABLE_TM
+#endif
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX2 1
diff --git a/packages/kokkos/core/unit_test/config/results/BDW_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/BDW_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..e879e7e1fe97c136d8a9b9b918ef12ed6fa8a7d9
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/BDW_Qthreads_KokkosCore_config.h
@@ -0,0 +1,23 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:39 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_ENABLE_TM
+#endif
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX2 1
diff --git a/packages/kokkos/core/unit_test/config/results/BDW_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/BDW_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f86d055afd7572dc585ed4945c48f8dbee4cbcb
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/BDW_ROCm_KokkosCore_config.h
@@ -0,0 +1,24 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:37 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_ENABLE_TM
+#endif
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX2 1
diff --git a/packages/kokkos/core/unit_test/config/results/BDW_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/BDW_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..fba671ab1a512594582bc78caa1bbc63060550f9
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/BDW_Serial_KokkosCore_config.h
@@ -0,0 +1,23 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:39 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_ENABLE_TM
+#endif
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX2 1
diff --git a/packages/kokkos/core/unit_test/config/results/BGQ_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/BGQ_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..93c74d41e2375848b3a18fbc8bb113735ead3a62
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/BGQ_Cuda_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:43 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/BGQ_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/BGQ_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..533da160285b7c68f2597e38aa4591e71e740062
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/BGQ_OpenMP_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:43 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/BGQ_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/BGQ_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..9524c94f2b6cf6529024681240b2cffed3ef6895
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/BGQ_Pthread_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:44 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/BGQ_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/BGQ_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..60c7ddcdb5ee17892128e8708df6ec3a01f7184c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/BGQ_Qthreads_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:44 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/BGQ_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/BGQ_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..f5bc1f54a9a7c0defc00327fdb97c552f75f661a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/BGQ_ROCm_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:44 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/BGQ_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/BGQ_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..8372c006994415777849a768e61fa9318796591c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/BGQ_Serial_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:44 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/HSW_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/HSW_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..7bbe9fa84c1241992cf17b439122af700d6eeb45
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/HSW_Cuda_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:34 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX2 1
diff --git a/packages/kokkos/core/unit_test/config/results/HSW_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/HSW_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..17f75872f84fddc42f0a19aa3f6c8b000b62c3a1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/HSW_OpenMP_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:35 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX2 1
diff --git a/packages/kokkos/core/unit_test/config/results/HSW_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/HSW_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..5df1be17ada1d2bcc5ef6d21820d752381752d57
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/HSW_Pthread_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:35 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX2 1
diff --git a/packages/kokkos/core/unit_test/config/results/HSW_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/HSW_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..253dc35bdfd023ea433d70f9851354f012b936a0
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/HSW_Qthreads_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:36 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX2 1
diff --git a/packages/kokkos/core/unit_test/config/results/HSW_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/HSW_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e04801b862efd26463845d0785518b287379480
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/HSW_ROCm_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:35 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX2 1
diff --git a/packages/kokkos/core/unit_test/config/results/HSW_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/HSW_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..99f76aff0b1fe147004233e2bf401acf605f7f49
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/HSW_Serial_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:36 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX2 1
diff --git a/packages/kokkos/core/unit_test/config/results/KNC_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/KNC_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..bdc270fd0d795dc2a8981371bd79ed22a041d7d5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/KNC_Cuda_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:42 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_KNC
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_KNC 1
diff --git a/packages/kokkos/core/unit_test/config/results/KNC_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/KNC_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9b79f552d7fd734d574827a35f5b72d970ceb77
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/KNC_OpenMP_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:43 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_KNC
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_KNC 1
diff --git a/packages/kokkos/core/unit_test/config/results/KNC_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/KNC_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..15d9d01a0aaf9123f9d201191c54797becba12b8
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/KNC_Pthread_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:44 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_KNC
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_KNC 1
diff --git a/packages/kokkos/core/unit_test/config/results/KNC_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/KNC_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f95a83c272fba8de39b91e292307946adf2d062
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/KNC_Qthreads_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:45 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_KNC
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_KNC 1
diff --git a/packages/kokkos/core/unit_test/config/results/KNC_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/KNC_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..5991d3065fa81a7e09c18c860232c0ad05e36075
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/KNC_ROCm_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:43 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_KNC
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_KNC 1
diff --git a/packages/kokkos/core/unit_test/config/results/KNC_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/KNC_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a8ddecf14eaa74363698d8f895886b1561d08d0
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/KNC_Serial_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:44 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_KNC
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_KNC 1
diff --git a/packages/kokkos/core/unit_test/config/results/KNL_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/KNL_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd7e2ca330e6e31b62568ae185ea1303c00d54cc
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/KNL_Cuda_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:45 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX512MIC 1
diff --git a/packages/kokkos/core/unit_test/config/results/KNL_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/KNL_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f567f241c5bad2955921d22b935b89693556a6d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/KNL_OpenMP_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:46 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX512MIC 1
diff --git a/packages/kokkos/core/unit_test/config/results/KNL_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/KNL_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..1cf3f0997a9b4d83f99dbba59a548a8aee8d05b2
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/KNL_Pthread_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:47 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX512MIC 1
diff --git a/packages/kokkos/core/unit_test/config/results/KNL_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/KNL_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d179d82f8eb3601b289caaad130079145c53d89
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/KNL_Qthreads_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:48 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX512MIC 1
diff --git a/packages/kokkos/core/unit_test/config/results/KNL_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/KNL_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae2938e34a61654813e94ba4f352a0b0eb3a5b0c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/KNL_ROCm_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:46 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX512MIC 1
diff --git a/packages/kokkos/core/unit_test/config/results/KNL_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/KNL_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..21f6e7e4343be324f0d6a640661814098af5b5d7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/KNL_Serial_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:47 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX512MIC 1
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler30_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler30_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..78e9335e2486f554dfa1e35d5fb289e2e9221ccf
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler30_Cuda_KokkosCore_config.h
@@ -0,0 +1,19 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:48 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_KEPLER 1
+#define KOKKOS_ARCH_KEPLER30 1
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler30_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler30_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..769d9c878994f7454e2ac4186b1b0ef9f69ac0ec
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler30_OpenMP_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:49 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler30_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler30_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..2cc728a5e3a03f5e98998894ebb02de4fe50551a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler30_Pthread_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:49 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler30_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler30_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..410ba5ea1599c4d4dda57f0853e58572fb1a35fe
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler30_Qthreads_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:50 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler30_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler30_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..34867aa91ee1fee85f65e4bd21c0975ad1a5105b
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler30_ROCm_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:48 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler30_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler30_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..54943b244f443d212c4810503caf0441dfce255e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler30_Serial_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:50 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler32_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler32_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..c7e23d503cf1fc6206549537300b1b059e5ec236
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler32_Cuda_KokkosCore_config.h
@@ -0,0 +1,19 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:50 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_KEPLER 1
+#define KOKKOS_ARCH_KEPLER32 1
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler32_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler32_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..fcfbf97ef2aa49f53344d41dde7bb8e02ff2ca4f
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler32_OpenMP_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:51 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler32_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler32_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..5cea100aa419ab02d76bb05c6e00ab22530823d8
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler32_Pthread_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:52 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler32_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler32_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..f42d0cc5f2fb11a336d37ee7588bd010fdbf4d0e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler32_Qthreads_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:53 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler32_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler32_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ae47b6976894a0a7969bb2098c8ebe185bbf5e6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler32_ROCm_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:51 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler32_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler32_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d20b1dc811f44d8c3c5e553a8f266aaa12c0231
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler32_Serial_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:52 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler35_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler35_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7935927c39b2abd4929c2a1364b74955acc9f21
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler35_Cuda_KokkosCore_config.h
@@ -0,0 +1,19 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:53 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_KEPLER 1
+#define KOKKOS_ARCH_KEPLER35 1
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler35_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler35_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..02777df40a32f6bf45084a158b6901891fa858ff
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler35_OpenMP_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:54 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler35_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler35_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..f51f00ce957a8e55d9dab99dfadb9dd4c2b623e8
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler35_Pthread_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:55 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler35_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler35_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..429f5e9e28df4de2a5c559bf8e4b20fe623653cf
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler35_Qthreads_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:55 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler35_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler35_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..111bb09340da44fb4085a546a5348892271607da
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler35_ROCm_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:54 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler35_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler35_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..da61dabb5809e6e1a4c8c257e4df65d26abc69fd
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler35_Serial_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:55 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler37_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler37_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..c70ce2e04cfaba2c759ee49eee6bc0c4fa015aea
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler37_Cuda_KokkosCore_config.h
@@ -0,0 +1,19 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:56 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_KEPLER 1
+#define KOKKOS_ARCH_KEPLER37 1
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler37_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler37_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8c6c74832520dd5cb51aa0652bebf78041db7c4
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler37_OpenMP_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:57 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler37_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler37_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..b832ef36e5afb5ab942a9e6340374a845a208283
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler37_Pthread_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:58 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler37_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler37_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b8a7f818350f4861cc71e9d65ce9235ef84dc00
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler37_Qthreads_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:59 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler37_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler37_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a661f8842166b7b45c84daf69288d385650cd2d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler37_ROCm_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:57 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler37_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler37_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..469f3d96a77e3bc5b0111fdb037e1a1ebc621a5f
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler37_Serial_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:58 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ccf1bef54342d6d094719157f7efd490bc13bd6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler_Cuda_KokkosCore_config.h
@@ -0,0 +1,19 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:50 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_KEPLER 1
+#define KOKKOS_ARCH_KEPLER35 1
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d87c958a216980f1de929074ff78f446fb76e12
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler_OpenMP_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:51 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..263870be9fb15c18460e8200abe7a704895733df
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler_Pthread_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:51 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..021d18c0025b208c04c402e47b93e3251f37382b
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler_Qthreads_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:51 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..2826fdfb885bdf97a71e27493ec5c49e72af1406
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler_ROCm_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:52 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Kepler_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Kepler_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..69097e034d2b8aa7d972290746944e3830176caf
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Kepler_Serial_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:52 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell50_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell50_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..fac64e9e9835066042607888756e1690065c5ea7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell50_Cuda_KokkosCore_config.h
@@ -0,0 +1,19 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:59 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_MAXWELL 1
+#define KOKKOS_ARCH_MAXWELL50 1
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell50_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell50_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f5b3eea1355651a5e7b7e5286f7adee68da667d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell50_OpenMP_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:00 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell50_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell50_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..b249c88be52b05de33c243d4915ef4389c90498c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell50_Pthread_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:01 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell50_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell50_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..be1353365c71f9681ff2b281d94a36016523f39c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell50_Qthreads_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:02 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell50_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell50_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce9f67d5be7dbab69e05133d7b6682d83554dc71
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell50_ROCm_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:00 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell50_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell50_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8c6be139e288e7dd7b1164ca4dc3893527ee93a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell50_Serial_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:02 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell52_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell52_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce28f3e4b783bc4496915b60ed7bba3f333f1873
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell52_Cuda_KokkosCore_config.h
@@ -0,0 +1,19 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:03 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_MAXWELL 1
+#define KOKKOS_ARCH_MAXWELL52 1
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell52_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell52_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..35635063a5b794d39fa5782aac7f7f1fc7beb791
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell52_OpenMP_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:04 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell52_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell52_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..140740f81f6e1c907c174bef2137d821d66d3bf1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell52_Pthread_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:04 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell52_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell52_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..58a043c6a3ed49a89b8d29a3a9ba857a6174cc81
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell52_Qthreads_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:05 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell52_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell52_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..06ff6935caa79a2a43d4a43c94fca851065c9615
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell52_ROCm_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:03 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell52_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell52_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..eac120d0617d73927b41ebbf27f270d1e34db770
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell52_Serial_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:05 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell53_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell53_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad8344a09941bb35934c05448a32409f0fd9467b
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell53_Cuda_KokkosCore_config.h
@@ -0,0 +1,19 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:06 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_MAXWELL 1
+#define KOKKOS_ARCH_MAXWELL53 1
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell53_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell53_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab1e801267c83d88230022a98bd683724a61655a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell53_OpenMP_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:06 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell53_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell53_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..0b1e3bf311461cae737b924cad781c8479c736a0
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell53_Pthread_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:07 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell53_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell53_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..96fdbef3dcbe8960e8d0db572321f88b0d3eccc5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell53_Qthreads_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:08 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell53_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell53_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..82414cf358de0059c0055fe643d5af2e6f6af6f5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell53_ROCm_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:06 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell53_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell53_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..b10b80b3bc6330ecab29d302af8e77a468f2d46c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell53_Serial_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:07 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..d81a715007229beb716c329a8c45ac8150d7f678
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell_Cuda_KokkosCore_config.h
@@ -0,0 +1,19 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:20:00 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_MAXWELL 1
+#define KOKKOS_ARCH_MAXWELL50 1
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..98e93c7b28d9110700b30a5a3a6fd204057301cb
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell_OpenMP_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:20:00 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..47a7ccb7a57b2b59437b7e1f1a968080937f4595
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell_Pthread_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:20:00 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7f1fd3803bfffd03caaca89dcd72a0213ccada2
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell_Qthreads_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:20:01 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..c438f4f7d597717a80a60069fb1648651df3968c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell_ROCm_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:20:01 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Maxwell_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Maxwell_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..d66c569084309e1f484b43733f4f79ec478ffccf
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Maxwell_Serial_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:20:01 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/None_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/None_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..6bf2755fd0bdde9c9aafe3efd72ddff5c64e535f
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/None_Cuda_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:22 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/None_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/None_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..4dd2eed180b4ba63a61719bd3a5534b623645985
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/None_OpenMP_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:23 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/None_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/None_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..1bdd29b6a553c1bf30048efaf8c358375e64b925
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/None_Pthread_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:23 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/None_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/None_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..6bd8addd97479b4ffe9aca8cb0bc5fa904d38164
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/None_Qthreads_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:23 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/None_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/None_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..74b0d7335c616f41337f5a08b2f5a35159d3bc91
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/None_ROCm_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:24 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/None_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/None_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..a9d0b264b8017afa8dc59acaa6101f8ae921dd86
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/None_Serial_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Tue Sep 26 15:19:23 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Pascal60_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Pascal60_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..8fe1aa698d9c18b6c7a614b82beb95814b320da2
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Pascal60_Cuda_KokkosCore_config.h
@@ -0,0 +1,19 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:08 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_PASCAL 1
+#define KOKKOS_ARCH_PASCAL60 1
diff --git a/packages/kokkos/core/unit_test/config/results/Pascal60_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Pascal60_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..93173f4e118e4ab831e6f0f6d3e61657898d6f51
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Pascal60_OpenMP_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:09 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Pascal60_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Pascal60_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..a05d5729e0f616b2436ec1441cb94623be12cea5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Pascal60_Pthread_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:09 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Pascal60_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Pascal60_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5a2d1d707ff8effffb4b99bfe8066ef7706ad9f
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Pascal60_Qthreads_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:10 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Pascal60_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Pascal60_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c04befef51b0ddb09dc812dfef10378bab8b553
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Pascal60_ROCm_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:09 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Pascal60_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Pascal60_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6038c2965aca6f7ee2d0f3890e3bb930309117d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Pascal60_Serial_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:10 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Pascal61_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Pascal61_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..0de37df9601e7bfc03d27cc60f725d8d6d983391
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Pascal61_Cuda_KokkosCore_config.h
@@ -0,0 +1,19 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:11 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_PASCAL 1
+#define KOKKOS_ARCH_PASCAL61 1
diff --git a/packages/kokkos/core/unit_test/config/results/Pascal61_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Pascal61_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c392cc0df12d3999f2fc53e0292120879f146d1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Pascal61_OpenMP_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:12 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Pascal61_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Pascal61_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..f704aa9c81c9af9b47afa698f66cbfea7753455c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Pascal61_Pthread_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:12 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Pascal61_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Pascal61_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..958aac11da7b579c2d7642a2eae5d74952c13423
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Pascal61_Qthreads_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:13 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Pascal61_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Pascal61_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a4d8cc68384421796db37e6db133c02cc7c01df
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Pascal61_ROCm_KokkosCore_config.h
@@ -0,0 +1,17 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:11 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Pascal61_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Pascal61_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..6fb2cf9e9d4e42e83b9e87c0ac684d9bf06da049
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Pascal61_Serial_KokkosCore_config.h
@@ -0,0 +1,16 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:23:12 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
diff --git a/packages/kokkos/core/unit_test/config/results/Power7_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power7_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..a78e1ffc8d6a2d0a1b25ab0ccc01de63dbd5cebb
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power7_Cuda_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:20 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCBE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER7 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power7_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power7_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd856b80a51e213807cc004f2652cd8e9e963b2a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power7_OpenMP_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:21 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCBE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER7 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power7_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power7_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b3ac2aff9702c8e289154898ca4484daba5164b
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power7_Pthread_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:21 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCBE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER7 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power7_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power7_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..dffa8a3f58214b5d7637970bca815e0d22419d3e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power7_Qthreads_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:22 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCBE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER7 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power7_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power7_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..e16cfb37bd8a93a1df7ca8ef9789641e9ec98309
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power7_ROCm_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:20 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCBE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER7 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power7_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power7_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..6831f3ce255df472b63c6fa6928efde5d8849827
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power7_Serial_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:22 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCBE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER7 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power8_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power8_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ab0b04c6ca9950eee482c662bc0f20c7288868e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power8_Cuda_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:23 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCLE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER8 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power8_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power8_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..54750405cabd91fc581479eee072111c5f5021bc
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power8_OpenMP_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:24 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCLE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER8 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power8_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power8_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d71338d231f064e91ba25282e2e42b98a92abd0
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power8_Pthread_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:24 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCLE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER8 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power8_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power8_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..9da90f4f7e823e7c267f790d6c9f588c8d038c5d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power8_Qthreads_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:25 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCLE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER8 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power8_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power8_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3fd70b0cf73a63162beb93991cfdcc6043fecf5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power8_ROCm_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:24 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCLE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER8 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power8_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power8_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c0ecc22d357a88146fc49dc3ac0d79010104899
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power8_Serial_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:25 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCLE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER8 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power9_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power9_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..47d518f407cc9f12e57fd34d5fdefc65ae1bfc99
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power9_Cuda_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:26 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCLE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER9 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power9_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power9_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..106bf33e448fa3acb2d29eda836c9588cbd5fb9b
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power9_OpenMP_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:27 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCLE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER9 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power9_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power9_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..108e5eba4767da21d03d1a6a959c0042116694bd
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power9_Pthread_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:27 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCLE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER9 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power9_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power9_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c5be2ed3cbc5328d0c639d58d2f503a0ca8da3a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power9_Qthreads_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:28 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCLE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER9 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power9_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power9_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b6a391d95dc8338e4d62d1ce78be9489f46625d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power9_ROCm_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:26 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCLE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER9 1
diff --git a/packages/kokkos/core/unit_test/config/results/Power9_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/Power9_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f7aefe62e2556d8b662bb03e6dd4453340dfa00
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/Power9_Serial_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:27 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_POWERPCLE
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_POWER9 1
diff --git a/packages/kokkos/core/unit_test/config/results/SKX_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/SKX_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f4380d992a59cef0cee67961b4507ee6f51737c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/SKX_Cuda_KokkosCore_config.h
@@ -0,0 +1,24 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:40 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_ENABLE_TM
+#endif
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX512XEON 1
diff --git a/packages/kokkos/core/unit_test/config/results/SKX_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/SKX_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a907a2ae1b4dd6d818d367e840276c8159df6e1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/SKX_OpenMP_KokkosCore_config.h
@@ -0,0 +1,23 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:40 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_ENABLE_TM
+#endif
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX512XEON 1
diff --git a/packages/kokkos/core/unit_test/config/results/SKX_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/SKX_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..50a95223c9b36b5925bb9e4d87c5a582eee0ea91
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/SKX_Pthread_KokkosCore_config.h
@@ -0,0 +1,23 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:41 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_ENABLE_TM
+#endif
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX512XEON 1
diff --git a/packages/kokkos/core/unit_test/config/results/SKX_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/SKX_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e4b1d61ef92c6580ce2d25e4b2a1855b5e06221
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/SKX_Qthreads_KokkosCore_config.h
@@ -0,0 +1,23 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:42 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_ENABLE_TM
+#endif
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX512XEON 1
diff --git a/packages/kokkos/core/unit_test/config/results/SKX_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/SKX_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..12293350a1d803f6c1caad6cef396cc8191be52c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/SKX_ROCm_KokkosCore_config.h
@@ -0,0 +1,24 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:40 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_ENABLE_TM
+#endif
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX512XEON 1
diff --git a/packages/kokkos/core/unit_test/config/results/SKX_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/SKX_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ea457aacfd21203ce56ddfedba6493d0bc09f8d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/SKX_Serial_KokkosCore_config.h
@@ -0,0 +1,23 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:41 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_ENABLE_TM
+#endif
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX512XEON 1
diff --git a/packages/kokkos/core/unit_test/config/results/SNB_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/SNB_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..34c9537834cdf3b54e74f11a8cd43010100f9e26
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/SNB_Cuda_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:31 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX 1
diff --git a/packages/kokkos/core/unit_test/config/results/SNB_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/SNB_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7ed4d720c9c3f0df624f3066f26135eb5ef3b92
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/SNB_OpenMP_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:32 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX 1
diff --git a/packages/kokkos/core/unit_test/config/results/SNB_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/SNB_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..126c29ba77307efb80e5922a60e6af683c494fba
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/SNB_Pthread_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:33 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX 1
diff --git a/packages/kokkos/core/unit_test/config/results/SNB_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/SNB_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f0216f9c4b859024720575e59c5881d3131eb4b
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/SNB_Qthreads_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:34 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX 1
diff --git a/packages/kokkos/core/unit_test/config/results/SNB_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/SNB_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c68008bea214a9157f6890ec0db0d11b8a5db95
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/SNB_ROCm_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:32 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX 1
diff --git a/packages/kokkos/core/unit_test/config/results/SNB_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/SNB_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..0278d0d0791cfdd73ec4e9fc79eb14f6f81a3ab8
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/SNB_Serial_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:33 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_AVX 1
diff --git a/packages/kokkos/core/unit_test/config/results/WSM_Cuda_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/WSM_Cuda_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..97389bb1bf3bd6f9fcff1c1ebf761e2acb0d6ff6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/WSM_Cuda_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:28 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_CUDA 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_SSE42 1
diff --git a/packages/kokkos/core/unit_test/config/results/WSM_OpenMP_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/WSM_OpenMP_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd5648f0c81c8a28bbb7e2d71a5b02c9ac3684a9
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/WSM_OpenMP_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:29 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_OPENMP 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_SSE42 1
diff --git a/packages/kokkos/core/unit_test/config/results/WSM_Pthread_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/WSM_Pthread_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8a7adbd8908d647fddef4ac0a9f6b137b5aad3e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/WSM_Pthread_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:30 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_PTHREAD 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_SSE42 1
diff --git a/packages/kokkos/core/unit_test/config/results/WSM_Qthreads_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/WSM_Qthreads_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4a78790e399737f15b9f2507bcf41d531281a21
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/WSM_Qthreads_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:31 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_QTHREADS 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_SSE42 1
diff --git a/packages/kokkos/core/unit_test/config/results/WSM_ROCm_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/WSM_ROCm_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..712b5686f0e9e509a6dcb841271a86925175df46
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/WSM_ROCm_KokkosCore_config.h
@@ -0,0 +1,21 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:29 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_ENABLE_ROCM 1
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_SSE42 1
diff --git a/packages/kokkos/core/unit_test/config/results/WSM_Serial_KokkosCore_config.h b/packages/kokkos/core/unit_test/config/results/WSM_Serial_KokkosCore_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bac7c26600ed7a31f89d8cce47019c289b5bf77
--- /dev/null
+++ b/packages/kokkos/core/unit_test/config/results/WSM_Serial_KokkosCore_config.h
@@ -0,0 +1,20 @@
+/* ---------------------------------------------
+Makefile constructed configuration:
+Fri Sep 22 17:22:30 MDT 2017
+----------------------------------------------*/
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
+/* Execution Spaces */
+#define KOKKOS_HAVE_SERIAL 1
+#ifndef __CUDA_ARCH__
+#define KOKKOS_USE_ISA_X86_64
+#endif
+/* General Settings */
+#define KOKKOS_HAVE_CXX11 1
+#define KOKKOS_ENABLE_PROFILING
+/* Optimization Settings */
+/* Cuda Settings */
+#define KOKKOS_ARCH_SSE42 1
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_Category.hpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_Category.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..98191be7be26c39a7bed2be9b419039320b99dd6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_Category.hpp
@@ -0,0 +1,65 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_CUDA_HOSTPINNED_HPP
+#define KOKKOS_TEST_CUDA_HOSTPINNED_HPP
+
+#include <gtest/gtest.h>
+
+namespace Test {
+
+class cuda_hostpinned : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {
+  }
+
+  static void TearDownTestCase() {
+  }
+};
+
+} // namespace Test
+
+#define TEST_CATEGORY cuda_hostpinned
+#define TEST_EXECSPACE Kokkos::CudaHostPinnedSpace
+
+#endif
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8a8a758e67cc03cb5d14f9327c73707685e45775
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestSharedAlloc.hpp>
+
+namespace Test {
+
+
+TEST_F( TEST_CATEGORY, impl_shared_alloc )
+{
+  test_shared_alloc< TEST_EXECSPACE, Kokkos::DefaultHostExecutionSpace >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2515794de718e4d9376d087ba84bfcf5203be6db
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestViewAPI.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..21ed986b7f0454570301b3f407c6e3d6f33e04f7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestViewMapping_a.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3024cf6e41eb1889ac02e0fc8b5006601fffb96f
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestViewMapping_b.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..56c1ee013da0473cf74c0ec11c9aad693689bac1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaHostPinned_Category.hpp>
+#include <TestViewMapping_subview.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_Category.hpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_Category.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..34b528d04ef550aa97bdf610647ead1f92cfb170
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_Category.hpp
@@ -0,0 +1,65 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_CUDA_UVM_HPP
+#define KOKKOS_TEST_CUDA_UVM_HPP
+
+#include <gtest/gtest.h>
+
+namespace Test {
+
+class cuda_uvm : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {
+  }
+
+  static void TearDownTestCase() {
+  }
+};
+
+} // namespace Test
+
+#define TEST_CATEGORY cuda_uvm
+#define TEST_EXECSPACE Kokkos::CudaUVMSpace
+
+#endif
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..57a0848527056222c8fd63e521f2c4b4b1646b9a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestSharedAlloc.hpp>
+
+namespace Test {
+
+
+TEST_F( TEST_CATEGORY, impl_shared_alloc )
+{
+  test_shared_alloc< TEST_EXECSPACE, Kokkos::DefaultHostExecutionSpace >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e3a632619e755314eb799dfe3cb37cb63f498c28
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewAPI.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5be4404c9e32294c2040ed7cc2ac7f40916badc0
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewMapping_a.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2664c472e6b5157969dc12ba45570da3aa3deba2
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewMapping_b.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c08e00ba4900324bab6915d6c706941d3d296ae7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewMapping_subview.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_AtomicOperations.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_AtomicOperations.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e20683aa827d94ed3e84923042135475d11b22ee
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_AtomicOperations.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestAtomicOperations.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_AtomicViews.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_AtomicViews.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..debbc10ea77b09d21cff090cd43ef088fbc5f394
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_AtomicViews.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestAtomicViews.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Atomics.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Atomics.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1875143f320438d77850ece2df73442ea58e1151
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Atomics.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestAtomic.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Category.hpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Category.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..45ea9dcf9481ed50a6644c8257c4f45960f58a88
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Category.hpp
@@ -0,0 +1,65 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_CUDA_HPP
+#define KOKKOS_TEST_CUDA_HPP
+
+#include <gtest/gtest.h>
+
+namespace Test {
+
+class cuda : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {
+  }
+
+  static void TearDownTestCase() {
+  }
+};
+
+} // namespace Test
+
+#define TEST_CATEGORY cuda
+#define TEST_EXECSPACE Kokkos::Cuda
+
+#endif
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Complex.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Complex.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7c92f111fcc51e713b701d835d20ec3112d97ee4
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Complex.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestComplex.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Crs.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Crs.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..98f5e3793a91e07a498ed4de6b8841681275ea3d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Crs.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestCrs.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Init.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Init.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..97e2867ddc05f8a4bb3197fca88c19b850b18d76
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Init.cpp
@@ -0,0 +1,50 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestInit.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestPolicyConstruction.hpp>
+
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..963a7194b3b89b17b18181d21597cfb7a38d968e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp.cpp
@@ -0,0 +1,86 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cuda/TestCuda_Category.hpp>
+
+namespace Test {
+
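+// Plain CUDA kernel (not a Kokkos parallel pattern): adds each element's index to its value.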
+__global__ void offset(int* p) {
+  int idx = blockIdx.x*blockDim.x + threadIdx.x;
+  if(idx<100) {
+    p[idx]+=idx;
+  }
+}
+
+// Test whether allocations survive Kokkos initialize/finalize if done via Raw Cuda.
+TEST_F( cuda, raw_cuda_interop )
+{
+  int* p;
+  cudaMalloc(&p,sizeof(int)*100);
+  Kokkos::InitArguments arguments{-1,-1,-1, false};
+  Kokkos::initialize(arguments);
+
+  Kokkos::View<int*,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+    v(p,100);
+  Kokkos::deep_copy(v,5);
+
+  Kokkos::finalize();
+
+  offset<<<100,64>>>(p);
+  CUDA_SAFE_CALL( cudaDeviceSynchronize());
+
+  int* h_p = new int[100];
+  cudaMemcpy( h_p , p , sizeof(int)*100 , cudaMemcpyDefault );
+  CUDA_SAFE_CALL( cudaDeviceSynchronize());
+  int64_t sum = 0;
+  int64_t sum_expect = 0;
+  for(int i=0; i<100; i++) {
+    sum += h_p[i];
+    sum_expect += 5+i;
+  }
+
+  ASSERT_EQ(sum,sum_expect);
+}
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_MDRange.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_MDRange.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..90a7b33f96318eb5d25dacede258c1b6a5886a6b
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_MDRange.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestMDRange.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Other.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Other.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f63409da29ccd1975dbed248a54724712d0afad9
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Other.cpp
@@ -0,0 +1,52 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+#include <TestAggregate.hpp>
+#include <TestMemoryPool.hpp>
+#include <TestCXX11.hpp>
+#include <TestTile.hpp>
+
+#include <TestViewCtorPropEmbeddedDim.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_RangePolicy.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_RangePolicy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b671929d848936fef5144f1602277c88a4a6d968
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_RangePolicy.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestRange.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Reductions.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Reductions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0720af0fda03c910561a2773a01edcbd39ab5641
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Reductions.cpp
@@ -0,0 +1,48 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestFunctorAnalysis.hpp>
+#include <TestReduce.hpp>
+#include <TestCXX11Deduction.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Scan.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Scan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..57f9857a8723a19330cab94ed37a5892528d1f00
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Scan.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestScan.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SharedAlloc.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SharedAlloc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fc0b02505deb51841fc0507b02ce21c9246d32cd
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SharedAlloc.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestSharedAlloc.hpp>
+
+namespace Test {
+
+
+TEST_F( TEST_CATEGORY, impl_shared_alloc )
+{
+  test_shared_alloc< Kokkos::CudaSpace, Kokkos::DefaultHostExecutionSpace >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f7bfdc67873e2bec8d15dae017d30ed5f7e4c9b8
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
@@ -0,0 +1,389 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cuda/TestCuda_Category.hpp>
+
+namespace Test {
+
+__global__
+void test_abort()
+{
+  Kokkos::abort( "test_abort" );
+}
+
+__global__
+void test_cuda_spaces_int_value( int * ptr )
+{
+  if ( *ptr == 42 ) { *ptr = 2 * 42; }
+}
+
+TEST_F( cuda, space_access )
+{
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::HostSpace >::assignable, "" );
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::CudaHostPinnedSpace >::assignable, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::CudaSpace >::assignable, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::CudaSpace >::accessible, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::CudaUVMSpace >::assignable, "" );
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::CudaUVMSpace >::accessible, "" );
+
+  //--------------------------------------
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::CudaSpace >::assignable, "" );
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::CudaUVMSpace >::assignable, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace >::assignable, "" );
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace >::accessible, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::HostSpace >::assignable, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::HostSpace >::accessible, "" );
+
+  //--------------------------------------
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::CudaUVMSpace >::assignable, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::CudaSpace >::assignable, "" );
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::CudaSpace >::accessible, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::HostSpace >::assignable, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::HostSpace >::accessible, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace >::assignable, "" );
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace >::accessible, "" );
+
+  //--------------------------------------
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::CudaHostPinnedSpace >::assignable, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::HostSpace >::assignable, "" );
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::HostSpace >::accessible, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::CudaSpace >::assignable, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::CudaSpace >::accessible, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::CudaUVMSpace >::assignable, "" );
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::CudaUVMSpace >::accessible, "" );
+
+  //--------------------------------------
+
+  static_assert(
+    ! Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda, Kokkos::HostSpace >::accessible, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda, Kokkos::CudaSpace >::accessible, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda, Kokkos::CudaUVMSpace >::accessible, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda, Kokkos::CudaHostPinnedSpace >::accessible, "" );
+
+  static_assert(
+    ! Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, Kokkos::CudaSpace >::accessible, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, Kokkos::CudaUVMSpace >::accessible, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, Kokkos::CudaHostPinnedSpace >::accessible, "" );
+
+  static_assert(
+    std::is_same< Kokkos::Impl::HostMirror< Kokkos::CudaSpace >::Space
+                , Kokkos::HostSpace >::value, "" );
+
+  static_assert(
+    std::is_same< Kokkos::Impl::HostMirror< Kokkos::CudaUVMSpace >::Space
+                , Kokkos::Device< Kokkos::HostSpace::execution_space
+                                , Kokkos::CudaUVMSpace > >::value, "" );
+
+  static_assert(
+    std::is_same< Kokkos::Impl::HostMirror< Kokkos::CudaHostPinnedSpace >::Space
+                , Kokkos::CudaHostPinnedSpace >::value, "" );
+
+  static_assert(
+    std::is_same< Kokkos::Device< Kokkos::HostSpace::execution_space
+                                , Kokkos::CudaUVMSpace >
+                , Kokkos::Device< Kokkos::HostSpace::execution_space
+                                , Kokkos::CudaUVMSpace > >::value, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility
+      < Kokkos::Impl::HostMirror< Kokkos::Cuda >::Space
+      , Kokkos::HostSpace
+      >::accessible, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility
+      < Kokkos::Impl::HostMirror< Kokkos::CudaSpace >::Space
+      , Kokkos::HostSpace
+      >::accessible, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility
+      < Kokkos::Impl::HostMirror< Kokkos::CudaUVMSpace >::Space
+      , Kokkos::HostSpace
+      >::accessible, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility
+      < Kokkos::Impl::HostMirror< Kokkos::CudaHostPinnedSpace >::Space
+      , Kokkos::HostSpace
+      >::accessible, "" );
+}
+
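+// A UVM allocation from kokkos_malloc must be writable from the host and visible to a raw device kernel.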
+TEST_F( cuda, uvm )
+{
+  if ( Kokkos::CudaUVMSpace::available() ) {
+    int * uvm_ptr = (int*) Kokkos::kokkos_malloc< Kokkos::CudaUVMSpace >( "uvm_ptr", sizeof( int ) );
+
+    *uvm_ptr = 42;
+
+    Kokkos::Cuda::fence();
+    test_cuda_spaces_int_value<<< 1, 1 >>>( uvm_ptr );
+    Kokkos::Cuda::fence();
+
+    EXPECT_EQ( *uvm_ptr, int( 2 * 42 ) );
+
+    Kokkos::kokkos_free< Kokkos::CudaUVMSpace >( uvm_ptr );
+  }
+}
+
+TEST_F( cuda, uvm_num_allocs )
+{
+  // The max number of UVM allocations allowed is 65536.
+  #define MAX_NUM_ALLOCS 65536
+
+  if ( Kokkos::CudaUVMSpace::available() ) {
+    struct TestMaxUVMAllocs {
+
+      using view_type         = Kokkos::View< double*, Kokkos::CudaUVMSpace >;
+      using view_of_view_type = Kokkos::View< view_type[ MAX_NUM_ALLOCS ]
+                                            , Kokkos::CudaUVMSpace >;
+
+      TestMaxUVMAllocs() : view_allocs_test( "view_allocs_test" )
+      {
+        for ( auto i = 0; i < MAX_NUM_ALLOCS; ++i ) {
+
+          // Kokkos will throw a runtime exception if an attempt is made to
+          // allocate more than the maximum number of uvm allocations.
+
+          // In this test, the max num of allocs occurs when i = MAX_NUM_ALLOCS - 1
+          // since the 'outer' view counts as one UVM allocation, leaving
+          // 65535 possible UVM allocations, that is 'i in [0, 65535)'.
+
+          // The test will catch the exception thrown in this case and continue.
+
+          if ( i == ( MAX_NUM_ALLOCS - 1 ) ) {
+            EXPECT_ANY_THROW( { view_allocs_test( i ) = view_type( "inner_view", 1 ); } );
+          }
+          else {
+            if ( i < MAX_NUM_ALLOCS - 1000 ) {
+              EXPECT_NO_THROW( { view_allocs_test( i ) = view_type( "inner_view", 1 ); } );
+            } else { // This might or might not throw depending on compilation options.
+              try {
+                view_allocs_test( i ) = view_type( "inner_view", 1 );
+              }
+              catch ( ... ) {}
+            }
+          }
+
+        } // End allocation for loop.
+
+        for ( auto i = 0; i < MAX_NUM_ALLOCS - 1; ++i ) {
+
+          view_allocs_test( i ) = view_type();
+
+        } // End deallocation for loop.
+
+        view_allocs_test = view_of_view_type(); // Deallocate the view of views.
+      }
+
+      // Member.
+      view_of_view_type view_allocs_test;
+    };
+
+    // Trigger the test via the TestMaxUVMAllocs constructor.
+    TestMaxUVMAllocs();
+  }
+
+  #undef MAX_NUM_ALLOCS
+}
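+
+// Illustrative sketch (not part of the original test suite): the test above
+// relies on each labeled inner View being its own UVM allocation and on
+// assignment of a default-constructed View releasing that allocation.
+// The helper name is hypothetical and only demonstrates the idiom.
+inline void example_view_of_views_allocations()
+{
+  if ( Kokkos::CudaUVMSpace::available() ) {
+    using inner_type = Kokkos::View< double*, Kokkos::CudaUVMSpace >;
+    using outer_type = Kokkos::View< inner_type[ 4 ], Kokkos::CudaUVMSpace >;
+
+    outer_type outer( "outer" );           // One UVM allocation for the outer view.
+    outer( 0 ) = inner_type( "inner", 1 ); // A second UVM allocation for the inner view.
+    outer( 0 ) = inner_type();             // Releases the inner allocation.
+    outer = outer_type();                  // Releases the outer allocation.
+  }
+}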
+
+template< class MemSpace, class ExecSpace >
+struct TestViewCudaAccessible {
+  enum { N = 1000 };
+
+  using V = Kokkos::View< double*, MemSpace >;
+
+  V m_base;
+
+  struct TagInit {};
+  struct TagTest {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagInit &, const int i ) const { m_base[i] = i + 1; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagTest &, const int i, long & error_count ) const
+  { if ( m_base[i] != i + 1 ) ++error_count; }
+
+  TestViewCudaAccessible()
+    : m_base( "base", N )
+    {}
+
+  static void run()
+  {
+    TestViewCudaAccessible self;
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename MemSpace::execution_space, TagInit >( 0, N ), self );
+    MemSpace::execution_space::fence();
+
+    // The next access is from a different execution space, so the prior kernel must complete first.
+    long error_count = -1;
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, TagTest >( 0, N ), self, error_count );
+    EXPECT_EQ( error_count, 0 );
+  }
+};
+
+TEST_F( cuda, impl_view_accessible )
+{
+  TestViewCudaAccessible< Kokkos::CudaSpace, Kokkos::Cuda >::run();
+
+  TestViewCudaAccessible< Kokkos::CudaUVMSpace, Kokkos::Cuda >::run();
+  TestViewCudaAccessible< Kokkos::CudaUVMSpace, Kokkos::HostSpace::execution_space >::run();
+
+  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda >::run();
+  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace, Kokkos::HostSpace::execution_space >::run();
+}
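+
+// Illustrative sketch (not part of the original test suite) of the pattern
+// verified above: fill a view in the execution space native to its memory
+// space, fence, then read it from another execution space that can access
+// that memory.  The functor and its name are hypothetical.
+struct ExampleHostReadOfPinned {
+  using view_type = Kokkos::View< int*, Kokkos::CudaHostPinnedSpace >;
+
+  view_type m_data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i ) const { m_data( i ) = i; }
+
+  static void run()
+  {
+    ExampleHostReadOfPinned self;
+    self.m_data = view_type( "pinned", 8 );
+
+    // Fill from the device; pinned host memory is device-accessible.
+    Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda >( 0, 8 ), self );
+    Kokkos::Cuda::fence(); // The kernel must complete before the host reads.
+
+    long sum = 0;
+    for ( int i = 0; i < 8; ++i ) sum += self.m_data( i );
+    // sum == 28 once the fence has completed.
+  }
+};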
+
+template< class MemSpace >
+struct TestViewCudaTexture {
+  enum { N = 1000 };
+
+  using V = Kokkos::View< double*, MemSpace >;
+  using T = Kokkos::View< const double*, MemSpace, Kokkos::MemoryRandomAccess >;
+
+  V m_base;
+  T m_tex;
+
+  struct TagInit {};
+  struct TagTest {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagInit &, const int i ) const { m_base[i] = i + 1; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagTest &, const int i, long & error_count ) const
+  { if ( m_tex[i] != i + 1 ) ++error_count; }
+
+  TestViewCudaTexture()
+    : m_base( "base", N )
+    , m_tex( m_base )
+    {}
+
+  static void run()
+  {
+    EXPECT_TRUE( ( std::is_same< typename V::reference_type, double & >::value ) );
+    EXPECT_TRUE( ( std::is_same< typename T::reference_type, const double >::value ) );
+
+    EXPECT_TRUE(  V::reference_type_is_lvalue_reference ); // An ordinary view.
+    EXPECT_FALSE( T::reference_type_is_lvalue_reference ); // Texture fetch returns by value.
+
+    TestViewCudaTexture self;
+    Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda, TagInit >( 0, N ), self );
+
+    long error_count = -1;
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda, TagTest >( 0, N ), self, error_count );
+    EXPECT_EQ( error_count, 0 );
+  }
+};
+
+TEST_F( cuda, impl_view_texture )
+{
+  TestViewCudaTexture< Kokkos::CudaSpace >::run();
+  TestViewCudaTexture< Kokkos::CudaUVMSpace >::run();
+}
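+
+// Illustrative sketch (not part of the original test suite): a const View
+// with the RandomAccess memory trait can alias an ordinary View of the same
+// data; on CUDA such reads may be serviced through the texture/read-only
+// path, which is why its reference_type is a value rather than an lvalue
+// reference.  The helper name is hypothetical.
+inline void example_random_access_alias()
+{
+  Kokkos::View< double*, Kokkos::CudaSpace > a( "a", 16 );
+
+  // Read-only alias of 'a'; the assignment shares the allocation.
+  Kokkos::View< const double*, Kokkos::CudaSpace, Kokkos::MemoryRandomAccess > t = a;
+
+  (void) t; // Inside a kernel, t( i ) would return a double by value.
+}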
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..805da988c6261f693da4fbb3277d41f98e9e3deb
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_a.cpp
@@ -0,0 +1,104 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_left )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_right )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_stride )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_assign_strided )
+{
+  TestViewSubview::test_1d_strided_assignment< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_0 )
+{
+  TestViewSubview::test_left_0< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_1 )
+{
+  TestViewSubview::test_left_1< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_2 )
+{
+  TestViewSubview::test_left_2< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_3 )
+{
+  TestViewSubview::test_left_3< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_0 )
+{
+  TestViewSubview::test_right_0< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_1 )
+{
+  TestViewSubview::test_right_1< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_3 )
+{
+  TestViewSubview::test_right_3< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a2018fc5c8f480bbe26266db432050b0b8fdb6a9
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_b.cpp
@@ -0,0 +1,63 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_layoutleft_to_layoutleft )
+{
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE >();
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_layoutright_to_layoutright )
+{
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE >();
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c01.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c01.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..946de0e111db0b2983b1f9cce599a6864266a57c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c01.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c02.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c02.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7a1c895583336dac104631ece45145aa3d3b59f6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c02.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign_atomic )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c03.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c03.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8c27d66a835ce94948bc3c3b9a5226bb93e2f476
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c03.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign_randomaccess )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c04.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c04.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..704bab5a0d9a90d6c51cf5dcb11059d34aaa0b32
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c04.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c05.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c05.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7510ba2afc607cfbaca569a028afcac9b7c79534
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c05.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d_atomic )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c06.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c06.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0b580f7dcd57c40666cc526f53345fe8f828c125
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c06.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d_randomaccess )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c07.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c07.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5d8484c352ef803a57cfde8236a7edd348179560
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c07.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c08.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c08.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e896470e85c8b044980b0b02f0caf03a01870523
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c08.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c09.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c09.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..61c8e0ff73d218620af1febe7700cbd8ccb3e0da
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c09.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c10.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c10.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9ce886295fd18e0287016c69c643b9997788242a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c10.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c11.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b49f3694697346effdb47af40230ff50ca50f2e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c11.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c12.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c12.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8132cb3469196cfd46f1d89dfb22cf7a5b00e0ee
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c12.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c13.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c13.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f397eb6b7b344adad1d4e954ed72cdbbb723fb13
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c13.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCudaUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset )
+{
+  TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1210307c76d60a927fd361ca85dd096ef60893c2
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp
@@ -0,0 +1,13 @@
+#include <cuda/TestCuda_SubView_c01.cpp>
+#include <cuda/TestCuda_SubView_c02.cpp>
+#include <cuda/TestCuda_SubView_c03.cpp>
+#include <cuda/TestCuda_SubView_c04.cpp>
+#include <cuda/TestCuda_SubView_c05.cpp>
+#include <cuda/TestCuda_SubView_c06.cpp>
+#include <cuda/TestCuda_SubView_c07.cpp>
+#include <cuda/TestCuda_SubView_c08.cpp>
+#include <cuda/TestCuda_SubView_c09.cpp>
+#include <cuda/TestCuda_SubView_c10.cpp>
+#include <cuda/TestCuda_SubView_c11.cpp>
+#include <cuda/TestCuda_SubView_c12.cpp>
+#include <cuda/TestCuda_SubView_c13.cpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Task.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Task.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3d93266981977e864ff050172cb90f624aafde34
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Task.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestTaskScheduler.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Team.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Team.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c633585096b8711e8d93e2a83fe92a1a3edc3e4b
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Team.cpp
@@ -0,0 +1,75 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, team_for )
+{
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 2 );
+
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+}
+
+
+TEST_F( TEST_CATEGORY, team_reduce )
+{
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
+}
+} // namespace Test
+
+#include <TestTeamVector.hpp>
+
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_TeamReductionScan.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_TeamReductionScan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f84c964f8f71598fd8b353f210e8c23afede53c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_TeamReductionScan.cpp
@@ -0,0 +1,82 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+#if !defined(KOKKOS_CUDA_CLANG_WORKAROUND)
+TEST_F( TEST_CATEGORY, team_scan )
+{
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+}
+#endif
+
+TEST_F( TEST_CATEGORY, team_long_reduce )
+{
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+TEST_F( TEST_CATEGORY, team_double_reduce )
+{
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+} // namespace Test
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_TeamScratch.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_TeamScratch.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..879633b0c85834fe06268e01639e01e15d8c77f4
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_TeamScratch.cpp
@@ -0,0 +1,83 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, team_shared_request )
+{
+  TestSharedTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+TEST_F( TEST_CATEGORY, team_scratch_request )
+{
+  TestScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+TEST_F( TEST_CATEGORY, team_lambda_shared_request )
+{
+  TestLambdaSharedTeam< Kokkos::HostSpace, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+#endif
+#endif
+
+TEST_F( TEST_CATEGORY, shmem_size )
+{
+  TestShmemSize< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, multi_level_scratch )
+{
+  TestMultiLevelScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+} // namespace Test
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_UniqueToken.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_UniqueToken.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a04ee57a96bef70fc275f1558c8b1e576e1d4df1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_UniqueToken.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestUniqueToken.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b77ab399d5c2293b15597946497909fe9815cc2
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_b.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestViewAPI.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_ViewMapping_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e8e3d18fc0a31632c7b2e0f539d381aea10bcf90
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_ViewMapping_a.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestViewMapping_a.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_ViewMapping_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0a3b6c0bebef58ffd7be6d26ea2463ef3407e8af
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_ViewMapping_b.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestViewMapping_b.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_ViewMapping_subview.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..caddacb07b2de84bbf95de874dbeb2fb92fe9166
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_ViewMapping_subview.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestViewMapping_subview.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_ViewOfClass.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_ViewOfClass.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c457098ed52f2737c04f69e3e1ac5162661e0e1e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_ViewOfClass.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestViewOfClass.hpp>
+
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_WorkGraph.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_WorkGraph.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c51f95810a6aff86aca67ef7fb4cdcffc68d4e48
--- /dev/null
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_WorkGraph.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cuda/TestCuda_Category.hpp>
+#include <TestWorkGraph.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..515cd5412923fb3eb0269caa6c5f06cca1171e2f
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp
@@ -0,0 +1,72 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <default/TestDefaultDeviceType_Category.hpp>
+
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
+
+namespace Test {
+
+
+TEST_F( TEST_CATEGORY, host_space_access )
+{
+  typedef Kokkos::HostSpace::execution_space host_exec_space;
+  typedef Kokkos::Device< host_exec_space, Kokkos::HostSpace > device_space;
+  typedef Kokkos::Impl::HostMirror< Kokkos::DefaultExecutionSpace >::Space mirror_space;
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility< host_exec_space, Kokkos::HostSpace >::accessible, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility< device_space, Kokkos::HostSpace >::accessible, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility< mirror_space, Kokkos::HostSpace >::accessible, "" );
+}
+
+} // namespace Test
+
+#endif
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..40a773b3b8fd18fb0a4cce396b4cc19400b9ad41
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f12c4f62b25acbb44e1f7d58876884035c250d9f
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c7ffd7b94e5675b28d519e5dc785ccfb55549b31
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..24e2b152014a8308e1ef3eccaa44ad76d884f9d2
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7968c13b661cad0b54697e86626d166fe0949602
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab0563c6dc03d45fc696ea538cb75d6288f1e576
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..70a8ca1727515910f5bae07703421e9e95e6ab42
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..727c7a95eb9f949f6ecb0e910dc8ff009d6b8225
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..88fba34c50e93c2ddb8e730d50e08d853b44dba5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b3562cc53d6b4cf2a4162b916d84f94e1ab482a6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0d4983319cb565f2ba4283b910bd16cabc48253a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..026fb01f8870af1a2d24f59da17a5d419721ba71
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..937a13160e40f6ec4666a7f4cb7eb7dc62d8a8fc
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..992c854c1a10224a09d897a917e309f654cd4763
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..07a8b1cb7c2ea3fc515f2f403ad6401353d7f7a1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4d8c05be2d7f486487d7f39357982361117b4b76
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fd8224ce34669c18e9cc9cc7e39e620a0a09c303
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp
@@ -0,0 +1,57 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+#include "TestResize.hpp"
+
+namespace Test {
+
+TEST( kokkosresize, host_space_access )
+{
+  // Test with the default device type.
+  using TestViewResize::testResize;
+  typedef Kokkos::View<int*>::device_type device_type;
+  testResize<device_type> ();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_Category.hpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_Category.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e981dcb7991eba4a7901843ccb239dce5631242d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_Category.hpp
@@ -0,0 +1,67 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_DEFAULTDEVICETYPE_HPP
+#define KOKKOS_TEST_DEFAULTDEVICETYPE_HPP
+
+#include <gtest/gtest.h>
+
+namespace Test {
+
+class defaultdevicetype : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+  }
+
+  static void TearDownTestCase()
+  {
+  }
+};
+
+} // namespace Test
+
+#define TEST_CATEGORY defaultdevicetype
+#define TEST_EXECSPACE Kokkos::DefaultExecutionSpace
+
+#endif
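The category header above contributes just two things to the shared, backend-agnostic test sources: the gtest fixture and the TEST_CATEGORY / TEST_EXECSPACE macros. As a rough sketch only (not part of this patch; the test name and reduction are illustrative, and it assumes Kokkos has been initialized by the test main), a shared translation unit would consume them like this:

#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#include <default/TestDefaultDeviceType_Category.hpp>

namespace Test {

// TEST_CATEGORY names the fixture and TEST_EXECSPACE the execution space
// under test, so the same source compiles once per enabled backend.
TEST_F( TEST_CATEGORY, example_sum_reduce )
{
  long sum = 0;
  Kokkos::parallel_reduce(
    Kokkos::RangePolicy< TEST_EXECSPACE >( 0, 100 ),
    KOKKOS_LAMBDA( const int i, long & update ) { update += i; },
    sum );
  ASSERT_EQ( 4950, sum );  // 0 + 1 + ... + 99
}

} // namespace Test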
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c8d21f38026a4d0c9c75ee8a274ffd3524f19f02
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_a.cpp
@@ -0,0 +1,63 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
+
+#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestReduceCombinatorical.hpp>
+
+namespace Test {
+
+
+TEST_F( defaultdevicetype, reduce_instantiation_a )
+{
+  TestReduceCombinatoricalInstantiation<>::execute_a();
+}
+
+} // namespace Test
+
+#endif
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..131e79271aaa6f6a56ae8ab54eb9a7e5d0ccbfee
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_b.cpp
@@ -0,0 +1,62 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
+
+#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestReduceCombinatorical.hpp>
+
+namespace Test {
+
+TEST_F( defaultdevicetype, reduce_instantiation_b )
+{
+  TestReduceCombinatoricalInstantiation<>::execute_b();
+}
+
+} // namespace Test
+
+#endif
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2af50101f42f1093fef62a332cea3fe80deb62b7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_c.cpp
@@ -0,0 +1,64 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
+#if !defined( KOKKOS_ENABLE_ROCM ) 
+
+#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestReduceCombinatorical.hpp>
+
+namespace Test {
+
+TEST_F( defaultdevicetype, reduce_instantiation_c )
+{
+  TestReduceCombinatoricalInstantiation<>::execute_c();
+}
+
+} // namespace Test
+
+#endif
+#endif
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..21a4df25ecdba28895a646b5c1772e3914e05cb2
--- /dev/null
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp
@@ -0,0 +1,73 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
+
+#include <default/TestDefaultDeviceType_Category.hpp>
+#include <TestUtilities.hpp>
+
+namespace Test {
+
+TEST_F( defaultdevicetype, test_utilities )
+{
+  test_utilities();
+}
+
+TEST_F( defaultdevicetype, malloc )
+{
+  int* data = (int*) Kokkos::kokkos_malloc( 100 * sizeof( int ) );
+  ASSERT_NO_THROW( data = (int*) Kokkos::kokkos_realloc( data, 120 * sizeof( int ) ) );
+  Kokkos::kokkos_free( data );
+
+  int* data2 = (int*) Kokkos::kokkos_malloc( 0 );
+  ASSERT_TRUE( data2 == NULL );
+  Kokkos::kokkos_free( data2 );
+}
+
+} // namespace Test
+
+#endif
diff --git a/packages/kokkos/core/unit_test/diffconfig.sh b/packages/kokkos/core/unit_test/diffconfig.sh
new file mode 100755
index 0000000000000000000000000000000000000000..0c8836ff83ca93d5293a986fb68f3a05b2291f51
--- /dev/null
+++ b/packages/kokkos/core/unit_test/diffconfig.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# CMake and Make tests run in separate directories.
+# The mapping of ARCH to #define is complicated enough that the generated
+# config is diffed against a stored result instead of grepped.
+if test "`basename $PWD`" = "cmaketest"; then 
+  outfile=$1
+  resfile=../results/$1
+else
+  outfile=config/tmpstore/$1
+  resfile=config/results/$1
+fi
+
+diff=`diff $outfile $resfile 2>&1 | grep -e define -e "such file"`
+if test -z "$diff"; then 
+  echo Passed
+else
+  echo Failed: $diff
+fi
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP.hpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9fdf627001b3272ac172dbd85c49e88f0c44a30d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP.hpp
@@ -0,0 +1,114 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_OPENMP_HPP
+#define KOKKOS_TEST_OPENMP_HPP
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_LAMBDA
+#undef KOKKOS_LAMBDA
+#endif
+#define KOKKOS_LAMBDA [=]
+
+#include <Kokkos_Core.hpp>
+
+#include <TestTile.hpp>
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+#include <TestViewAPI.hpp>
+#include <TestViewOfClass.hpp>
+#include <TestViewSubview.hpp>
+#include <TestAtomic.hpp>
+#include <TestAtomicOperations.hpp>
+#include <TestAtomicViews.hpp>
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestAggregate.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestTaskScheduler.hpp>
+#include <TestMemoryPool.hpp>
+#include <TestCXX11.hpp>
+#include <TestCXX11Deduction.hpp>
+#include <TestTeamVector.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+#include <TestPolicyConstruction.hpp>
+#include <TestMDRange.hpp>
+#include <TestConcurrentBitset.hpp>
+
+namespace Test {
+
+class openmp : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    int threads_count = 0;
+    #pragma omp parallel
+    {
+      #pragma omp atomic
+      ++threads_count;
+    }
+
+    if (threads_count > 3) {
+      threads_count /= 2;
+    }
+
+    Kokkos::OpenMP::initialize( threads_count );
+    Kokkos::print_configuration( std::cout, true );
+
+    srand( 10231 );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::OpenMP::finalize();
+  }
+};
+
+} // namespace Test
+
+#endif
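The fixture above sizes the OpenMP backend from a hand-counted parallel region and calls Kokkos::OpenMP::initialize directly. For comparison only, a minimal sketch (assuming the Kokkos 2.x InitArguments API; the thread count of 4 is purely illustrative) of the portable initialization an application would more commonly use:

#include <iostream>
#include <Kokkos_Core.hpp>

int main()
{
  // args.num_threads plays the role of the threads_count computed in
  // SetUpTestCase() above; 4 is an arbitrary illustrative value.
  Kokkos::InitArguments args;
  args.num_threads = 4;
  Kokkos::initialize( args );
  Kokkos::print_configuration( std::cout, true );
  Kokkos::finalize();
  return 0;
}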
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_AtomicOperations.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_AtomicOperations.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0d0f99ba032b85591b4db50ccca0f7290545be28
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_AtomicOperations.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestAtomicOperations.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_AtomicViews.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_AtomicViews.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6e7355ff59560771300d0ee12d63e656c22f1eb9
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_AtomicViews.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestAtomicViews.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Atomics.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Atomics.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ea062b4c0a73df9727a894204f21db4f123feddc
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Atomics.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestAtomic.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Category.hpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Category.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..76391a6d963a3c83afe43e36c34602a86f942aa3
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Category.hpp
@@ -0,0 +1,65 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_OPENMP_HPP
+#define KOKKOS_TEST_OPENMP_HPP
+
+#include <gtest/gtest.h>
+
+namespace Test {
+
+class openmp : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {
+  }
+
+  static void TearDownTestCase() {
+  }
+};
+
+} // namespace Test
+
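+// These macros let the backend-agnostic test sources included by the
+// TestOpenMP_*.cpp files instantiate their TEST_F cases against the OpenMP
+// fixture and execution space.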
+#define TEST_CATEGORY openmp
+#define TEST_EXECSPACE Kokkos::OpenMP
+
+#endif
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Complex.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Complex.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9040ea56369171fd2f4ea95faa4113b6053c7c1d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Complex.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestComplex.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Crs.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Crs.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1f97bb95d1c0ec1c0240a7eb668267c4b3588b33
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Crs.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestCrs.hpp>
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Init.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Init.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..48c033a735d8e4ce2711751c871e121591902301
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Init.cpp
@@ -0,0 +1,50 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestInit.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestPolicyConstruction.hpp>
+
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_InterOp.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_InterOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..148366bc0d2da9602dbee8ccec123e3d9cb27de4
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_InterOp.cpp
@@ -0,0 +1,90 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <openmp/TestOpenMP_Category.hpp>
+#include <omp.h>
+
+namespace Test {
+
+// Test that the raw OpenMP runtime keeps working across Kokkos initialize/finalize.
+TEST_F( openmp, raw_openmp_interop )
+{
+  int count = 0;
+  int num_threads, concurrency;
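+  // First, count threads in a raw OpenMP parallel region before Kokkos is initialized.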
+  #pragma omp parallel
+  {
+    #pragma omp atomic
+    count++;
+    if(omp_get_thread_num()==0)
+      num_threads = omp_get_num_threads();
+  }
+
+  ASSERT_EQ(count,num_threads);
+
+  Kokkos::InitArguments arguments{-1,-1,-1, false};
+  Kokkos::initialize(arguments);
+
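+  // With Kokkos initialized on top of the existing OpenMP runtime, a raw parallel
+  // region should still see a thread count equal to Kokkos::OpenMP::concurrency().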
+  count = 0;
+  #pragma omp parallel
+  {
+    #pragma omp atomic
+    count++;
+  }
+
+  concurrency = Kokkos::OpenMP::concurrency();
+  ASSERT_EQ(count,concurrency);
+
+  Kokkos::finalize();
+
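+  // After Kokkos::finalize() the host OpenMP runtime must remain usable with the
+  // same concurrency as before.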
+  count = 0;
+  #pragma omp parallel
+  {
+    #pragma omp atomic
+    count++;
+  }
+
+  ASSERT_EQ(count,concurrency);
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_MDRange.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_MDRange.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..979a6a4476b31d1ad7865d0622ad9d8c8ca9d84d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_MDRange.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestMDRange.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..989f971c09ee8b5f743b32910cba7ed71e090238
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp
@@ -0,0 +1,140 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+#include <TestAggregate.hpp>
+#include <TestMemoryPool.hpp>
+#include <TestCXX11.hpp>
+#include <TestTile.hpp>
+
+#include <TestViewCtorPropEmbeddedDim.hpp>
+
+#include <mutex>
+
+namespace Test {
+
+TEST_F( openmp, partition_master )
+{
+  using Mutex = Kokkos::Experimental::MasterLock<Kokkos::OpenMP>;
+
+  Mutex mtx;
+  int errors = 0;
+
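+  // The master lambda is run once per partition: it must execute outside a
+  // parallel region on pool rank 0, nested parallel constructs must see a
+  // consistent thread pool size, and UniqueToken must hand out ids that cover
+  // exactly the work performed.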
+  auto master = [&errors, &mtx](int partition_id, int num_partitions) {
+
+    const int pool_size = Kokkos::OpenMP::thread_pool_size();
+
+    {
+      std::unique_lock<Mutex> lock(mtx);
+      if ( Kokkos::OpenMP::in_parallel() ) {
+        ++errors;
+      }
+      if ( Kokkos::OpenMP::thread_pool_rank() != 0 ) {
+        ++errors;
+      }
+    }
+
+    {
+      int local_errors = 0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<Kokkos::OpenMP>(0,1000)
+                           , [pool_size]( const int , int & errs ) {
+          if ( Kokkos::OpenMP::thread_pool_size() != pool_size ) {
+            ++errs;
+          }
+        }
+        , local_errors
+      );
+      Kokkos::atomic_add( &errors, local_errors );
+    }
+
+    Kokkos::Experimental::UniqueToken< Kokkos::OpenMP > token;
+
+    Kokkos::View<int*, Kokkos::OpenMP> count( "",  token.size() );
+
+    Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::OpenMP>(0,1000),
+        [=] ( const int ) {
+      int i = token.acquire();
+      ++count[i];
+      token.release(i);
+    });
+
+    Kokkos::View<int,Kokkos::OpenMP> sum ("");
+    Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::OpenMP>(0,token.size()),
+        [=] ( const int i ) {
+      Kokkos::atomic_add( sum.data(), count[i] );
+    });
+
+    if (sum() != 1000) {
+      Kokkos::atomic_add( &errors, 1 );
+    }
+  };
+
+  master(0,1);
+
+  ASSERT_EQ( errors, 0 );
+
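+  // Exercise partition_master with a range of requested partition counts and
+  // partition sizes.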
+  Kokkos::OpenMP::partition_master( master );
+  ASSERT_EQ( errors, 0 );
+
+  Kokkos::OpenMP::partition_master( master, 4, 0 );
+  ASSERT_EQ( errors, 0 );
+
+  Kokkos::OpenMP::partition_master( master, 0, 4 );
+  ASSERT_EQ( errors, 0 );
+
+  Kokkos::OpenMP::partition_master( master, 2, 2 );
+  ASSERT_EQ( errors, 0 );
+
+  Kokkos::OpenMP::partition_master( master, 8, 0 );
+  ASSERT_EQ( errors, 0 );
+
+  Kokkos::OpenMP::partition_master( master, 0, 8 );
+  ASSERT_EQ( errors, 0 );
+
+  Kokkos::OpenMP::partition_master( master, 8, 8 );
+  ASSERT_EQ( errors, 0 );
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_RangePolicy.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_RangePolicy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4fd097c84d7e27508402aab1538323d89d3cd4c8
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_RangePolicy.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestRange.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Reductions.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Reductions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8ff39c01db0da82acdc089177aa23769ce1c6171
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Reductions.cpp
@@ -0,0 +1,47 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestReduce.hpp>
+#include <TestCXX11Deduction.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Scan.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Scan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..04a38163a49da8e47da374d8302380525916d94e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Scan.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestScan.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SharedAlloc.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SharedAlloc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..82e9cf6e33360a497888624c657f3d835d4b25dc
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SharedAlloc.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestSharedAlloc.hpp>
+
+namespace Test {
+
+
+TEST_F( TEST_CATEGORY, impl_shared_alloc )
+{
+  test_shared_alloc< Kokkos::HostSpace, TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_a.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..524836d20b70f39192b84977ae5c8687a2d1aeb7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_a.cpp
@@ -0,0 +1,104 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_left )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_right )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_stride )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_assign_strided )
+{
+  TestViewSubview::test_1d_strided_assignment< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_0 )
+{
+  TestViewSubview::test_left_0< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_1 )
+{
+  TestViewSubview::test_left_1< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_2 )
+{
+  TestViewSubview::test_left_2< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_3 )
+{
+  TestViewSubview::test_left_3< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_0 )
+{
+  TestViewSubview::test_right_0< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_1 )
+{
+  TestViewSubview::test_right_1< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_3 )
+{
+  TestViewSubview::test_right_3< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_b.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3bf63b641ae440cf75c114e908099ba087c990a3
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_b.cpp
@@ -0,0 +1,63 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_layoutleft_to_layoutleft )
+{
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE >();
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_layoutright_to_layoutright )
+{
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE >();
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c01.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c01.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..676ef18a38c31ec643df42992395ea9253896c03
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c01.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c02.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c02.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f1af02c73ad01e240e9a0c7d64fda8282b3e3967
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c02.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign_atomic )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c03.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c03.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b5dc8b4f84e644d7f722826d91a1a42f6d7ae261
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c03.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign_randomaccess )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c04.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c04.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7c008a6bb223d9ed6ae52919c7aa12f72ddaefca
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c04.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c05.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c05.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f43cbc8770ec518dfed29f43acd2a84405253a1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c05.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d_atomic )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c06.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c06.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fbb55ee7c75f1a2322d9f24f4ce97fbb128c4423
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c06.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d_randomaccess )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c07.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c07.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..503997cfcae2d1c111915c571b219de8657c95e9
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c07.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c08.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c08.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..34133feb1b4ddb2870ca253a841b081bdfd3af6c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c08.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c09.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c09.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d70cb0bd2577cc3ae0f07030e4d707df2e9b9c0a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c09.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c10.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c10.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..72bbb2c36bde850ad6169c6dd640477bc48fe76e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c10.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c11.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f715f70e771de71f618262f90c0b5d0c1c0fa3c1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c11.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c12.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c12.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c93581112a1339f5422508d4f3761e7626a2c1a8
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c12.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c13.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c13.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..594c62d1453beba6fe5098d6fad48f0150e26d16
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c13.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset )
+{
+  TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5bf3626de967cbc15b18fd3c964f0d702446d334
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp
@@ -0,0 +1,13 @@
+#include <openmp/TestOpenMP_SubView_c01.cpp>
+#include <openmp/TestOpenMP_SubView_c02.cpp>
+#include <openmp/TestOpenMP_SubView_c03.cpp>
+#include <openmp/TestOpenMP_SubView_c04.cpp>
+#include <openmp/TestOpenMP_SubView_c05.cpp>
+#include <openmp/TestOpenMP_SubView_c06.cpp>
+#include <openmp/TestOpenMP_SubView_c07.cpp>
+#include <openmp/TestOpenMP_SubView_c08.cpp>
+#include <openmp/TestOpenMP_SubView_c09.cpp>
+#include <openmp/TestOpenMP_SubView_c10.cpp>
+#include <openmp/TestOpenMP_SubView_c11.cpp>
+#include <openmp/TestOpenMP_SubView_c12.cpp>
+#include <openmp/TestOpenMP_SubView_c13.cpp>
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Task.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Task.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..95a6cbdf9b68fc29f8e53c1189167baeb848e039
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Task.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestTaskScheduler.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_Team.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Team.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..790ea9e6da2974ed2d1081d313b4e00cd48b949e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_Team.cpp
@@ -0,0 +1,75 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, team_for )
+{
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 2 );
+
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+}
+
+
+TEST_F( TEST_CATEGORY, team_reduce )
+{
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
+}
+} // namespace Test
+
+#include <TestTeamVector.hpp>
+
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_TeamReductionScan.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_TeamReductionScan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..482005c8838a59a4e8f14ab1f36b7daed8f0331d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_TeamReductionScan.cpp
@@ -0,0 +1,81 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+
+TEST_F( TEST_CATEGORY, team_scan )
+{
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+}
+
+TEST_F( TEST_CATEGORY, team_long_reduce )
+{
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+TEST_F( TEST_CATEGORY, team_double_reduce )
+{
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+} // namespace Test
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_TeamScratch.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_TeamScratch.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f57da139a62614a5aa1dace9244f697c10514c6e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_TeamScratch.cpp
@@ -0,0 +1,83 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, team_shared_request )
+{
+  TestSharedTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+TEST_F( TEST_CATEGORY, team_scratch_request )
+{
+  TestScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+TEST_F( TEST_CATEGORY, team_lambda_shared_request )
+{
+  TestLambdaSharedTeam< Kokkos::HostSpace, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+#endif
+#endif
+
+TEST_F( TEST_CATEGORY, shmem_size )
+{
+  TestShmemSize< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, multi_level_scratch )
+{
+  TestMultiLevelScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+} // namespace Test
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_UniqueToken.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_UniqueToken.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..473ec73a2f3cc860b386d13dd9f8e969c6aca9fc
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_UniqueToken.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestUniqueToken.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5e7a05590ea32e38a09b671146c1592d88b936b4
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_b.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewAPI.hpp>
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_ViewMapping_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..781dabeeed5420c8eb3ddae65e90634a0bd3e87a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_ViewMapping_a.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewMapping_a.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_ViewMapping_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..910d043fb38d0b28ba2c3582d4dd366ac059bd95
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_ViewMapping_b.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewMapping_b.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_ViewMapping_subview.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..900e760979e3df2815ee9c40ffcacdd089613e0d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_ViewMapping_subview.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewMapping_subview.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_ViewOfClass.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_ViewOfClass.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..87a49902f438fe677f78190b16cb6216bd647346
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_ViewOfClass.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestViewOfClass.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_WorkGraph.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_WorkGraph.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..916257e2036dd9f400e85b4196c7243fe630ba42
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmp/TestOpenMP_WorkGraph.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmp/TestOpenMP_Category.hpp>
+#include <TestWorkGraph.hpp>
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget.hpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b08320c07f8ba6e0abfe805d1fac76947cf1cf6e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget.hpp
@@ -0,0 +1,111 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_OPENMPTARGET_HPP
+#define KOKKOS_TEST_OPENMPTARGET_HPP
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_LAMBDA
+#undef KOKKOS_LAMBDA
+#endif
+#define KOKKOS_LAMBDA [=]
+
+#include <Kokkos_Core.hpp>
+
+#include <TestTile.hpp>
+//#include <TestSharedAlloc.hpp>
+//#include <TestViewAPI.hpp>
+//#include <TestViewOfClass.hpp>
+//#include <TestViewSubview.hpp>
+//#include <TestAtomic.hpp>
+//#include <TestAtomicOperations.hpp>
+//#include <TestAtomicViews.hpp>
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+//#include <TestReduce.hpp>
+//#include <TestScan.hpp>
+//#include <TestAggregate.hpp>
+//#include <TestCompilerMacros.hpp>
+
+//TODO enable task scheduler tests for openmptarget
+//#include <TestTaskScheduler.hpp>
+
+//#include <TestMemoryPool.hpp>
+//#include <TestCXX11.hpp>
+//#include <TestCXX11Deduction.hpp>
+#include <TestTeamVector.hpp>
+//#include <TestTemplateMetaFunctions.hpp>
+//#include <TestPolicyConstruction.hpp>
+//#include <TestMDRange.hpp>
+
+namespace Test {
+
+class openmptarget : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+    const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+    const unsigned threads_per_core  = Kokkos::hwloc::get_available_threads_per_core();
+
+    unsigned openmptarget_count = 0;
+
+    openmptarget_count = std::max( 1u, numa_count )
+                  * std::max( 2u, cores_per_numa * threads_per_core );
+
+    Kokkos::OpenMPTarget::initialize( openmptarget_count );
+    Kokkos::print_configuration( std::cout, true /* detailed */ );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::OpenMPTarget::finalize();
+  }
+};
+
+} // namespace Test
+
+#endif
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicOperations.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicOperations.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e2d2e95e400a3edac5126c04c6e3fccb449a64f2
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicOperations.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestAtomicOperations.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicViews.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicViews.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a9232540e888e7b2c79769b372e496e37854d8d6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_AtomicViews.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestAtomicViews.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Atomics.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Atomics.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8285bc97f71a8a41973a719d9846e80f52e61dd0
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Atomics.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestAtomic.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Category.hpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Category.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9b9d943944266876da4e4a2f4eb4dcd56c430448
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Category.hpp
@@ -0,0 +1,65 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_OPENMPTARGET_HPP
+#define KOKKOS_TEST_OPENMPTARGET_HPP
+
+#include <gtest/gtest.h>
+
+namespace Test {
+
+class openmptarget : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {
+  }
+
+  static void TearDownTestCase() {
+  }
+};
+
+} // namespace Test
+
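+// TEST_CATEGORY names the gtest fixture defined above; TEST_EXECSPACE selects
+// the execution space that the shared unit-test headers instantiate against.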
+#define TEST_CATEGORY openmptarget
+#define TEST_EXECSPACE Kokkos::Experimental::OpenMPTarget
+
+#endif
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Complex.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Complex.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5d0525f1b05fe10b1ea05a8b0f7e323ec830c3e6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Complex.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestComplex.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Init.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Init.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..957bfe359a623d3f386a6b6c9a8b6a20807e051e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Init.cpp
@@ -0,0 +1,50 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestInit.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestPolicyConstruction.hpp>
+
+
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_MDRange.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_MDRange.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e8b5febbf70ddd9df9bd1149c19efe0f6a1cddc3
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_MDRange.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestMDRange.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Other.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Other.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fce37dd719ac3ea76303fe6e955ac2bccb18ed59
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Other.cpp
@@ -0,0 +1,50 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+#include <TestAggregate.hpp>
+#include <TestMemoryPool.hpp>
+#include <TestCXX11.hpp>
+#include <TestTile.hpp>
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_RangePolicy.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_RangePolicy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4fea6a32b4745d082b4254c1ce74b468f82ac909
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_RangePolicy.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestRange.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Reductions.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Reductions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1052fea30175f25decbcf66c6ad27c9a1a94cebf
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Reductions.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestReduce.hpp>
+#include <TestCXX11Deduction.hpp>
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Scan.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Scan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..56fc1a00a529b8650039d7ca42b2f93389629d13
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Scan.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestScan.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SharedAlloc.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SharedAlloc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e3e8ac281455f32a2f0b54dded26811d9ee729a3
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SharedAlloc.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestSharedAlloc.hpp>
+
+namespace Test {
+
+
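+// Exercise the shared-allocation tracking machinery for OpenMPTargetSpace,
+// driven from the default host execution space.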
+TEST_F( TEST_CATEGORY, impl_shared_alloc )
+{
+  test_shared_alloc< Kokkos::Experimental::OpenMPTargetSpace, Kokkos::DefaultHostExecutionSpace >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_a.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..df809ae2a911508cc7e1a22a18850cc1ca6b424f
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_a.cpp
@@ -0,0 +1,104 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_left )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_right )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_stride )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_assign_strided )
+{
+  TestViewSubview::test_1d_strided_assignment< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_0 )
+{
+  TestViewSubview::test_left_0< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_1 )
+{
+  TestViewSubview::test_left_1< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_2 )
+{
+  TestViewSubview::test_left_2< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_3 )
+{
+  TestViewSubview::test_left_3< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_0 )
+{
+  TestViewSubview::test_right_0< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_1 )
+{
+  TestViewSubview::test_right_1< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_3 )
+{
+  TestViewSubview::test_right_3< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_b.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5b84bc623fbb0e8325c9d1335704aba462e9162e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_b.cpp
@@ -0,0 +1,63 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_layoutleft_to_layoutleft )
+{
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE >();
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_layoutright_to_layoutright )
+{
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE >();
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c01.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c01.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b8d3fd5e75ed38fca7968bcce231a9de2feeeae
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c01.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c02.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c02.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bb1d02436769a61fb2ba6ac73e03390a93ba63af
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c02.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign_atomic )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c03.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c03.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ec31b3688e6c4018970f865faf03d5dbdbb56f36
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c03.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign_randomaccess )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c04.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c04.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c1db2cf0ec44bd069024e6fcc5d00a63c71e705e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c04.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c05.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c05.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a4d30f68b80a68ba4431db610c99e6ad104fdc4c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c05.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d_atomic )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c06.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c06.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..34b4dfb4092f84614a0cc4bfa3ba2b29a3b9dfef
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c06.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d_randomaccess )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c07.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c07.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..50ac0b06acbe83438df054153545b30ffdf14ac4
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c07.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c08.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c08.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5da4797472cc8d3c7e19f67c102abc5a575844d3
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c08.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c09.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c09.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9041e5121dc7782aa9e6a2133bc2cf36f9aa3725
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c09.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c10.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c10.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3887d2b70820002469900f2e6bf7cdb15b544eec
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c10.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c11.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4e36841887ee074b9bcb13c934b2d3cfecb4a17c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c11.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c12.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c12.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dc33f1e0a924e66fc6ab23a287e2a7079de17209
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_SubView_c12.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Team.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Team.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4eef325ea1ef0f0684a3f1d3cdf1d66621295e46
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_Team.cpp
@@ -0,0 +1,75 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, team_for )
+{
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 2 );
+
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+}
+
+
+TEST_F( TEST_CATEGORY, team_reduce )
+{
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
+}
+} // namespace Test
+
+#include <TestTeamVector.hpp>
+
+
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b8ea6ce7aea1fa1ffd85a5a9cd15886d0b6885d6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp
@@ -0,0 +1,81 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+
+TEST_F( TEST_CATEGORY, team_scan )
+{
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+}
+
+TEST_F( TEST_CATEGORY, team_long_reduce )
+{
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+TEST_F( TEST_CATEGORY, team_double_reduce )
+{
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+} // namespace Test
+
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_TeamScratch.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_TeamScratch.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f13bfd2dcbec815696ec5c688ef4507cae14701f
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_TeamScratch.cpp
@@ -0,0 +1,83 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, team_shared_request )
+{
+  TestSharedTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+TEST_F( TEST_CATEGORY, team_scratch_request )
+{
+  TestScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+TEST_F( TEST_CATEGORY, team_lambda_shared_request )
+{
+  TestLambdaSharedTeam< Kokkos::HostSpace, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+#endif
+#endif
+
+TEST_F( TEST_CATEGORY, shmem_size )
+{
+  TestShmemSize< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, multi_level_scratch )
+{
+  TestMultiLevelScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+} // namespace Test
+
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewAPI_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fc39a6135e6f3b392beff4697742f0b36ce3d7c1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewAPI_b.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewAPI.hpp>
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewMapping_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b9a84fdc1fe7b0110a3c589a2d573224fb19ea18
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewMapping_a.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewMapping_a.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewMapping_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a749270ed1637569113aef6bfe70534c1c483ae6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewMapping_b.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewMapping_b.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewMapping_subview.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5dc6517529c7fd2de52cfa277d13cc8fa98c99ae
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewMapping_subview.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewMapping_subview.hpp>
+
diff --git a/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewOfClass.cpp b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewOfClass.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d7c49ee46e9d0c8c88e6206eb2bba58743ca24b
--- /dev/null
+++ b/packages/kokkos/core/unit_test/openmptarget/TestOpenMPTarget_ViewOfClass.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <openmptarget/TestOpenMPTarget_Category.hpp>
+#include <TestViewOfClass.hpp>
+
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads.hpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..989ea48ce679c43c108296c6ce6dde9a42302d95
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads.hpp
@@ -0,0 +1,109 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_QTHREADS_HPP
+#define KOKKOS_TEST_QTHREADS_HPP
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_LAMBDA
+#undef KOKKOS_LAMBDA
+#endif
+#define KOKKOS_LAMBDA [=]
+
+#include <Kokkos_Core.hpp>
+
+#include <TestTile.hpp>
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+#include <TestViewAPI.hpp>
+#include <TestViewOfClass.hpp>
+#include <TestViewSubview.hpp>
+#include <TestAtomic.hpp>
+#include <TestAtomicOperations.hpp>
+#include <TestAtomicViews.hpp>
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestAggregate.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestTaskScheduler.hpp>
+#include <TestMemoryPool.hpp>
+#include <TestCXX11.hpp>
+#include <TestCXX11Deduction.hpp>
+#include <TestTeamVector.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+#include <TestPolicyConstruction.hpp>
+#include <TestMDRange.hpp>
+
+namespace Test {
+
+class qthreads : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+    const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+    const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+    const unsigned threads_count = std::max( 1u, numa_count ) *
+                                   std::max( 2u, ( cores_per_numa * threads_per_core ) / 2 );
+
+    Kokkos::Qthreads::initialize( threads_count );
+    Kokkos::print_configuration( std::cout, true );
+
+    srand( 10231 );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::Qthreads::finalize();
+  }
+};
+
+} // namespace Test
+
+#endif
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_Atomics.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_Atomics.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..199dbdcb9c47b9f596c78efee8acd58f177e6745
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_Atomics.cpp
@@ -0,0 +1,213 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, atomics )
+{
+#if 0
+  const int loop_count = 1e4;
+
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Qthreads >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Qthreads >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Qthreads >( 100, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Qthreads >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Qthreads >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Qthreads >( 100, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Qthreads >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Qthreads >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Qthreads >( 100, 3 ) ) );
+#endif
+}
+
+TEST_F( qthreads, atomic_operations )
+{
+#if 0
+  const int start = 1; // Avoid zero for division.
+  const int end = 11;
+
+  for ( int i = start; i < end; ++i )
+  {
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+  }
+#endif
+}
+
+TEST_F( qthreads, atomic_views_integral )
+{
+#if 0
+  const long length = 1000000;
+
+  {
+    // Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 8 ) ) );
+  }
+#endif
+}
+
+TEST_F( qthreads, atomic_views_nonintegral )
+{
+#if 0
+  const long length = 1000000;
+
+  {
+    // Non-Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Qthreads >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Qthreads >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Qthreads >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Qthreads >( length, 4 ) ) );
+  }
+#endif
+}
+
+TEST_F( qthreads, atomic_view_api )
+{
+#if 0
+  TestAtomicViews::TestAtomicViewAPI< int, Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_Category.hpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_Category.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4500863cb49ebfe3b763a61308764bff1db296a1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_Category.hpp
@@ -0,0 +1,65 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_QTHREADS_CATEGORY_HPP
+#define KOKKOS_TEST_QTHREADS_CATEGORY_HPP
+
+#include <gtest/gtest.h>
+
+namespace Test {
+
+class qthreads : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {
+  }
+
+  static void TearDownTestCase() {
+  }
+};
+
+} // namespace Test
+
+#define TEST_CATEGORY qthreads
+#define TEST_EXECSPACE Kokkos::Qthreads
+
+#endif
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_Complex.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_Complex.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..799b8454f8d8922bd5a509ed819d3f3adda57c26
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_Complex.cpp
@@ -0,0 +1,3 @@
+#include <qthreads/TestQthreads_Category.hpp>
+#include <TestComplex.hpp>
+
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_Other.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_Other.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a4c1ec227835eb581a4d5a73e116feeeea425f6c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_Other.cpp
@@ -0,0 +1,209 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, init )
+{
+  ;
+}
+
+TEST_F( qthreads, md_range )
+{
+#if 0
+  TestMDRange_2D< Kokkos::Qthreads >::test_for2( 100, 100 );
+  TestMDRange_3D< Kokkos::Qthreads >::test_for3( 100, 100, 100 );
+#endif
+}
+
+TEST_F( qthreads, policy_construction )
+{
+#if 0
+  TestRangePolicyConstruction< Kokkos::Qthreads >();
+  TestTeamPolicyConstruction< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, range_tag )
+{
+#if 0
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_scan( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 0 );
+
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_scan( 2 );
+
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 3 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 3 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 3 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 3 );
+
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_scan( 1000 );
+
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1001 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1001 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 1001 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 1000 );
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( qthreads, compiler_macros )
+{
+#if 0
+  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Qthreads >() ) );
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( qthreads, memory_pool )
+{
+#if 0
+
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_TASKDAG )
+
+TEST_F( qthreads, task_fib )
+{
+#if 0
+  const int N = 24; // 25 triggers TBD bug on Cuda/Pascal
+  for ( int i = 0; i < N; ++i ) {
+    TestTaskScheduler::TestFib< Kokkos::Qthreads >::run( i, ( i + 1 ) * ( i + 1 ) * 10000 );
+  }
+#endif
+}
+
+TEST_F( qthreads, task_depend )
+{
+#if 0
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestTaskDependence< Kokkos::Qthreads >::run( i );
+  }
+#endif
+}
+
+TEST_F( qthreads, task_team )
+{
+#if 0
+  TestTaskScheduler::TestTaskTeam< Kokkos::Qthreads >::run( 1000 );
+  //TestTaskScheduler::TestTaskTeamValue< Kokkos::Qthreads >::run( 1000 ); // Put back after testing.
+#endif
+}
+
+#endif // #if defined( KOKKOS_ENABLE_TASKDAG )
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+
+TEST_F( qthreads, cxx11 )
+{
+#if 0
+  if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Qthreads >::value ) {
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Qthreads >( 1 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Qthreads >( 2 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Qthreads >( 3 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Qthreads >( 4 ) ) );
+  }
+#endif
+}
+
+#endif
+
+TEST_F( qthreads, tile_layout )
+{
+#if 0
+  TestTile::test< Kokkos::Qthreads, 1, 1 >( 1, 1 );
+  TestTile::test< Kokkos::Qthreads, 1, 1 >( 2, 3 );
+  TestTile::test< Kokkos::Qthreads, 1, 1 >( 9, 10 );
+
+  TestTile::test< Kokkos::Qthreads, 2, 2 >( 1, 1 );
+  TestTile::test< Kokkos::Qthreads, 2, 2 >( 2, 3 );
+  TestTile::test< Kokkos::Qthreads, 2, 2 >( 4, 4 );
+  TestTile::test< Kokkos::Qthreads, 2, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Qthreads, 2, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Qthreads, 4, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Qthreads, 4, 4 >( 1, 1 );
+  TestTile::test< Kokkos::Qthreads, 4, 4 >( 4, 4 );
+  TestTile::test< Kokkos::Qthreads, 4, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Qthreads, 4, 4 >( 9, 11 );
+
+  TestTile::test< Kokkos::Qthreads, 8, 8 >( 1, 1 );
+  TestTile::test< Kokkos::Qthreads, 8, 8 >( 4, 4 );
+  TestTile::test< Kokkos::Qthreads, 8, 8 >( 9, 9 );
+  TestTile::test< Kokkos::Qthreads, 8, 8 >( 9, 11 );
+#endif
+}
+
+TEST_F( qthreads, dispatch )
+{
+#if 0
+  const int repeat = 100;
+  for ( int i = 0; i < repeat; ++i ) {
+    for ( int j = 0; j < repeat; ++j ) {
+      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Qthreads >( 0, j )
+                          , KOKKOS_LAMBDA( int ) {} );
+    }
+  }
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_Reductions.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_Reductions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f2e2c2339de91f31067dc807a16f76f0975b892a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_Reductions.cpp
@@ -0,0 +1,168 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, long_reduce )
+{
+#if 0
+  TestReduce< long, Kokkos::Qthreads >( 0 );
+  TestReduce< long, Kokkos::Qthreads >( 1000000 );
+#endif
+}
+
+TEST_F( qthreads, double_reduce )
+{
+#if 0
+  TestReduce< double, Kokkos::Qthreads >( 0 );
+  TestReduce< double, Kokkos::Qthreads >( 1000000 );
+#endif
+}
+
+TEST_F( qthreads, reducers )
+{
+#if 0
+  TestReducers< int, Kokkos::Qthreads >::execute_integer();
+  TestReducers< size_t, Kokkos::Qthreads >::execute_integer();
+  TestReducers< double, Kokkos::Qthreads >::execute_float();
+  TestReducers< Kokkos::complex<double>, Kokkos::Qthreads >::execute_basic();
+#endif
+}
+
+TEST_F( qthreads, long_reduce_dynamic )
+{
+#if 0
+  TestReduceDynamic< long, Kokkos::Qthreads >( 0 );
+  TestReduceDynamic< long, Kokkos::Qthreads >( 1000000 );
+#endif
+}
+
+TEST_F( qthreads, double_reduce_dynamic )
+{
+#if 0
+  TestReduceDynamic< double, Kokkos::Qthreads >( 0 );
+  TestReduceDynamic< double, Kokkos::Qthreads >( 1000000 );
+#endif
+}
+
+TEST_F( qthreads, long_reduce_dynamic_view )
+{
+#if 0
+  TestReduceDynamicView< long, Kokkos::Qthreads >( 0 );
+  TestReduceDynamicView< long, Kokkos::Qthreads >( 1000000 );
+#endif
+}
+
+TEST_F( qthreads, scan )
+{
+#if 0
+  TestScan< Kokkos::Qthreads >::test_range( 1, 1000 );
+  TestScan< Kokkos::Qthreads >( 0 );
+  TestScan< Kokkos::Qthreads >( 100000 );
+  TestScan< Kokkos::Qthreads >( 10000000 );
+  Kokkos::Qthreads::fence();
+#endif
+}
+
+TEST_F( qthreads, scan_small )
+{
+#if 0
+  typedef TestScan< Kokkos::Qthreads, Kokkos::Impl::QthreadsExecUseScanSmall > TestScanFunctor;
+
+  for ( int i = 0; i < 1000; ++i ) {
+    TestScanFunctor( 10 );
+    TestScanFunctor( 10000 );
+  }
+  TestScanFunctor( 1000000 );
+  TestScanFunctor( 10000000 );
+
+  Kokkos::Qthreads::fence();
+#endif
+}
+
+TEST_F( qthreads, team_scan )
+{
+#if 0
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+#endif
+}
+
+TEST_F( qthreads, team_long_reduce )
+{
+#if 0
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+#endif
+}
+
+TEST_F( qthreads, team_double_reduce )
+{
+#if 0
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+#endif
+}
+
+TEST_F( qthreads, reduction_deduction )
+{
+#if 0
+  TestCXX11::test_reduction_deduction< Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_a.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8fdee75d86b1c479b5cd4a97a84c892ff74c4971
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_a.cpp
@@ -0,0 +1,125 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_auto_1d_left )
+{
+#if 0
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_auto_1d_right )
+{
+#if 0
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_auto_1d_stride )
+{
+#if 0
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_assign_strided )
+{
+#if 0
+  TestViewSubview::test_1d_strided_assignment< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_left_0 )
+{
+#if 0
+  TestViewSubview::test_left_0< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_left_1 )
+{
+#if 0
+  TestViewSubview::test_left_1< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_left_2 )
+{
+#if 0
+  TestViewSubview::test_left_2< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_left_3 )
+{
+#if 0
+  TestViewSubview::test_left_3< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_right_0 )
+{
+#if 0
+  TestViewSubview::test_right_0< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_right_1 )
+{
+#if 0
+  TestViewSubview::test_right_1< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_right_3 )
+{
+#if 0
+  TestViewSubview::test_right_3< Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_b.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e32a42461bb78b37a5796eb4d4a2ca69819024a5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_b.cpp
@@ -0,0 +1,66 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_layoutleft_to_layoutleft )
+{
+#if 0
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Qthreads >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_layoutright_to_layoutright )
+{
+#if 0
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Qthreads >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c01.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c01.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a0ada2859d126858435ce0286b606890934056b4
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c01.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_1d_assign )
+{
+#if 0
+  TestViewSubview::test_1d_assign< Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c02.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c02.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1b0e09811347b63a515c643b8ac123e85e34997e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c02.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_1d_assign_atomic )
+{
+#if 0
+  TestViewSubview::test_1d_assign< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c03.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c03.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2062fbeaf023286c0d18857421f2a0948408087a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c03.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_1d_assign_randomaccess )
+{
+#if 0
+  TestViewSubview::test_1d_assign< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c04.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c04.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..07b84de90e30fbe1422deaeb6e6b5349e7c600d1
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c04.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_2d_from_3d )
+{
+#if 0
+  TestViewSubview::test_2d_subview_3d< Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c05.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c05.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4d5d2a9915aafe02382b181ff75c70f774a2ba4c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c05.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_2d_from_3d_atomic )
+{
+#if 0
+  TestViewSubview::test_2d_subview_3d< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c06.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c06.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4330f00a6242a3aefe7a89c23eff6b81f5b6eb4d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c06.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_2d_from_3d_randomaccess )
+{
+#if 0
+  TestViewSubview::test_2d_subview_3d< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c07.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c07.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..154f5de69481574a9bd304593dc89b08c25a1cf4
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c07.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_left )
+{
+#if 0
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c08.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c08.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..66832558de433055f4e8a74b106cfb591143d6cf
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c08.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_left_atomic )
+{
+#if 0
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c09.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c09.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5305ce1defe448030894dcb3e7dc9a0a56012b66
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c09.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_left_randomaccess )
+{
+#if 0
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c10.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c10.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..039414a6813b28bda02f277b7d9b1322ddb7edf5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c10.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_right )
+{
+#if 0
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c11.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..05dcde77e2219158f1a6789bbf6f2a2c8271b92e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c11.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_right_atomic )
+{
+#if 0
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c12.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c12.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..58d79d609d1e13432ad63f36f7f20df0ac3a0cd5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c12.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_right_randomaccess )
+{
+#if 0
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c13.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c13.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..347c4a5b86a8b52c821f74a77adbbf1b08ba7817
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c13.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_test_unmanaged_subview_reset )
+{
+#if 0
+  TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c_all.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c_all.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d8ea958a5d4911d0fa39eb34f6fc231cb1e95e1c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c_all.cpp
@@ -0,0 +1,13 @@
+#include <qthreads/TestQthreads_SubView_c01.cpp>
+#include <qthreads/TestQthreads_SubView_c02.cpp>
+#include <qthreads/TestQthreads_SubView_c03.cpp>
+#include <qthreads/TestQthreads_SubView_c04.cpp>
+#include <qthreads/TestQthreads_SubView_c05.cpp>
+#include <qthreads/TestQthreads_SubView_c06.cpp>
+#include <qthreads/TestQthreads_SubView_c07.cpp>
+#include <qthreads/TestQthreads_SubView_c08.cpp>
+#include <qthreads/TestQthreads_SubView_c09.cpp>
+#include <qthreads/TestQthreads_SubView_c10.cpp>
+#include <qthreads/TestQthreads_SubView_c11.cpp>
+#include <qthreads/TestQthreads_SubView_c12.cpp>
+#include <qthreads/TestQthreads_SubView_c13.cpp>
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_Team.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_Team.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2b0d397804cf7c77144992b6c60bdde0921f0107
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_Team.cpp
@@ -0,0 +1,143 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, team_tag )
+{
+#if 0
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 2 );
+
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
+#endif
+}
+
+TEST_F( qthreads, team_shared_request )
+{
+#if 0
+  TestSharedTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >();
+#endif
+}
+
+TEST_F( qthreads, team_scratch_request )
+{
+#if 0
+  TestScratchTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >();
+#endif
+}
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+TEST_F( qthreads, team_lambda_shared_request )
+{
+#if 0
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >();
+#endif
+}
+#endif
+
+TEST_F( qthreads, shmem_size )
+{
+#if 0
+  TestShmemSize< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, multi_level_scratch )
+{
+#if 0
+  TestMultiLevelScratchTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >();
+#endif
+}
+
+TEST_F( qthreads, team_vector )
+{
+#if 0
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 0 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 1 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 2 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 3 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 4 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 5 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 6 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 7 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 8 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 9 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 10 ) ) );
+#endif
+}
+
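+// Skip the triple-nested-parallelism test for GCC 4.7.2, presumably to work
+// around a known issue that compiler version has with the deeply nested
+// lambda/template code exercised below.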
+#ifdef KOKKOS_COMPILER_GNU
+#if ( KOKKOS_COMPILER_GNU == 472 )
+#define SKIP_TEST
+#endif
+#endif
+
+#ifndef SKIP_TEST
+TEST_F( qthreads, triple_nested_parallelism )
+{
+#if 0
+  TestTripleNestedReduce< double, Kokkos::Qthreads >( 8192, 2048, 32, 32 );
+  TestTripleNestedReduce< double, Kokkos::Qthreads >( 8192, 2048, 32, 16 );
+  TestTripleNestedReduce< double, Kokkos::Qthreads >( 8192, 2048, 16, 16 );
+#endif
+}
+#endif
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_a.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a68049e79d19a92a015dc8936bc9c94ace1f17f5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_a.cpp
@@ -0,0 +1,56 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, impl_view_mapping_a )
+{
+#if 0
+  test_view_mapping< Kokkos::Qthreads >();
+  test_view_mapping_operator< Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3bfd1763b8179ded3bb080e8570da7339d32be52
--- /dev/null
+++ b/packages/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_b.cpp
@@ -0,0 +1,138 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, impl_shared_alloc )
+{
+#if 0
+  test_shared_alloc< Kokkos::HostSpace, Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, impl_view_mapping_b )
+{
+#if 0
+  test_view_mapping_subview< Kokkos::Qthreads >();
+  TestViewMappingAtomic< Kokkos::Qthreads >::run();
+#endif
+}
+
+TEST_F( qthreads, view_api )
+{
+#if 0
+  TestViewAPI< double, Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_nested_view )
+{
+#if 0
+  ::Test::view_nested_view< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_remap )
+{
+#if 0
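+  // The disabled body exercises layout remapping: deep_copy from a LayoutLeft
+  // input View into a LayoutRight output View reorders the data element by
+  // element, while copying into diff_type would throw because its shape is
+  // incompatible (see the commented-out deep_copy below).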
+  enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3],
+                        Kokkos::LayoutRight,
+                        Kokkos::Qthreads > output_type;
+
+  typedef Kokkos::View< int**[N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::Qthreads > input_type;
+
+  typedef Kokkos::View< int*[N0][N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::Qthreads > diff_type;
+
+  output_type output( "output", N0 );
+  input_type  input ( "input", N0, N1 );
+  diff_type   diff  ( "diff", N0 );
+
+  int value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    input( i0, i1, i2, i3 ) = ++value;
+  }
+
+  // Kokkos::deep_copy( diff, input ); // Throw with incompatible shape.
+  Kokkos::deep_copy( output, input );
+
+  value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    ++value;
+    ASSERT_EQ( value, ( (int) output( i0, i1, i2, i3 ) ) );
+  }
+#endif
+}
+
+TEST_F( qthreads, view_aggregate )
+{
+#if 0
+  TestViewAggregate< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, template_meta_functions )
+{
+#if 0
+  TestTemplateMetaFunctions< int, Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_Category.hpp b/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_Category.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f77d83e8133fa4cf2fbbf401a1f64db94602225
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_Category.hpp
@@ -0,0 +1,65 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_ROCM_HOSTPINNED_HPP
+#define KOKKOS_TEST_ROCM_HOSTPINNED_HPP
+
+#include <gtest/gtest.h>
+
+namespace Test {
+
+class rocm_hostpinned : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {
+  }
+
+  static void TearDownTestCase() {
+  }
+};
+
+} // namespace Test
+
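+// The generic unit-test headers (e.g. TestViewAPI.hpp, TestSharedAlloc.hpp) are written
+// against the TEST_CATEGORY and TEST_EXECSPACE macros defined below, so a translation
+// unit only needs to include this header first to instantiate the shared suite for the
+// host-pinned ROCm space, e.g.:
+//
+//   #include <rocm/TestROCmHostPinned_Category.hpp>
+//   #include <TestViewAPI.hpp>  // expands TEST_F( TEST_CATEGORY, ... ) for this space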
+#define TEST_CATEGORY rocm_hostpinned
+#define TEST_EXECSPACE Kokkos::Experimental::ROCmHostPinnedSpace
+
+#endif
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_SharedAlloc.cpp b/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_SharedAlloc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..de058aaead45a2b5564e9380183221624b3b318d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_SharedAlloc.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmHostPinned_Category.hpp>
+#include <TestSharedAlloc.hpp>
+
+namespace Test {
+
+
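+// Exercises the shared-allocation record bookkeeping (allocation, reference
+// counting, deallocation) for ROCm host-pinned memory, driven from the default
+// host execution space.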
+TEST_F( TEST_CATEGORY, impl_shared_alloc )
+{
+  test_shared_alloc< TEST_EXECSPACE, Kokkos::DefaultHostExecutionSpace >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewAPI.cpp b/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewAPI.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..93ddf8cc852a6f27ab5a1ff627387402f183d3d5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewAPI.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmHostPinned_Category.hpp>
+#include <TestViewAPI.hpp>
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewMapping_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1e44f51778c3c034fa8c46089533f8c1a054b0f3
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewMapping_a.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmHostPinned_Category.hpp>
+#include <TestViewMapping_a.hpp>
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewMapping_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5d68a0c6174e9de302cc813d526e4bab1d6f5c3
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewMapping_b.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmHostPinned_Category.hpp>
+#include <TestViewMapping_b.hpp>
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewMapping_subview.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ef9b8e3702913774614b0ba8f09c3b556c907f95
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCmHostPinned_ViewMapping_subview.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmHostPinned_Category.hpp>
+#include <TestViewMapping_subview.hpp>
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_All.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_All.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a9c7e51b6230104eb8419980c93b79149a6881d6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_All.cpp
@@ -0,0 +1,33 @@
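+// Amalgamated translation unit that pulls the individual ROCm test sources into a
+// single compile; most entries are commented out, presumably until the ROCm backend
+// supports the corresponding features.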
+#include "rocm/TestROCm_Init.cpp"
+
+//#include "rocm/TestROCm_Complex.cpp"
+#include "rocm/TestROCm_Reductions.cpp"
+//#include "rocm/TestROCm_RangePolicy.cpp"
+//#include "rocm/TestROCm_AtomicOperations.cpp"
+//#include "rocm/TestROCm_Atomics.cpp"
+//#include "rocm/TestROCm_AtomicViews.cpp"
+//#include "rocm/TestROCm_Other.cpp"
+//#include "rocm/TestROCm_Scan.cpp"
+//#include "rocm/TestROCm_SharedAlloc.cpp"
+//#include "rocm/TestROCm_SubView_a.cpp"
+//#include "rocm/TestROCm_SubView_b.cpp"
+//#include "rocm/TestROCm_SubView_c01.cpp"
+//#include "rocm/TestROCm_SubView_c02.cpp"
+//#include "rocm/TestROCm_SubView_c03.cpp"
+//#include "rocm/TestROCm_SubView_c04.cpp"
+//#include "rocm/TestROCm_SubView_c05.cpp"
+//#include "rocm/TestROCm_SubView_c06.cpp"
+//#include "rocm/TestROCm_SubView_c07.cpp"
+//#include "rocm/TestROCm_SubView_c08.cpp"
+//#include "rocm/TestROCm_SubView_c09.cpp"
+//#include "rocm/TestROCm_SubView_c10.cpp"
+//#include "rocm/TestROCm_SubView_c11.cpp"
+//#include "rocm/TestROCm_SubView_c12.cpp"
+//#include "rocm/TestROCm_Team.cpp"
+//#include "rocm/TestROCm_TeamReductionScan.cpp"
+//#include "rocm/TestROCm_TeamScratch.cpp"
+//#include "rocm/TestROCm_ViewAPI_b.cpp"
+//#include "rocm/TestROCm_ViewMapping_a.cpp"
+//#include "rocm/TestROCm_ViewMapping_b.cpp"
+//#include "rocm/TestROCm_ViewMapping_subview.cpp"
+//#include "rocm/TestROCm_ViewOfClass.cpp"
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_AtomicOperations.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_AtomicOperations.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..50ed7ac442a2e4518b4e1f17aa172bd37b71090b
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_AtomicOperations.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestAtomicOperations.hpp>
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_AtomicViews.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_AtomicViews.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..385fbe8bbbf1b8549f8a87dbc53a21b6981427f6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_AtomicViews.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestAtomicViews.hpp>
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_Atomics.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_Atomics.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a7cbc3de2f19ec25f0733a03c2bd8a310dcab4a5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_Atomics.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestAtomic.hpp>
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_Category.hpp b/packages/kokkos/core/unit_test/rocm/TestROCm_Category.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d520bbc5a7c827f78c3ac33e46bbfaf7e1c757e9
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_Category.hpp
@@ -0,0 +1,65 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_ROCM_HPP
+#define KOKKOS_TEST_ROCM_HPP
+
+#include <gtest/gtest.h>
+
+namespace Test {
+
+class rocm : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {
+  }
+
+  static void TearDownTestCase() {
+  }
+};
+
+} // namespace Test
+
+#define TEST_CATEGORY rocm
+#define TEST_EXECSPACE Kokkos::Experimental::ROCm
+
+#endif
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_Complex.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_Complex.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5488925a5d6eea1617e026ca19929732b55a9edf
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_Complex.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestComplex.hpp>
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_Init.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_Init.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a1d47810bfbb02aeb4ae796595b4c2f3f33a53b8
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_Init.cpp
@@ -0,0 +1,50 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestInit.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestPolicyConstruction.hpp>
+
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_MDRange.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_MDRange.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..765d0adfc2d5dc36e7f7b025d53ee4478eb6a0cd
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_MDRange.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestMDRange.hpp>
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_Other.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_Other.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3396265b0fb5444b1044264ad126d1098fbfca4d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_Other.cpp
@@ -0,0 +1,52 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+#include <TestAggregate.hpp>
+//#include <TestMemoryPool.hpp>
+#include <TestCXX11.hpp>
+#include <TestTile.hpp>
+
+#include <TestViewCtorPropEmbeddedDim.hpp>
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_RangePolicy.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_RangePolicy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dd390a0a3df271961c3b2ea650b2f77629e1f31d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_RangePolicy.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestRange.hpp>
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_Reductions.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_Reductions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..611bb34fd392af1ed6ffe625416adce7a7c9ba5d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_Reductions.cpp
@@ -0,0 +1,48 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestFunctorAnalysis.hpp>
+#include <TestReduce.hpp>
+#include <TestCXX11Deduction.hpp>
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_Scan.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_Scan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f13a98331140cd3e1abcfa867741d566ac4ff8c5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_Scan.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestScan.hpp>
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_SharedAlloc.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_SharedAlloc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c2c68d021c806787c8d75e5f45ffe21ac6681e7d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_SharedAlloc.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestSharedAlloc.hpp>
+
+namespace Test {
+
+
+TEST_F( TEST_CATEGORY, impl_shared_alloc )
+{
+  test_shared_alloc< Kokkos::Experimental::ROCmSpace, Kokkos::DefaultHostExecutionSpace >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_Spaces.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_Spaces.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..04a7378ec9376f78ae4d77e4fd395b43fb2b50a3
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_Spaces.cpp
@@ -0,0 +1,196 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <rocm/TestROCm_Category.hpp>
+
+namespace Test {
+
+KOKKOS_INLINE_FUNCTION
+void test_abort()
+{
+  Kokkos::abort( "test_abort" );
+}
+
+KOKKOS_INLINE_FUNCTION
+void test_rocm_spaces_int_value( int * ptr )
+{
+  if ( *ptr == 42 ) { *ptr = 2 * 42; }
+}
+
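+// The static_asserts below record which memory spaces are expected to be
+// assignable and/or accessible from one another (HostSpace, ROCmSpace, and
+// ROCmHostPinnedSpace). Roughly, "accessible" means the first space can read
+// memory of the second, while "assignable" additionally allows View handles
+// to be assigned across the two spaces.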
+TEST_F( rocm, space_access )
+{
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::HostSpace >::assignable, "" );
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::Experimental::ROCmHostPinnedSpace >::assignable, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::Experimental::ROCmSpace >::assignable, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::Experimental::ROCmSpace >::accessible, "" );
+
+  //--------------------------------------
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::ROCmSpace, Kokkos::Experimental::ROCmSpace >::assignable, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::ROCmSpace, Kokkos::Experimental::ROCmHostPinnedSpace >::assignable, "" );
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::ROCmSpace, Kokkos::Experimental::ROCmHostPinnedSpace >::accessible, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::ROCmSpace, Kokkos::HostSpace >::assignable, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::ROCmSpace, Kokkos::HostSpace >::accessible, "" );
+
+  //--------------------------------------
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::ROCmHostPinnedSpace, Kokkos::Experimental::ROCmHostPinnedSpace >::assignable, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::ROCmHostPinnedSpace, Kokkos::HostSpace >::assignable, "" );
+
+  static_assert(
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::ROCmHostPinnedSpace, Kokkos::HostSpace >::accessible, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::ROCmHostPinnedSpace, Kokkos::Experimental::ROCmSpace >::assignable, "" );
+
+  static_assert(
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::ROCmHostPinnedSpace, Kokkos::Experimental::ROCmSpace >::accessible, "" );
+
+  //--------------------------------------
+
+  static_assert(
+    ! Kokkos::Impl::SpaceAccessibility< Kokkos::Experimental::ROCm, Kokkos::HostSpace >::accessible, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility< Kokkos::Experimental::ROCm, Kokkos::Experimental::ROCmSpace >::accessible, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility< Kokkos::Experimental::ROCm, Kokkos::Experimental::ROCmHostPinnedSpace >::accessible, "" );
+
+  static_assert(
+    ! Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, Kokkos::Experimental::ROCmSpace >::accessible, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, Kokkos::Experimental::ROCmHostPinnedSpace >::accessible, "" );
+
+  static_assert(
+    std::is_same< Kokkos::Impl::HostMirror< Kokkos::Experimental::ROCmSpace >::Space
+                , Kokkos::HostSpace >::value, "" );
+
+  static_assert(
+    std::is_same< Kokkos::Impl::HostMirror< Kokkos::Experimental::ROCmHostPinnedSpace >::Space
+                , Kokkos::Experimental::ROCmHostPinnedSpace >::value, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility
+      < Kokkos::Impl::HostMirror< Kokkos::Experimental::ROCm >::Space
+      , Kokkos::HostSpace
+      >::accessible, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility
+      < Kokkos::Impl::HostMirror< Kokkos::Experimental::ROCmSpace >::Space
+      , Kokkos::HostSpace
+      >::accessible, "" );
+
+  static_assert(
+    Kokkos::Impl::SpaceAccessibility
+      < Kokkos::Impl::HostMirror< Kokkos::Experimental::ROCmHostPinnedSpace >::Space
+      , Kokkos::HostSpace
+      >::accessible, "" );
+}
+
+template< class MemSpace, class ExecSpace >
+struct TestViewROCmAccessible {
+  enum { N = 1000 };
+
+  using V = Kokkos::View< double*, MemSpace >;
+
+  V m_base;
+
+  struct TagInit {};
+  struct TagTest {};
+
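+  // Tag-dispatched functors: the TagInit overload fills the view, and the
+  // TagTest overload re-reads it from a possibly different execution space,
+  // counting mismatches via a parallel_reduce.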
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagInit &, const int i ) const { m_base[i] = i + 1; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagTest &, const int i, long & error_count ) const
+  { if ( m_base[i] != i + 1 ) ++error_count; }
+
+  TestViewROCmAccessible()
+    : m_base( "base", N )
+    {}
+
+  static void run()
+  {
+    TestViewROCmAccessible self;
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename MemSpace::execution_space, TagInit >( 0, N ), self );
+    MemSpace::execution_space::fence();
+
+    // The next access runs in a different execution space, so the prior kernel must complete first.
+    long error_count = -1;
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, TagTest >( 0, N ), self, error_count );
+    EXPECT_EQ( error_count, 0 );
+  }
+};
+
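+// Run the accessibility check for the three supported combinations:
+// device memory driven by the ROCm execution space, and host-pinned memory
+// driven by both the ROCm and the host execution spaces.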
+TEST_F( rocm, impl_view_accessible )
+{
+  TestViewROCmAccessible< Kokkos::Experimental::ROCmSpace, Kokkos::Experimental::ROCm >::run();
+
+  TestViewROCmAccessible< Kokkos::Experimental::ROCmHostPinnedSpace, Kokkos::Experimental::ROCm >::run();
+  TestViewROCmAccessible< Kokkos::Experimental::ROCmHostPinnedSpace, Kokkos::HostSpace::execution_space >::run();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_a.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dd88d3d5ec9bdd15c5fa2807478a4278ceafcef9
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_a.cpp
@@ -0,0 +1,104 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_left )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_right )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_stride )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_assign_strided )
+{
+  TestViewSubview::test_1d_strided_assignment< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_0 )
+{
+  TestViewSubview::test_left_0< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_1 )
+{
+  TestViewSubview::test_left_1< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_2 )
+{
+  TestViewSubview::test_left_2< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_3 )
+{
+  TestViewSubview::test_left_3< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_0 )
+{
+  TestViewSubview::test_right_0< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_1 )
+{
+  TestViewSubview::test_right_1< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_3 )
+{
+  TestViewSubview::test_right_3< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_b.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5b64dbf2140c592e43496c84c018411b6e310595
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_b.cpp
@@ -0,0 +1,63 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_layoutleft_to_layoutleft )
+{
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE >();
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_layoutright_to_layoutright )
+{
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE >();
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c01.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c01.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..29eea90807a443b0922ed9586b97e4254ade5351
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c01.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c02.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c02.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ba4d224e664e16f3a97e021b62ba40b4d1ae2c11
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c02.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign_atomic )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c03.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c03.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d8816cc8ceb4cf5094059e26d56fc08363eaefc7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c03.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign_randomaccess )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c04.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c04.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6ec5dcd47fb43b07cb454cc1ba2748a439825034
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c04.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c05.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c05.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5928200f783953880ddc5b922c416e1fb5eb468a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c05.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d_atomic )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c06.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c06.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0954c53dca51750f094a5b2c5928fc0d3c583742
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c06.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d_randomaccess )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c07.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c07.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..965d99f5dde5fcc636bf30272586e3f5fba0b2bb
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c07.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c08.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c08.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e02158a2c0a6f1a1bd2b0a592c7c2867614e1b06
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c08.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c09.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c09.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3c6e67b1d134dad25856fa30267da4bd28c4c361
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c09.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c10.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c10.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2bd27835cfd4937a65b37abd077d6741a7c1c9c0
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c10.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c11.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3a537ddc20a921b5bea87379e4f4c08d8dfbb0f6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c11.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c12.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c12.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d6a48f822cc56be2c967b904c29485ae4c51c532
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_SubView_c12.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCmUVM_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_Team.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_Team.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f35ab77d35c18d8ebe0d1c14bf5722408e089c5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_Team.cpp
@@ -0,0 +1,75 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
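+// Exercise TeamPolicy parallel_for and parallel_reduce with both static and
+// dynamic scheduling over empty (0), small (2), and large (1000) league sizes.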
+TEST_F( TEST_CATEGORY, team_for )
+{
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 2 );
+
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+}
+
+
+TEST_F( TEST_CATEGORY, team_reduce )
+{
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
+}
+} // namespace Test
+
+#include <TestTeamVector.hpp>
+
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_TeamReductionScan.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_TeamReductionScan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e1025f1baa140c4d95fca606a58727eedde4f86c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_TeamReductionScan.cpp
@@ -0,0 +1,82 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+#if !defined(KOKKOS_ROCM_CLANG_WORKAROUND)
+TEST_F( TEST_CATEGORY, team_scan )
+{
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+}
+#endif
+
+TEST_F( TEST_CATEGORY, team_long_reduce )
+{
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+TEST_F( TEST_CATEGORY, team_double_reduce )
+{
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+} // namespace Test
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_TeamScratch.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_TeamScratch.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1968ab31ea0a705d97d8a8780d4cca9573174a93
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_TeamScratch.cpp
@@ -0,0 +1,83 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
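+// Cover the team-level scratch memory paths: per-team shared allocations,
+// explicit scratch-size requests, lambda-captured shared views (where lambda
+// dispatch is available), shmem_size queries, and two-level scratch.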
+TEST_F( TEST_CATEGORY, team_shared_request )
+{
+  TestSharedTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+TEST_F( TEST_CATEGORY, team_scratch_request )
+{
+  TestScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_ROCM) || ( 8000 <= ROCM_VERSION )
+TEST_F( TEST_CATEGORY, team_lambda_shared_request )
+{
+  TestLambdaSharedTeam< Kokkos::HostSpace, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+#endif
+#endif
+
+TEST_F( TEST_CATEGORY, shmem_size )
+{
+  TestShmemSize< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, multi_level_scratch )
+{
+  TestMultiLevelScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+} // namespace Test
+
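As context for the scratch-request tests registered above, a hedged sketch of the per-team scratch pattern that TestScratchTeam exercises (standard Kokkos 2.x TeamPolicy API; the scratch extent of 128 doubles is an illustrative choice):

    #include <Kokkos_Core.hpp>

    template< class ExecSpace >
    void example_team_scratch( const int league_size )
    {
      typedef Kokkos::TeamPolicy< ExecSpace > policy_t;
      typedef Kokkos::View< double *, typename ExecSpace::scratch_memory_space,
                            Kokkos::MemoryUnmanaged > scratch_view;

      const size_t bytes = scratch_view::shmem_size( 128 );

      Kokkos::parallel_for(
        policy_t( league_size, Kokkos::AUTO ).set_scratch_size( 0, Kokkos::PerTeam( bytes ) ),
        KOKKOS_LAMBDA( const typename policy_t::member_type & team ) {
          // Level-0 scratch is handed out per team and shared by its threads.
          scratch_view tmp( team.team_scratch( 0 ), 128 );
          tmp( team.team_rank() % 128 ) = team.league_rank();
        } );
    }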
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_ViewAPI_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b307d9c81046a741c046b2fae56c7fc8251ecb26
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_ViewAPI_b.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestViewAPI.hpp>
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_ViewMapping_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1d79e6a63d57f99385686d948c10a43bd324b566
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_ViewMapping_a.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestViewMapping_a.hpp>
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_ViewMapping_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..15bf92affe3d42dfd0ad6c5327def1e8dbf55344
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_ViewMapping_b.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestViewMapping_b.hpp>
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_ViewMapping_subview.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7e35355215bd48188dff435184c8741148651ca5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_ViewMapping_subview.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestViewMapping_subview.hpp>
+
diff --git a/packages/kokkos/core/unit_test/rocm/TestROCm_ViewOfClass.cpp b/packages/kokkos/core/unit_test/rocm/TestROCm_ViewOfClass.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c03385b64e779bb79b3c3526cc28611ecab9474c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/rocm/TestROCm_ViewOfClass.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <rocm/TestROCm_Category.hpp>
+#include <TestViewOfClass.hpp>
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_AtomicOperations.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_AtomicOperations.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9e8345f06516e47a18a6e4877c4d39fd450afc1f
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_AtomicOperations.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestAtomicOperations.hpp>
+
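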
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_AtomicViews.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_AtomicViews.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a03c74a00826de3eae80bc9694a7350e07adb81d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_AtomicViews.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestAtomicViews.hpp>
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Atomics.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_Atomics.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..398ab10ccff423ca03fe3a1e0d10b2ffe9395aee
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_Atomics.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestAtomic.hpp>
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Category.hpp b/packages/kokkos/core/unit_test/serial/TestSerial_Category.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f052a2e7755d86909ae8d770197f1917af581134
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_Category.hpp
@@ -0,0 +1,65 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_SERIAL_HPP
+#define KOKKOS_TEST_SERIAL_HPP
+
+#include <gtest/gtest.h>
+
+namespace Test {
+
+class serial : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {
+  }
+
+  static void TearDownTestCase() {
+  }
+};
+
+} // namespace Test
+
+#define TEST_CATEGORY serial
+#define TEST_EXECSPACE Kokkos::Serial
+
+#endif
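The category header above only supplies the gtest fixture and the TEST_CATEGORY / TEST_EXECSPACE macros; the translation units in this directory pair it with the shared Test*.hpp headers. A minimal hedged sketch of that pattern (the test body is illustrative and not taken from the shared headers):

    #include <serial/TestSerial_Category.hpp>  // TEST_CATEGORY = serial, TEST_EXECSPACE = Kokkos::Serial
    #include <Kokkos_Core.hpp>

    namespace Test {

    TEST_F( TEST_CATEGORY, example_range_reduce )
    {
      long result = 0;
      Kokkos::parallel_reduce(
        Kokkos::RangePolicy< TEST_EXECSPACE >( 0, 100 ),
        KOKKOS_LAMBDA( const int i, long & sum ) { sum += i; }, result );
      ASSERT_EQ( result, 4950l );
    }

    } // namespace Test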
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Complex.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_Complex.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fa9eacd2b33e6d76cbaf549e0a4650ef2ad41c0c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_Complex.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestComplex.hpp>
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Crs.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_Crs.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..367c1028a8deceae575fd26ef752673b30f7b010
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_Crs.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestCrs.hpp>
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Init.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_Init.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ccadb041ce134da319840211eddeab93ea2133bd
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_Init.cpp
@@ -0,0 +1,50 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestInit.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestPolicyConstruction.hpp>
+
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_MDRange.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_MDRange.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6be3a8aa209c54ed37e7cc422c6eaa062385ac9d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_MDRange.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestMDRange.hpp>
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Other.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_Other.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0c3bae3774446b0d76f75ebdc5ccec89b0f9a4c5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_Other.cpp
@@ -0,0 +1,52 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+#include <TestAggregate.hpp>
+#include <TestMemoryPool.hpp>
+#include <TestCXX11.hpp>
+#include <TestTile.hpp>
+
+#include <TestViewCtorPropEmbeddedDim.hpp>
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_RangePolicy.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_RangePolicy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5740eb74fc6a12fc4e516e0b60555776bf3e8967
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_RangePolicy.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestRange.hpp>
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Reductions.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_Reductions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7b5f9b0dfa69c15efbccbafba99e9b3d3aed0ae6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_Reductions.cpp
@@ -0,0 +1,47 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestReduce.hpp>
+#include <TestCXX11Deduction.hpp>
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Scan.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_Scan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..23ca25d48c2d6143b36aa3a7fb2f49a9d5fb2dbb
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_Scan.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestScan.hpp>
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SharedAlloc.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SharedAlloc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..348ce654285924933e5547a6d93042a17a3c6736
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SharedAlloc.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestSharedAlloc.hpp>
+
+namespace Test {
+
+
+TEST_F( TEST_CATEGORY, impl_shared_alloc )
+{
+  test_shared_alloc< Kokkos::HostSpace, TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_a.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b9041ae44123b0ca17935f708ae8415204a08088
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_a.cpp
@@ -0,0 +1,104 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_left )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_right )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_stride )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_assign_strided )
+{
+  TestViewSubview::test_1d_strided_assignment< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_0 )
+{
+  TestViewSubview::test_left_0< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_1 )
+{
+  TestViewSubview::test_left_1< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_2 )
+{
+  TestViewSubview::test_left_2< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_3 )
+{
+  TestViewSubview::test_left_3< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_0 )
+{
+  TestViewSubview::test_right_0< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_1 )
+{
+  TestViewSubview::test_right_1< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_3 )
+{
+  TestViewSubview::test_right_3< TEST_EXECSPACE >();
+}
+
+} // namespace Test
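For reference, a hedged sketch of the slicing these subview tests cover (standard Kokkos::subview semantics; the extents are illustrative):

    #include <Kokkos_Core.hpp>

    template< class ExecSpace >
    void example_subview()
    {
      Kokkos::View< double **, Kokkos::LayoutLeft, ExecSpace > a( "a", 10, 3 );

      // Fix the second index: the result is a rank-1 view aliasing column 1 of 'a'
      // (contiguous for LayoutLeft).
      auto col = Kokkos::subview( a, Kokkos::ALL(), 1 );

      // Keep a range in the first dimension: still rank-2, with extent(0) == 5.
      auto rows = Kokkos::subview( a, Kokkos::pair< int, int >( 2, 7 ), Kokkos::ALL() );

      (void) col; (void) rows;
    }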
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_b.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b6869f95f19bb8a31a1cc6e6159e8d5cd6d5820f
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_b.cpp
@@ -0,0 +1,63 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_layoutleft_to_layoutleft )
+{
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE >();
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_layoutright_to_layoutright )
+{
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE >();
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c01.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c01.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..395880493fe0cafb1b07149cb90d43d227269e57
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c01.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c02.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c02.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5bb21aba7b178cf0810ddc19b5850ae61affb2f7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c02.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign_atomic )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c03.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c03.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f112ea326aec41dededeb0d4180a0583897509d
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c03.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign_randomaccess )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c04.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c04.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fdcc3b90fa11f49720a67e433da2a5fa1c54b2e3
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c04.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c05.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c05.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4df5e8508b7a404d868f764b41b2a36709788314
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c05.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d_atomic )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c06.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c06.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1e715eccf6cae574cbd9a2c83effc35adabc58ab
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c06.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d_randomaccess )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c07.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c07.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5189fc7d86acef945910b10a3826c9e2b1a5492
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c07.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c08.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c08.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..96ae5d7374d8f87083027062ddd8e7daf23a5bc0
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c08.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c09.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c09.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f4d716540a59ed45e0d15f0ffd04c94b5ca13c0e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c09.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c10.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c10.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e393e5d4d380eab7833f1ad693b291b06556c7fd
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c10.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c11.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e9ea9a601067d9f7d2faf632efd56743dbadc64e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c11.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c12.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c12.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..01b57903cf87883a236f84ca6573094bea40cb64
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c12.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c13.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c13.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f50d28b251dc18816fba30c6efde6437f4c2e559
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c13.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset )
+{
+  TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..53e8573ea841e110916eac5c41cf9d8e935a6b04
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp
@@ -0,0 +1,13 @@
+#include <serial/TestSerial_SubView_c01.cpp>
+#include <serial/TestSerial_SubView_c02.cpp>
+#include <serial/TestSerial_SubView_c03.cpp>
+#include <serial/TestSerial_SubView_c04.cpp>
+#include <serial/TestSerial_SubView_c05.cpp>
+#include <serial/TestSerial_SubView_c06.cpp>
+#include <serial/TestSerial_SubView_c07.cpp>
+#include <serial/TestSerial_SubView_c08.cpp>
+#include <serial/TestSerial_SubView_c09.cpp>
+#include <serial/TestSerial_SubView_c10.cpp>
+#include <serial/TestSerial_SubView_c11.cpp>
+#include <serial/TestSerial_SubView_c12.cpp>
+#include <serial/TestSerial_SubView_c13.cpp>
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Task.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_Task.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d2d748d942a596cb8542100f691230a2ee7da6ab
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_Task.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestTaskScheduler.hpp>
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_Team.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_Team.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..619cb727ac9e94e257c5bdc8298e34e136295bf2
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_Team.cpp
@@ -0,0 +1,75 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, team_for )
+{
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 2 );
+
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+}
+
+
+TEST_F( TEST_CATEGORY, team_reduce )
+{
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
+}
+} // namespace Test
+
+#include <TestTeamVector.hpp>
+
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_TeamReductionScan.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_TeamReductionScan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e626beaeb6158e684055b437746df0658abb7872
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_TeamReductionScan.cpp
@@ -0,0 +1,81 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+
+TEST_F( TEST_CATEGORY, team_scan )
+{
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+}
+
+TEST_F( TEST_CATEGORY, team_long_reduce )
+{
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+TEST_F( TEST_CATEGORY, team_double_reduce )
+{
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+} // namespace Test
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_TeamScratch.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_TeamScratch.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..963908c92d76b63904128cb7b1bb355ed708aecc
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_TeamScratch.cpp
@@ -0,0 +1,83 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, team_shared_request )
+{
+  TestSharedTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+TEST_F( TEST_CATEGORY, team_scratch_request )
+{
+  TestScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+TEST_F( TEST_CATEGORY, team_lambda_shared_request )
+{
+  TestLambdaSharedTeam< Kokkos::HostSpace, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+#endif
+#endif
+
+TEST_F( TEST_CATEGORY, shmem_size )
+{
+  TestShmemSize< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, multi_level_scratch )
+{
+  TestMultiLevelScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+} // namespace Test
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_ViewAPI_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c80b897ab6f864fd34efc5c7ac43d79c7903cf19
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_ViewAPI_b.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewAPI.hpp>
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_ViewMapping_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f9f89d714c2c247fac2228faf39d7f8f39dbe834
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_ViewMapping_a.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewMapping_a.hpp>
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_ViewMapping_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..69024441ac6c8f06f9de23ba027687b5d00c0cf5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_ViewMapping_b.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewMapping_b.hpp>
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_ViewMapping_subview.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d7fb364291a95e831bbe1127b43e041e849e550c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_ViewMapping_subview.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewMapping_subview.hpp>
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_ViewOfClass.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_ViewOfClass.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d3e10238df2bbbd8e4ebc0c6fb7740930deba769
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_ViewOfClass.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestViewOfClass.hpp>
+
diff --git a/packages/kokkos/core/unit_test/serial/TestSerial_WorkGraph.cpp b/packages/kokkos/core/unit_test/serial/TestSerial_WorkGraph.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..80925cb9dfa5e7fde94a55b0bb06cb57de2989c6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/serial/TestSerial_WorkGraph.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <serial/TestSerial_Category.hpp>
+#include <TestWorkGraph.hpp>
diff --git a/packages/kokkos/core/unit_test/testmake.sh b/packages/kokkos/core/unit_test/testmake.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b5d4e8874d6bbd632bb7875bb931935018671195
--- /dev/null
+++ b/packages/kokkos/core/unit_test/testmake.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+if test "$(basename "$PWD")" = "cmaketest"; then
+  outfile=$1
+else
+  outfile=config/tmpstore/$1
+fi
+
+# Pass when the generated file contains both the requested KOKKOS_ARCH ($2) and KOKKOS_DEVICES ($3).
+grep_arch=$(grep KOKKOS_ARCH    "$outfile" | grep "$2" 2>&1)
+grep_devs=$(grep KOKKOS_DEVICES "$outfile" | grep "$3" 2>&1)
+if test -n "$grep_arch"; then
+  if test -n "$grep_devs"; then
+    echo Passed
+  else
+    echo Failed
+  fi
+else
+  echo Failed
+fi
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads.hpp b/packages/kokkos/core/unit_test/threads/TestThreads.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..42279ee1cf53d8076d77db882c2eb83f79518f27
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads.hpp
@@ -0,0 +1,111 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_THREADS_HPP
+#define KOKKOS_TEST_THREADS_HPP
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_LAMBDA
+#undef KOKKOS_LAMBDA
+#endif
+#define KOKKOS_LAMBDA [=]
+
+#include <Kokkos_Core.hpp>
+
+#include <TestTile.hpp>
+//#include <TestSharedAlloc.hpp>
+//#include <TestViewAPI.hpp>
+//#include <TestViewOfClass.hpp>
+//#include <TestViewSubview.hpp>
+//#include <TestAtomic.hpp>
+//#include <TestAtomicOperations.hpp>
+//#include <TestAtomicViews.hpp>
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+//#include <TestReduce.hpp>
+//#include <TestScan.hpp>
+//#include <TestAggregate.hpp>
+//#include <TestCompilerMacros.hpp>
+
+//TODO enable task scheduler tests for threads
+//#include <TestTaskScheduler.hpp>
+
+//#include <TestMemoryPool.hpp>
+//#include <TestCXX11.hpp>
+//#include <TestCXX11Deduction.hpp>
+#include <TestTeamVector.hpp>
+//#include <TestTemplateMetaFunctions.hpp>
+//#include <TestPolicyConstruction.hpp>
+//#include <TestMDRange.hpp>
+
+namespace Test {
+
+class threads : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+    const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+    const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+    // Launch at least two worker threads, scaled by the detected NUMA/core/thread topology.
+    const unsigned threads_count =
+        std::max( 1u, numa_count )
+      * std::max( 2u, cores_per_numa * threads_per_core );
+
+    Kokkos::Threads::initialize( threads_count );
+    Kokkos::print_configuration( std::cout, true /* detailed */ );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::Threads::finalize();
+  }
+};
+
+} // namespace Test
+
+#endif
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_AtomicOperations.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_AtomicOperations.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0d9804cb9027d7f6afa4ceaf358f7a5b2f7f0c12
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_AtomicOperations.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestAtomicOperations.hpp>
+
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_AtomicViews.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_AtomicViews.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d71baf8fc2c129d54b034b7e62f5db859b8a69cc
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_AtomicViews.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestAtomicViews.hpp>
+
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_Atomics.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_Atomics.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..350a8d90fd995e67d424f311dcc7c1176b8c8bd5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_Atomics.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestAtomic.hpp>
+
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_Category.hpp b/packages/kokkos/core/unit_test/threads/TestThreads_Category.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c4d0ed6da2958ed239e96ed4d5c64369d85e4fab
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_Category.hpp
@@ -0,0 +1,65 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_THREADS_CATEGORY_HPP
+#define KOKKOS_TEST_THREADS_CATEGORY_HPP
+
+#include <gtest/gtest.h>
+
+namespace Test {
+
+class threads : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {
+  }
+
+  static void TearDownTestCase() {
+  }
+};
+
+} // namespace Test
+
+#define TEST_CATEGORY threads
+#define TEST_EXECSPACE Kokkos::Threads
+
+#endif
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_Complex.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_Complex.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a19aaea9ea6b3238410932201b7b919bce157f95
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_Complex.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestComplex.hpp>
+
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_Crs.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_Crs.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f5a1de15fabd565f04d317dbb3190ec3d30ca858
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_Crs.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestCrs.hpp>
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_Init.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_Init.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..39ded8fae0acb59f8a9da29a3f430e7fdc2bf345
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_Init.cpp
@@ -0,0 +1,50 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestInit.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestPolicyConstruction.hpp>
+
+
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_MDRange.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_MDRange.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..79a134d5dd11144e667aee1e0e51f367de8d4232
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_MDRange.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestMDRange.hpp>
+
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_Other.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_Other.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a0c8b4159735bbd90d6fd896a7d3f5c18efde241
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_Other.cpp
@@ -0,0 +1,52 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+#include <TestAggregate.hpp>
+#include <TestMemoryPool.hpp>
+#include <TestCXX11.hpp>
+#include <TestTile.hpp>
+
+#include <TestViewCtorPropEmbeddedDim.hpp>
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_RangePolicy.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_RangePolicy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7404cae78688ebd0277f9462c46a019872d690e7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_RangePolicy.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestRange.hpp>
+
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_Reductions.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_Reductions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..33788b9ca5e443d7c0f97d27f1eb7417e43ab4fc
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_Reductions.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestReduce.hpp>
+#include <TestCXX11Deduction.hpp>
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_Scan.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_Scan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d9583e9658453e62135a9befe0ff8ea820c2a3e
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_Scan.cpp
@@ -0,0 +1,47 @@
+
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestScan.hpp>
+
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SharedAlloc.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SharedAlloc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a028728ecb85d27b6b9ff331620509dfce11bcaa
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SharedAlloc.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestSharedAlloc.hpp>
+
+namespace Test {
+
+
+TEST_F( TEST_CATEGORY, impl_shared_alloc )
+{
+  test_shared_alloc< Kokkos::HostSpace, TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SubView_a.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cccf334fc4fafd7b66588ac627f334716a3572d9
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_a.cpp
@@ -0,0 +1,104 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_left )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_right )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_auto_1d_stride )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_assign_strided )
+{
+  TestViewSubview::test_1d_strided_assignment< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_0 )
+{
+  TestViewSubview::test_left_0< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_1 )
+{
+  TestViewSubview::test_left_1< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_2 )
+{
+  TestViewSubview::test_left_2< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_left_3 )
+{
+  TestViewSubview::test_left_3< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_0 )
+{
+  TestViewSubview::test_right_0< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_1 )
+{
+  TestViewSubview::test_right_1< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_right_3 )
+{
+  TestViewSubview::test_right_3< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SubView_b.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab1aa3df4fecb19bc6f962f2f8f79012292097b6
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_b.cpp
@@ -0,0 +1,63 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_layoutleft_to_layoutleft )
+{
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE >();
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+TEST_F( TEST_CATEGORY, view_subview_layoutright_to_layoutright )
+{
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE >();
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c01.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c01.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3000d4c717f4b0519ab30563d3546b75753713be
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c01.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c02.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c02.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..84f1907576e0064ffc77f6082c44ee8eff145efc
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c02.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign_atomic )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c03.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c03.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8076ecf1f93b8aebefcc58379bc26a1fe46129e5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c03.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_1d_assign_randomaccess )
+{
+  TestViewSubview::test_1d_assign< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c04.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c04.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bd354fa1f247bb8090618cd56802abb630a972c9
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c04.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c05.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c05.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..88286df275e8d424f952748f57119ccc2ebf42e5
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c05.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d_atomic )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c06.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c06.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..12c3bbcbde9cedc6e3aaaacd9b05119167f74b25
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c06.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_2d_from_3d_randomaccess )
+{
+  TestViewSubview::test_2d_subview_3d< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c07.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c07.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..17a0804dd67611af9c88d99dda499f346157e8b7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c07.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c08.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c08.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..76b9dc39944e0ad0286b193fa4ebb68d13ed6a1a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c08.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c09.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c09.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..efa1f516cb9817f58aeaa1128ddc10c59aa24428
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c09.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_left_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_left< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c10.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c10.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..90f56d257314576868d7ab2821574fcac68174fa
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c10.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c11.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..35f4994ba2e502fd7ac3d957989bd5ce832a6e8f
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c11.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c12.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c12.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..05b5dc2cc59c1e7800405106814dc2fd16fdbb3f
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c12.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_subview_3d_from_5d_right_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_right< TEST_EXECSPACE, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c13.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c13.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..50cd7cecd9ba660e5a59b738a7f3904a90c079d9
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_SubView_c13.cpp
@@ -0,0 +1,54 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewSubview.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset )
+{
+  TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >();
+}
+
+} // namespace Test
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_Team.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_Team.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b87c1f77d3c60384ec5f067897ee77ea75e86244
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_Team.cpp
@@ -0,0 +1,75 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, team_for )
+{
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 2 );
+
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+}
+
+
+TEST_F( TEST_CATEGORY, team_reduce )
+{
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 2 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
+}
+} // namespace Test
+
+#include <TestTeamVector.hpp>
+
+
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_TeamReductionScan.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_TeamReductionScan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1df976f10dfb4ddab0c493ae4a6fad84e338ac1c
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_TeamReductionScan.cpp
@@ -0,0 +1,81 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+
+TEST_F( TEST_CATEGORY, team_scan )
+{
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+}
+
+TEST_F( TEST_CATEGORY, team_long_reduce )
+{
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+TEST_F( TEST_CATEGORY, team_double_reduce )
+{
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+} // namespace Test
+
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_TeamScratch.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_TeamScratch.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c07fae77c3964f59331518cfb54c1f16561e6958
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_TeamScratch.cpp
@@ -0,0 +1,83 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestTeam.hpp>
+
+namespace Test {
+
+TEST_F( TEST_CATEGORY, team_shared_request )
+{
+  TestSharedTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+TEST_F( TEST_CATEGORY, team_scratch_request )
+{
+  TestScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
+TEST_F( TEST_CATEGORY, team_lambda_shared_request )
+{
+  TestLambdaSharedTeam< Kokkos::HostSpace, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+#endif
+#endif
+
+TEST_F( TEST_CATEGORY, shmem_size )
+{
+  TestShmemSize< TEST_EXECSPACE >();
+}
+
+TEST_F( TEST_CATEGORY, multi_level_scratch )
+{
+  TestMultiLevelScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+} // namespace Test
+
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_ViewAPI_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a8d5a6f5ff2af38cbb6049b3876d14d96f901415
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_ViewAPI_b.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewAPI.hpp>
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_ViewMapping_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d67f376fc609aa24d5d5ed8526bdafb83d033f19
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_ViewMapping_a.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewMapping_a.hpp>
+
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_ViewMapping_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..49c7f6d3d75714bab5689a9b44ea39a82186aa82
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_ViewMapping_b.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewMapping_b.hpp>
+
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_ViewMapping_subview.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6c4f14873e040eebf0378c04597652c0589364e7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_ViewMapping_subview.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewMapping_subview.hpp>
+
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_ViewOfClass.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_ViewOfClass.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d9a9c17feaec5af5bce08b3e1eed40ee6e92f102
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_ViewOfClass.cpp
@@ -0,0 +1,46 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestViewOfClass.hpp>
+
diff --git a/packages/kokkos/core/unit_test/threads/TestThreads_WorkGraph.cpp b/packages/kokkos/core/unit_test/threads/TestThreads_WorkGraph.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e7801d1179c70b6ba5c760fcfac68fbbc6131a07
--- /dev/null
+++ b/packages/kokkos/core/unit_test/threads/TestThreads_WorkGraph.cpp
@@ -0,0 +1,45 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <threads/TestThreads_Category.hpp>
+#include <TestWorkGraph.hpp>
diff --git a/packages/kokkos/doc/Doxyfile b/packages/kokkos/doc/Doxyfile
new file mode 100644
index 0000000000000000000000000000000000000000..bc5c7486b27fc55ede35359b969af0a8008f960b
--- /dev/null
+++ b/packages/kokkos/doc/Doxyfile
@@ -0,0 +1,127 @@
+#
+# Include the global look and feel options
+#
+@INCLUDE               = ../../common/Doxyfile
+#
+# Package options
+#
+PROJECT_NAME           = "Kokkos Core Kernels Package"
+PROJECT_NUMBER         = "Version of the Day"
+OUTPUT_DIRECTORY       = .
+OUTPUT_LANGUAGE        = English
+
+EXTRACT_ALL            = NO
+EXTRACT_PRIVATE        = NO
+EXTRACT_STATIC         = YES
+HIDE_UNDOC_MEMBERS     = YES
+HIDE_UNDOC_CLASSES     = YES
+BRIEF_MEMBER_DESC      = YES
+REPEAT_BRIEF           = YES
+ALWAYS_DETAILED_SEC    = YES
+FULL_PATH_NAMES        = NO
+STRIP_FROM_PATH        = 
+INTERNAL_DOCS          = NO
+CLASS_DIAGRAMS         = YES
+SOURCE_BROWSER         = YES
+INLINE_SOURCES         = NO
+STRIP_CODE_COMMENTS    = YES
+REFERENCED_BY_RELATION = NO
+REFERENCES_RELATION    = NO
+CASE_SENSE_NAMES       = YES
+HIDE_SCOPE_NAMES       = NO
+VERBATIM_HEADERS       = YES
+SHOW_INCLUDE_FILES     = YES
+#JAVADOC_AUTOBRIEF      = YES
+INHERIT_DOCS           = YES
+INLINE_INHERITED_MEMB  = YES
+INLINE_INFO            = YES
+SORT_MEMBER_DOCS       = NO
+TAB_SIZE               = 2
+ENABLED_SECTIONS       = 
+SORT_BRIEF_DOCS        = NO
+GENERATE_TODOLIST      = YES
+GENERATE_TESTLIST      = YES
+QUIET                  = NO
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = YES
+WARN_FORMAT            = "$file:$line: $text"
+
+#
+# INPUT: Where to find files that Doxygen should process.  ../classic
+# has a doc/ subdirectory with its own Doxyfile that points to its own
+# files.  The other Kokkos subpackages don't currently have their own
+# Doxyfile files, so we have to do it manually here.
+#
+# mfh 26 Sep 2013: I've only added those directories in the Core
+# subpackage that constitute the "public interface" of that
+# subpackage.  Please feel free to include additional subdirectories
+# of ../core if you want to generate their documentation as well.
+#
+# mfh 26 Sep 2013: I've only added the Kokkos subpackages here that I
+# think are ready for Doxygen documentation generation.  Please feel
+# free to amend this list as you see fit.
+#
+
+INPUT                  = index.doc ../classic ../core/src ../containers/src ../linalg/src
+FILE_PATTERNS          = *.hpp *.cpp *.cuh *.cu
+RECURSIVE              = NO
+EXCLUDE_PATTERNS       = *.x *.o *.out
+EXAMPLE_PATH           = 
+EXAMPLE_RECURSIVE       = YES
+EXAMPLE_PATTERNS       = *.cpp *.hpp
+IMAGE_PATH             = 
+INPUT_FILTER           = 
+ALPHABETICAL_INDEX     = YES
+COLS_IN_ALPHA_INDEX    = 4
+IGNORE_PREFIX          = 
+#
+# What diagrams are created
+#
+CLASS_GRAPH            = YES
+COLLABORATION_GRAPH    = NO
+INCLUDE_GRAPH          = NO
+INCLUDED_BY_GRAPH      = NO
+GRAPHICAL_HIERARCHY    = YES
+#
+# Preprocessing
+#
+ENABLE_PREPROCESSING   = YES
+MACRO_EXPANSION        = YES
+EXPAND_ONLY_PREDEF     = YES
+SEARCH_INCLUDES        = YES
+INCLUDE_FILE_PATTERNS  = 
+PREDEFINED             = DOXYGEN_SHOULD_SKIP_THIS DOXYGEN_USE_ONLY
+INCLUDE_PATH           = ../src
+EXPAND_AS_DEFINED      = 
+#
+# Links to other packages
+#
+TAGFILES               = ../../common/tag_files/teuchos.tag=../../../teuchos/doc/html ../../common/tag_files/epetra.tag=../../../epetra/doc/html \
+                         ../../common/tag_files/belos.tag=../../../belos/doc/html ../../common/tag_files/anasazi.tag=../../../anasazi/doc/html \
+                         ../../common/tag_files/kokkos.tag=../../../kokkos/doc/html 
+GENERATE_TAGFILE       = ../../common/tag_files/tpetra.tag
+ALLEXTERNALS           = NO
+EXTERNAL_GROUPS        = NO
+#
+# Environment
+#
+PERL_PATH              = /usr/bin/perl
+HAVE_DOT               = YES
+DOT_PATH               = 
+MAX_DOT_GRAPH_WIDTH    = 1024
+MAX_DOT_GRAPH_HEIGHT   = 1024
+#
+# What kind of documentation is generated
+#
+#GENERATE_HTML          = YES
+#HTML_OUTPUT            = html
+#HTML_HEADER            = includes/header.html
+#HTML_FOOTER            = includes/footer.html
+#HTML_STYLESHEET        = includes/stylesheet.css
+#HTML_ALIGN_MEMBERS     = YES
+GENERATE_HTMLHELP      = NO
+DISABLE_INDEX          = NO
+GENERATE_LATEX         = NO
+GENERATE_RTF           = NO
+GENERATE_MAN           = NO
+GENERATE_XML           = NO
diff --git a/packages/kokkos/doc/Kokkos_PG.pdf b/packages/kokkos/doc/Kokkos_PG.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..3c415698c0d9fec315f317b71db19f2a019b6f6e
Binary files /dev/null and b/packages/kokkos/doc/Kokkos_PG.pdf differ
diff --git a/packages/kokkos/doc/SAND2017-10464-Kokkos-Task-DAG.pdf b/packages/kokkos/doc/SAND2017-10464-Kokkos-Task-DAG.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..571ebff401044a4f4c2d32c2b948280e25f745b5
Binary files /dev/null and b/packages/kokkos/doc/SAND2017-10464-Kokkos-Task-DAG.pdf differ
diff --git a/packages/kokkos/doc/build_docs b/packages/kokkos/doc/build_docs
new file mode 100755
index 0000000000000000000000000000000000000000..da1d3e4f6e061804b1fb2fe21b356b691494df5d
--- /dev/null
+++ b/packages/kokkos/doc/build_docs
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+if [ -n "${TRILINOS_HOME}" ]; then
+  echo "TRILINOS_HOME has already been set!"
+else
+  echo "TRILINOS_HOME has not been set.  Setting it!"
+  export TRILINOS_HOME=`pwd`/../../..
+fi
+
+echo
+echo "Generating main Kokkos doxygen documentation ..."
+echo
+
+doxygen Doxyfile
+
diff --git a/packages/kokkos/doc/design_notes_space_instances.md b/packages/kokkos/doc/design_notes_space_instances.md
new file mode 100644
index 0000000000000000000000000000000000000000..0124dfbc873285255fa92ff171dc5873056495ab
--- /dev/null
+++ b/packages/kokkos/doc/design_notes_space_instances.md
@@ -0,0 +1,131 @@
+# Design Notes for Execution and Memory Space Instances
+
+## Objective
+
+ * Enable Kokkos interoperability with coarse-grain tasking models
+ 
+## Requirements
+
+ * Backwards compatible with existing Kokkos API
+ * Support existing Host execution spaces (Serial, Threads, OpenMP, maybe Qthreads)
+ * Support DARMA threading model (may require a new Host execution space)
+ * Support Uintah threading model, i.e. independent worker thread pools working off of shared task queues
+ 
+  
+## Execution Space
+
+  * Parallel work is *dispatched* on an execution space instance
+  
+  * Execution space instances are conceptually disjoint/independent from each other
+  
+
+## Host Execution Space Instances
+
+  *  A host-side *control* thread dispatches work to an instance
+
+  * `main` is the initial control thread
+
+  *  A host execution space instance is an organized thread pool
+
+  *  All instances are disjoint, i.e. hardware resources are not shared between instances
+
+  *  Exactly one control thread is associated with
+     an instance and only that control thread may
+     dispatch work to that instance
+
+  *  The control thread is a member of the instance
+
+  *  The pool of threads associated with an instance is not mutable during that instance's existence
+
+  *  The pool of threads associated with an instance may be masked
+
+    -  Allows work to be dispatched to a subset of the pool
+
+    -  Example: only one hyperthread per core of the instance
+
+    -  A mask can be applied during the policy creation of a parallel algorithm
+ 
+    -  Masking is portable by defining it as the ceiling of a fraction in [0.0, 1.0]
+       of the available resources (e.g., a fraction of 0.5 on a 16-thread instance
+       makes ceil(0.5 * 16) = 8 threads available)
+
+```
+class ExecutionSpace {
+public:
+  using execution_space = ExecutionSpace;
+  using memory_space = ...;
+  using device_type = Kokkos::Device<execution_space, memory_space>;
+  using array_layout = ...;
+  using size_type = ...;
+  using scratch_memory_space = ...;
+  
+  
+  class Instance
+  {
+    int thread_pool_size( int depth = 0 );
+    ...
+  };
+  
+  class InstanceRequest
+  {
+  public:
+    using Control = std::function< void( Instance * )>;
+    
+    InstanceRequest( Control control
+                   , unsigned thread_count
+                   , unsigned use_numa_count = 0
+                   , unsigned use_cores_per_numa = 0
+                   );    
+  
+  };
+  
+  static bool in_parallel();
+  
+  static bool sleep();
+  static bool wake();
+  
+  static void fence();
+  
+  static void print_configuration( std::ostream &, const bool detailed = false );
+  
+  static void initialize( unsigned thread_count = 0
+                        , unsigned use_numa_count = 0
+                        , unsigned use_cores_per_numa = 0
+                        );
+  
+  // Partition the current instance into the requested instances
+  // and run the given functions on the corresponding instances.
+  // Blocks until all the partitioned instances complete, after which
+  // the original instance is restored.
+  //
+  // Requires that the space has already been initialized.
+  // Requires that the request can be satisfied by the current instance,
+  //   i.e. the sum of the number of requested threads must be less than
+  //   max_hardware_threads.
+  //
+  // Each control functor will accept a handle to its new default instance.
+  // Each instance must be independent of all other instances,
+  //   i.e. no assumption on scheduling between instances.
+  // The user is responsible for checking the return code for errors.
+  static int run_instances( std::vector< InstanceRequest> const& requests );
+  
+  static void finalize();
+
+  static int is_initialized();
+  
+  static int concurrency();
+  
+  static int thread_pool_size( int depth = 0 );
+  
+  static int thread_pool_rank();
+  
+  static int max_hardware_threads();
+  
+  static int hardware_thread_id();
+                        
+ };
+
+```
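+
+A hypothetical use of the `run_instances` interface sketched above (these names
+follow the sketch and are not an existing Kokkos API) might look like:
+
+```
+// Partition the current instance into two independent worker pools and run
+// a control function on each; illustrative only.
+std::vector< ExecutionSpace::InstanceRequest > requests;
+
+requests.push_back( ExecutionSpace::InstanceRequest(
+    []( ExecutionSpace::Instance * instance ) {
+      /* dispatch work for the first partition through 'instance' */
+    }
+  , 8 /* thread_count */ ) );
+
+requests.push_back( ExecutionSpace::InstanceRequest(
+    []( ExecutionSpace::Instance * instance ) {
+      /* independent work for the second partition */
+    }
+  , 8 /* thread_count */ ) );
+
+// Blocks until both partitions complete; returns a nonzero code on error.
+const int err = ExecutionSpace::run_instances( requests );
+```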
+ 
+
+
+
diff --git a/packages/kokkos/doc/develop_builds.md b/packages/kokkos/doc/develop_builds.md
new file mode 100644
index 0000000000000000000000000000000000000000..9a211fa77644ebbfcdef414e10b2afd02afc5940
--- /dev/null
+++ b/packages/kokkos/doc/develop_builds.md
@@ -0,0 +1,76 @@
+
+# Places to build options: architecture, device, advanced options, cuda options
+
+These are the files that need to be updated when a new architecture or device is
+added:
+
+  + generate_makefile.bash
+      * Interface for makefile system
+  + cmake/kokkos_options.cmake
+      * Interface for cmake system
+  + Makefile.kokkos
+      * Main logic for build (make and cmake) and defines (KokkosCore_config.h)
+  + core/unit_test/UnitTestConfig.make
+      * Unit test for Makefile.kokkos
+
+In general, an architecture is going to be from one of these platforms:
+  + AMD
+  + ARM
+  + IBM
+  + Intel
+  + Intel Xeon Phi
+  + NVIDIA
+Although not strictly necessary, it is helpful to keep things organized by
+grouping by platform.
+
+### generate_makefile.bash
+
+The bash code does not do any error checking on the `--arch=` or `--device=`
+arguments; thus, strictly speaking, you do not *need* to do anything to add a
+device or architecture. However, you should add it to the help menu.  For the
+architectures, please group them by one of the platforms listed above.
+
+
+### cmake/kokkos_options.cmake and cmake/kokkos_settings.cmake
+
+The options for the CMake build system are: `-DKOKKOS_HOST_ARCH:STRING=` and
+`-DKOKKOS_ENABLE_<device>:BOOL=`.  Although any string can be passed into the
+KOKKOS_HOST_ARCH option, it is checked against an accepted list.  Likewise, a
+KOKKOS_ENABLE_<device> option must be added explicitly, and it is formed using
+the list. Thus:
+  + A new architecture should be added to the KOKKOS_HOST_ARCH_LIST variable.
+  + A new device should be added to the KOKKOS_DEVICES_LIST variable **AND** a
+    KOKKOS_ENABLE_<newdevice> option specified (see KOKKOS_ENABLE_CUDA for
+    example).
+
+The translation from option to the `KOKKOS_SETTINGS` is done in
+`kokkos_settings.cmake`.  This translation is automated for some types if you add
+to the list, but for others, it may need to be hand-coded.
+
+
+### Makefile.kokkos
+
+This is the main logic used by both the make and cmake systems for defining
+the sources (makefile and cmake snippets generated by `core/src/Makefile`), for
+setting the defines in KokkosCore_config.h, and defining various internal
+variables.  To understand how to add to this file, you should work closely with
+the Kokkos development team.
+
+
+### core/unit_test/UnitTestConfig.make
+
+This file is used to check the build system in a platform-independent way.  It
+works by looping over available architectures and devices; thus, you should add
+your new architecture to KOKKOS_ARCH_OPTIONS and your new device to
+KOKKOS_DEVICE_OPTIONS to be tested.  The build system tests work by grepping the
+generated build files (automatically).  The header file tests work by diffing
+the generated file with results that are stored in
+`core/unit_tests/config/results` (namespaced by ARCH_DEVICE_).  Thus, you will
+need to add accepted results to this directory for diffing.
+
+The CMake build system is also tested in `core/unit_tests/config/cmaketest`.
+Because it uses cmake/kokkos_options.cmake, it already has the tests to loop
+over.  It is diffed with the same files that the build system is tested with.
+Thus, if you are consistent in all of the files listed, the unit tests should
+pass automatically.
diff --git a/packages/kokkos/doc/hardware_identification/query_cuda_arch.cpp b/packages/kokkos/doc/hardware_identification/query_cuda_arch.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..383f04e34e08949142e33ede90a6e294642c1ba8
--- /dev/null
+++ b/packages/kokkos/doc/hardware_identification/query_cuda_arch.cpp
@@ -0,0 +1,24 @@
+#include <cstdio>
+#include <cuda_runtime_api.h>
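+// Query the compute capability of CUDA device 0 and print its architecture
+// family name immediately followed by the numeric compute capability
+// (e.g. "Kepler35").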
+int main()
+{
+  cudaDeviceProp prop;
+  const cudaError_t err_code = cudaGetDeviceProperties(&prop, 0);
+  if (cudaSuccess != err_code) {
+    fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err_code));
+    return -1;
+  }
+  switch (prop.major) {
+    case 3:
+      printf("Kepler"); break;
+    case 5:
+      printf("Maxwell"); break;
+    case 6:
+      printf("Pascal"); break;
+    default:
+      fprintf(stderr, "Unsupported device %d%d\n", (int)prop.major, (int)prop.minor);
+      return -1;
+  }
+  printf("%d%d\n", (int)prop.major, (int)prop.minor);
+  return 0;
+}
diff --git a/packages/kokkos/doc/index.doc b/packages/kokkos/doc/index.doc
new file mode 100644
index 0000000000000000000000000000000000000000..27a9e4f2e7b90e11bbcde7309e9bf1544e3b386f
--- /dev/null
+++ b/packages/kokkos/doc/index.doc
@@ -0,0 +1,72 @@
+/*! 
+\mainpage Trilinos/Kokkos: Shared-memory programming interface and computational kernels
+
+\section Kokkos_Intro Introduction
+
+The %Kokkos package has two main components.  The first, sometimes
+called "%Kokkos Array" or just "%Kokkos," implements a
+performance-portable shared-memory parallel programming model and data
+containers.  The second, called "%Kokkos Classic," consists of
+computational kernels that support the %Tpetra package.
+
+\section Kokkos_Kokkos The %Kokkos programming model
+
+%Kokkos implements a performance-portable shared-memory parallel
+programming model and data containers.  It lets you write an algorithm
+once, and just change a template parameter to get the optimal data
+layout for your hardware.  %Kokkos has back-ends for the following
+parallel programming models:
+
+- Kokkos::Threads: POSIX Threads (Pthreads)
+- Kokkos::OpenMP: OpenMP
+- Kokkos::Cuda: NVIDIA's CUDA programming model for graphics
+  processing units (GPUs)
+- Kokkos::Serial: No thread parallelism
+
+%Kokkos also has optimizations for shared-memory parallel systems with
+nonuniform memory access (NUMA).  Its containers can hold data of any
+primitive ("plain old") data type (and some aggregate types).  %Kokkos
+Array may be used as a stand-alone programming model.
+
+%Kokkos' parallel operations include the following:
+
+- parallel_for: a thread-parallel "for loop"
+- parallel_reduce: a thread-parallel reduction
+- parallel_scan: a thread-parallel prefix scan operation
+
+as well as expert-level platform-independent interfaces to thread
+"teams," per-team "shared memory," synchronization, and atomic update
+operations.
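+
+As a minimal illustration (the same counting pattern used by the cmake_build
+example shipped with this package), a parallel_reduce that counts the even
+integers below \c n looks like this (after Kokkos::initialize has been called):
+
+\code
+  const long n = 1000000;
+  long count = 0;
+  // Count the even integers in [0, n) with a thread-parallel reduction.
+  Kokkos::parallel_reduce(n, KOKKOS_LAMBDA (const long i, long& lcount) {
+    lcount += (i % 2) == 0;
+  }, count);
+\endcode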
+
+%Kokkos' data containers include the following:
+
+- Kokkos::View: A multidimensional array suitable for thread-parallel
+  operations.  Its layout (e.g., row-major or column-major) is
+  optimized by default for the particular thread-parallel device.
+- Kokkos::Vector: A drop-in replacement for std::vector that eases
+  porting from standard sequential C++ data structures to %Kokkos'
+  parallel data structures.
+- Kokkos::UnorderedMap: A parallel lookup table comparable in
+  functionality to std::unordered_map.
+
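+For example, a Kokkos::View with a run-time extent can be allocated with a
+label and filled in parallel (a minimal sketch):
+
+\code
+  const int n = 100;
+  Kokkos::View<double*> a("A", n);
+  Kokkos::parallel_for(n, KOKKOS_LAMBDA (const int i) {
+    a(i) = static_cast<double>(i);
+  });
+\endcode
+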
+%Kokkos also uses the above basic containers to implement higher-level
+data structures, like sparse graphs and matrices.
+
+A good place to start learning about %Kokkos would be <a href="http://trilinos.sandia.gov/events/trilinos_user_group_2013/presentations/2013-11-TUG-Kokkos-Tutorial.pdf">these tutorial slides</a> from the 2013 Trilinos Users' Group meeting.
+
+\section Kokkos_Classic %Kokkos Classic
+
+"%Kokkos Classic" consists of computational kernels that support the
+%Tpetra package.  These kernels include sparse matrix-vector multiply,
+sparse triangular solve, Gauss-Seidel, and dense vector operations.
+They are templated on the type of objects (\c Scalar) on which they
+operate.  This component was not meant to be visible to users; it is
+an implementation detail of the %Tpetra distributed linear algebra
+package.  
+
+%Kokkos Classic also implements a shared-memory parallel programming
+model.  This inspired and preceded the %Kokkos programming model
+described in the previous section.  Users should consider the %Kokkos
+Classic programming model deprecated, and prefer the new %Kokkos
+programming model.
+*/
diff --git a/packages/kokkos/doc/kokkos-promotion.txt b/packages/kokkos/doc/kokkos-promotion.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5f245fa0f177dee1fcfe70b743f6a1e0d0fd36fa
--- /dev/null
+++ b/packages/kokkos/doc/kokkos-promotion.txt
@@ -0,0 +1,140 @@
+Summary:
+
+- Step 1: Testing Kokkos itself using test_all_sandia
+
+- Step 2: Testing of Kokkos integrated into Trilinos (scripts/trilinos-integration/*.sh)
+
+- Step 3: Locally update CHANGELOG, merge into master, edit scripts/master_history.txt
+
+- Step 4: Locally snapshot new master into the corresponding Trilinos branch (develop or temporary), push with checkin-test-sems.sh
+
+- Step 5: Push local Kokkos master to GitHub (need Owner approval)
+
+Steps 1, 2, and 4 include testing that may fail. These failures must be fixed either by pull requests to Kokkos develop, or by creating a new Trilinos branch for parts of Trilinos that must be updated. This is what usually takes the most time.
+
+
+// -------------------------------------------------------------------------------- //
+
+
+Step 1: The following should be repeated on enough machines to cover all
+supported compilers. Those machines are:
+
+    kokkos-dev
+    ??? <- TODO: identify other machines
+
+  1.1. Clone kokkos develop branch (or just switch to it)
+
+         git clone -b develop git@github.com:kokkos/kokkos.git
+         cd kokkos
+
+  1.2. Create a testing directory
+
+         mkdir testing
+         cd testing
+
+  1.3. Run the test_all_sandia script with no options to test all compilers
+
+         nohup ../scripts/test_all_sandia &
+         tail -f nohup.out                   # to watch progress
+
+// -------------------------------------------------------------------------------- //
+
+Step 2:
+  2.1. Build and test Trilinos with 4 different configurations; run the scripts for white and shepard that are provided in kokkos/scripts/trilinos-integration. These scripts load their own modules/environment, so they do not require preparation. You can run all four at the same time; use a separate directory for each.
+
+         mkdir serial
+         cd serial
+         nohup KOKKOS_PATH/scripts/trilinos-integration/shepard_jenkins_run_script_serial_intel &
+
+  2.2. Compare the compile errors and test failures between the updated and pristine versions. There may be compile failures that happen in both, tests that fail in both, and tests that only fail sometimes (thus, rerun tests manually as needed).
+
+// -------------------------------------------------------------------------------- //
+
+Step 3: This step should be run on kokkos-dev
+
+  3.1. If you don't have a GitHub token already, generate one for yourself (this will give you TOKEN):
+
+       https://github.com/settings/tokens
+
+  3.2. Get a clean copy of the Kokkos develop branch
+
+       git clone -b develop git@github.com:kokkos/kokkos.git
+       cd kokkos
+
+  3.3. Generate the initial changelog. Use the most recent tag as OLDTAG (`git tag -l` can show you all tags). The NEWTAG is the new version number, e.g. "2.04.00". RUN THIS OUTSIDE THE KOKKOS SOURCE TREE!
+
+       module load ruby/2.3.1/gcc/5.3.0
+       github_changelog_generator kokkos/kokkos --token TOKEN --no-pull-requests --include-labels 'InDevelop' --enhancement-labels 'enhancement,Feature Request' --future-release 'NEWTAG' --between-tags 'NEWTAG,OLDTAG'
+       cat CHANGELOG.md
+
+  3.4. Manually clean up and commit the change log. Pushing to develop requires Owner permission.
+       (Copy the new section from the generated CHANGELOG.md to KOKKOS_PATH/CHANGELOG.md)
+       (Make desired changes to CHANGELOG.md to enhance clarity (remove issues not noteworthy))
+       (Commit and push the CHANGELOG.md to develop)
+
+  3.5. Merge develop into master. DO NOT FAST-FORWARD THE MERGE!!!!
+
+       (From kokkos directory):
+       git checkout master
+       git merge --no-ff origin/develop
+
+  3.6. Update the tag in kokkos/scripts/master_history.txt
+
+       Tag description: MajorNumber.MinorNumber.WeeksSinceMinorNumberUpdate
+       Tag field widths: #.#.##
+       date description: month:day:year
+       date field widths: ##:##:####
+       master description: SHA1 of previous master commit (use `git log`?)
+       develop description: SHA1 of merged develop branch
+       SHA1 field width: ######## (8 chars)
+
+       # Append to scripts/master_history.txt:
+
+       tag:  2.03.13    date: 07:27:2017    master: da314444    develop: 29ccb58a
+       
+       git commit --amend -a
+
+
+  3.7. Create the new tag:
+
+       git tag -a #.#.##
+
+         (type the following into the tag message (same format as the entry in step 3.6))
+         tag: #.#.##
+         date: mm/dd/yyyy
+         master: sha1
+         develop: sha1
+
+  3.8. DO NOT PUSH YET !!!
+
+
+// -------------------------------------------------------------------------------- //
+
+Step 4: This step can be done on any SEMS machine (e.g. kokkos-dev). Actually, the checkin step requires lots of disk space and RAM. Use ceerws1113 if you have access to it.
+
+  4.1 Clone the Trilinos corresponding branch (or just switch to it)
+
+        git clone -b develop git@github.com:trilinos/Trilinos.git
+        TRILINOS_PATH=$PWD/Trilinos
+
+  4.2 Snapshot Kokkos into Trilinos - this requires python/2.7.9 and that both Trilinos and Kokkos be clean - no untracked or modified files. Run the following outside of the Kokkos and Trilinos source trees.
+
+        module load sems-python/2.7.9
+        python KOKKOS_PATH/scripts/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
+
+  4.3. Run checkin-test to push to trilinos using the CI build modules (gcc/4.9.3)
+
+       cd TRILINOS_PATH
+       mkdir CHECKIN
+       cd CHECKIN
+       nohup ../cmake/std/sems/checkin-test-sems.sh --do-all --push &
+
+  4.4. If there are failures, fix and backtrack. Otherwise, go to the next step.
+
+// -------------------------------------------------------------------------------- //
+
+Step 5: Push Kokkos master to GitHub (requires Owner permission).
+      
+       cd KOKKOS_PATH
+       git push --follow-tags origin master 
+
diff --git a/packages/kokkos/example/CMakeLists.txt b/packages/kokkos/example/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3809cc2ea57a26ba1b3003a2e706fee912ccabc9
--- /dev/null
+++ b/packages/kokkos/example/CMakeLists.txt
@@ -0,0 +1,20 @@
+
+
+# Subpackage name must match what appears in kokkos/cmake/Dependencies.cmake
+#
+TRIBITS_SUBPACKAGE(Example)
+
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(query_device)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(fixture)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(feint)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(fenl)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(multi_fem)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(md_skeleton)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(global_2_local_ids)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(grow_array)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(sort_array)
+if(NOT Kokkos_ENABLE_Cuda)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(tutorial)
+endif()
+TRIBITS_SUBPACKAGE_POSTPROCESS()
+
diff --git a/packages/kokkos/example/README b/packages/kokkos/example/README
new file mode 100644
index 0000000000000000000000000000000000000000..ec64004842b0f254de2f1d67a9cb5c272bf15607
--- /dev/null
+++ b/packages/kokkos/example/README
@@ -0,0 +1,16 @@
+This directory contains example application proxies that use different
+parts of Kokkos.  If you are looking for the FENL ("finite element
+nonlinear" solve) example, it has moved into the LinAlg subpackage of
+Tpetra.
+
+MANIFEST:
+
+  - common:  Header files used by different examples
+  - feint:   Unstructured finite-element method
+  - fixture: Some other finite-element method example
+  - global_2_local_ids: Example of global-to-local index lookup
+  - grow_array:   Parallel dynamic memory allocation
+  - md_skeleton:  Molecular dynamics
+  - query_device: Kokkos' HWLOC wrapper for querying device topology
+  - sort_array:   Parallel sort
+  - tutorial:     Kokkos tutorial (START HERE)
diff --git a/packages/kokkos/example/cmake/Dependencies.cmake b/packages/kokkos/example/cmake/Dependencies.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..ed1ec4c7259ef88fbbc28b9fcbde6d81ae8adf1f
--- /dev/null
+++ b/packages/kokkos/example/cmake/Dependencies.cmake
@@ -0,0 +1,3 @@
+TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
+  TEST_OPTIONAL_TPLS CUSPARSE MKL
+  )
diff --git a/packages/kokkos/example/cmake_build/CMakeLists.txt b/packages/kokkos/example/cmake_build/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8e1aa047279e8cb3477a8ecb65883156d23fb21c
--- /dev/null
+++ b/packages/kokkos/example/cmake_build/CMakeLists.txt
@@ -0,0 +1,44 @@
+# Kokkos requires CMake version 3.1 or higher and that you have the following
+# line with a version of 3.1 or higher as the first line of your project:
+#   cmake_minimum_required(VERSION 3.1)
+#
+# The other CMake commands required to build Kokkos as part of your application
+# are:
+#   add_subdirectory(path/to/kokkos)
+#   target_link_libraries(executable or library)
+#
+# If Kokkos is not a subdirectory of your project, you will also need to pass a
+# binary directory to add_subdirectory().  We had to pass the binary directory
+# for this example for that reason.  Note that target_link_libraries() can be
+# called on a target added by add_executable(), add_library(), or another
+# similar command.
+#
+# All the flags, etc. required to build using the Kokkos library are
+# transitively added to targets which depend on the library.
+#
+# The CMake variables CMAKE_CXX_STANDARD and CMAKE_CXX_EXTENSIONS are
+# respected.  We recommend that you set CMAKE_CXX_EXTENSIONS to OFF.
+# Otherwise, CMake defaults to using extensions for the C++ standard, and the
+# GNU extensions (-std=gnu++11) will be used for compilers that support it
+# instead of standard C++11 (-std=c++11).
+#
+# A bunch of build options are added as variables (all starting with KOKKOS_)
+# to the build.  Check them out using ccmake or the CMake GUI.
+#
+# Building this example:
+#   1. Create a build directory.
+#   2. cd /path/to/build/directory
+#   3. cmake /path/to/example
+#   4. make
+
+cmake_minimum_required(VERSION 3.1)
+project(Example CXX C Fortran)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+
+add_subdirectory(${Example_SOURCE_DIR}/../.. ${Example_BINARY_DIR}/kokkos)
+
+include_directories(${Kokkos_INCLUDE_DIRS_RET})
+
+add_executable(example cmake_example.cpp foo.f)
+target_link_libraries(example kokkos)
diff --git a/packages/kokkos/example/cmake_build/cmake_example.cpp b/packages/kokkos/example/cmake_build/cmake_example.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a73b65abf05239328ac6f6c5b98ce3d8b0b53733
--- /dev/null
+++ b/packages/kokkos/example/cmake_build/cmake_example.cpp
@@ -0,0 +1,91 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+#include <cstdlib>   // strtol, exit
+#include <iostream>  // std::cout for print_configuration
+
+extern "C" void print_fortran_();
+
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc, argv);
+  Kokkos::DefaultExecutionSpace::print_configuration(std::cout);
+
+  if (argc < 2) {
+    fprintf(stderr, "Usage: %s [<kokkos_options>] <size>\n", argv[0]);
+    Kokkos::finalize();
+    exit(1);
+  }
+
+  const long n = strtol(argv[1], NULL, 10);
+
+  printf("Number of even integers from 0 to %ld\n", n - 1);
+
+  Kokkos::Timer timer;
+  timer.reset();
+
+  // Compute the number of even integers from 0 to n-1, in parallel.
+  long count = 0;
+  Kokkos::parallel_reduce(n, KOKKOS_LAMBDA (const long i, long& lcount) {
+    lcount += (i % 2) == 0;
+  }, count);
+
+  double count_time = timer.seconds();
+  printf("  Parallel: %ld    %10.6f\n", count, count_time);
+
+  timer.reset();
+
+  // Compare to a sequential loop.
+  long seq_count = 0;
+  for (long i = 0; i < n; ++i) {
+    seq_count += (i % 2) == 0;
+  }
+
+  count_time = timer.seconds();
+  printf("Sequential: %ld    %10.6f\n", seq_count, count_time);
+
+  print_fortran_();
+
+  Kokkos::finalize();
+
+  return (count == seq_count) ? 0 : -1;
+}
diff --git a/packages/kokkos/example/cmake_build/foo.f b/packages/kokkos/example/cmake_build/foo.f
new file mode 100644
index 0000000000000000000000000000000000000000..e618455283b65602d98a5de00c8dc2abc6b0f8c2
--- /dev/null
+++ b/packages/kokkos/example/cmake_build/foo.f
@@ -0,0 +1,4 @@
+        SUBROUTINE print_fortran()
+          PRINT *, 'Hello World from Fortran'
+          RETURN
+        END
diff --git a/packages/kokkos/example/common/VectorImport.hpp b/packages/kokkos/example/common/VectorImport.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..566f03a2931e02574a0c064bdc60adb3eef4c33d
--- /dev/null
+++ b/packages/kokkos/example/common/VectorImport.hpp
@@ -0,0 +1,294 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VECTORIMPORT_HPP
+#define KOKKOS_VECTORIMPORT_HPP
+
+#include <utility>
+#include <limits>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+
+#include <Kokkos_Core.hpp>
+
+#include <WrapMPI.hpp>
+
+namespace Kokkos {
+namespace Example {
+
+template< class CommMessageType , class CommIdentType , class VectorType >
+struct VectorImport ;
+
+} // namespace Example
+} // namespace Kokkos
+
+#if ! defined( KOKKOS_ENABLE_MPI )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+template< class CommMessageType , class CommIdentType , class VectorType >
+struct VectorImport {
+
+  const MPI_Comm comm ;
+  const unsigned count_owned ;
+  const unsigned count_receive ;
+
+  VectorImport( MPI_Comm arg_comm ,
+                const CommMessageType & ,
+                const CommMessageType & ,
+                const CommIdentType   & ,
+                const unsigned arg_count_owned ,
+                const unsigned arg_count_receive )
+    : comm( arg_comm )
+    , count_owned( arg_count_owned )
+    , count_receive( arg_count_receive )
+    {}
+
+  inline
+  void operator()( const VectorType & ) const {}
+};
+
+
+} // namespace Example
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#else /* defined( KOKKOS_ENABLE_MPI ) */
+
+namespace Kokkos {
+namespace Example {
+
+template< class CommMessageType , class CommIdentType , class VectorType >
+class VectorImport {
+private:
+
+  // rank == 1 or array_layout == LayoutRight
+  enum { OK = Kokkos::Impl::StaticAssert<
+           ( VectorType::rank == 1 ) ||
+           std::is_same< typename VectorType::array_layout , Kokkos::LayoutRight >::value
+         >::value };
+
+  typedef typename VectorType::HostMirror HostVectorType ;
+
+  enum { ReceiveInPlace =
+    std::is_same< typename VectorType::memory_space ,
+                           typename HostVectorType::memory_space >::value };
+
+  const CommMessageType  recv_msg ;
+  const CommMessageType  send_msg ;
+  const CommIdentType    send_nodeid ;
+  VectorType             send_buffer ;
+  HostVectorType         host_send_buffer ;
+  HostVectorType         host_recv_buffer ;
+  unsigned               chunk ;
+
+public:
+
+  const MPI_Comm         comm ;
+  const unsigned         count_owned ;
+  const unsigned         count_receive ;
+
+  struct Pack {
+    typedef typename VectorType::execution_space execution_space ;
+    const CommIdentType  index ;
+    const VectorType     source ;
+    const VectorType     buffer ;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const unsigned i ) const
+      { buffer( i ) = source( index(i) ); }
+
+    Pack( const CommIdentType  & arg_index ,
+          const VectorType     & arg_source ,
+          const VectorType     & arg_buffer )
+      : index( arg_index )
+      , source( arg_source )
+      , buffer( arg_buffer )
+    {
+      Kokkos::parallel_for( index.dimension_0() , *this );
+      execution_space::fence();
+    }
+  };
+
+  VectorImport( MPI_Comm arg_comm ,
+                const CommMessageType & arg_recv_msg ,
+                const CommMessageType & arg_send_msg ,
+                const CommIdentType   & arg_send_nodeid ,
+                const unsigned arg_count_owned ,
+                const unsigned arg_count_receive )
+    : recv_msg( arg_recv_msg )
+    , send_msg( arg_send_msg )
+    , send_nodeid( arg_send_nodeid )
+    , send_buffer()
+    , host_send_buffer()
+    , host_recv_buffer()
+    , comm( arg_comm )
+    , count_owned( arg_count_owned )
+    , count_receive( arg_count_receive )
+    {
+      if ( ! ReceiveInPlace ) {
+        host_recv_buffer = HostVectorType("recv_buffer",count_receive);
+      }
+
+      unsigned send_count = 0 ;
+      for ( unsigned i = 0 ; i < send_msg.dimension_0() ; ++i ) { send_count += send_msg(i,1); }
+      send_buffer      = VectorType("send_buffer",send_count);
+      host_send_buffer = Kokkos::create_mirror_view( send_buffer );
+    }
+
+  inline
+  void operator()( const VectorType & v ) const
+  {
+    typedef typename VectorType::value_type  scalar_type ;
+
+    const int mpi_tag = 42 ;
+    const unsigned chunk = v.dimension_1();
+
+    // Subvector for receives
+    const std::pair<unsigned,unsigned> recv_range( count_owned , count_owned + count_receive );
+    const VectorType recv_vector = Kokkos::subview( v , recv_range );
+
+    std::vector< MPI_Request > recv_request( recv_msg.dimension_0() , MPI_REQUEST_NULL );
+
+    { // Post receives
+      scalar_type * ptr =
+        ReceiveInPlace ? recv_vector.ptr_on_device() : host_recv_buffer.ptr_on_device();
+
+      for ( size_t i = 0 ; i < recv_msg.dimension_0() ; ++i ) {
+        const int proc  = recv_msg(i,0);
+        const int count = recv_msg(i,1) * chunk ;
+
+        MPI_Irecv( ptr , count * sizeof(scalar_type) , MPI_BYTE ,
+                   proc , mpi_tag , comm , & recv_request[i] );
+
+        ptr += count ;
+      }
+    }
+
+    MPI_Barrier( comm );
+
+    { // Pack and send 
+      const Pack pack( send_nodeid , v , send_buffer );
+
+      Kokkos::deep_copy( host_send_buffer , send_buffer );
+
+      scalar_type * ptr = host_send_buffer.ptr_on_device();
+
+      for ( size_t i = 0 ; i < send_msg.dimension_0() ; ++i ) {
+        const int proc  = send_msg(i,0);
+        const int count = send_msg(i,1) * chunk ;
+
+        // MPI_Ssend blocks until
+        // (1) a receive is matched for the message and
+        // (2) the send buffer can be re-used.
+        //
+        // It is suggested that MPI_Ssend will have the best performance:
+        // http://www.mcs.anl.gov/research/projects/mpi/sendmode.html .
+
+        MPI_Ssend( ptr ,
+                   count * sizeof(scalar_type) , MPI_BYTE ,
+                   proc , mpi_tag , comm );
+
+        ptr += count ;
+      }
+    }
+
+    // Wait for receives and verify:
+
+    for ( size_t i = 0 ; i < recv_msg.dimension_0() ; ++i ) {
+      MPI_Status recv_status ;
+      int recv_which = 0 ;
+      int recv_size  = 0 ;
+
+      MPI_Waitany( recv_msg.dimension_0() , & recv_request[0] , & recv_which , & recv_status );
+
+      const int recv_proc = recv_status.MPI_SOURCE ;
+
+      MPI_Get_count( & recv_status , MPI_BYTE , & recv_size );
+
+      // Verify message properly received:
+
+      const int  expected_proc = recv_msg(recv_which,0);
+      const int  expected_size = recv_msg(recv_which,1) * chunk * sizeof(scalar_type);
+
+      if ( ( expected_proc != recv_proc ) ||
+           ( expected_size != recv_size ) ) {
+
+        int local_rank  = 0 ;
+
+        MPI_Comm_rank( comm , & local_rank );
+
+        std::ostringstream msg ;
+        msg << "VectorImport error:"
+            << " P" << local_rank
+            << " received from P" << recv_proc
+            << " size "     << recv_size
+            << " expected " << expected_size
+            << " from P"    << expected_proc ;
+        throw std::runtime_error( msg.str() );
+      }
+    }
+
+    // Copy received data to device memory.
+
+    if ( ! ReceiveInPlace ) { Kokkos::deep_copy( recv_vector , host_recv_buffer ); }
+  }
+};
+
+} // namespace Example
+} // namespace Kokkos
+
+#endif
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_VECTORIMPORT_HPP */
+
+
diff --git a/packages/kokkos/example/common/WrapMPI.hpp b/packages/kokkos/example/common/WrapMPI.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cad2bb5cdfe0d094fed6a6ac83977ef3e1e104da
--- /dev/null
+++ b/packages/kokkos/example/common/WrapMPI.hpp
@@ -0,0 +1,103 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_WRAP_MPI
+#define KOKKOS_EXAMPLE_WRAP_MPI
+
+#include <Kokkos_Macros.hpp>
+#include <string>
+
+#if defined( KOKKOS_ENABLE_MPI )
+
+#include <mpi.h>
+
+namespace Kokkos {
+namespace Example {
+
+inline
+double all_reduce( double value , MPI_Comm comm )
+{
+  double local = value ;
+  MPI_Allreduce( & local , & value , 1 , MPI_DOUBLE , MPI_SUM , comm );
+  return value ;
+}
+
+inline
+double all_reduce_max( double value , MPI_Comm comm )
+{
+  double local = value ;
+  MPI_Allreduce( & local , & value , 1 , MPI_DOUBLE , MPI_MAX , comm );
+  return value ;
+}
+
+} // namespace Example
+} // namespace Kokkos
+
+#elif ! defined( KOKKOS_ENABLE_MPI )
+
+/* Wrap the MPI_Comm type and heavily used MPI functions
+ * to reduce the number of '#if defined( KOKKOS_ENABLE_MPI )'
+ * blocks which have to be sprinkled throughout the examples.
+ */
+
+typedef int MPI_Comm ;
+
+inline int MPI_Comm_size( MPI_Comm , int * size ) { *size = 1 ; return 0 ; }
+inline int MPI_Comm_rank( MPI_Comm , int * rank ) { *rank = 0 ; return 0 ; }
+inline int MPI_Barrier( MPI_Comm ) { return 0; }
+
+namespace Kokkos {
+namespace Example {
+
+inline
+double all_reduce( double value , MPI_Comm ) { return value ; }
+
+inline
+double all_reduce_max( double value , MPI_Comm ) { return value ; }
+
+} // namespace Example
+} // namespace Kokkos
+
+#endif /* ! defined( KOKKOS_ENABLE_MPI ) */
+#endif /* #ifndef KOKKOS_EXAMPLE_WRAP_MPI */
+
diff --git a/packages/kokkos/example/feint/CMakeLists.txt b/packages/kokkos/example/feint/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0018b9f9f538de77ce776daaa267a037714387ad
--- /dev/null
+++ b/packages/kokkos/example/feint/CMakeLists.txt
@@ -0,0 +1,18 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../common)
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../fixture)
+
+SET(SOURCES "")
+
+FILE(GLOB SOURCES *.cpp)
+
+LIST( APPEND SOURCES ../fixture/BoxElemPart.cpp)
+
+TRIBITS_ADD_EXECUTABLE(
+  feint
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  )
+
diff --git a/packages/kokkos/example/feint/ElemFunctor.hpp b/packages/kokkos/example/feint/ElemFunctor.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..eebe777d98004f9441c50d1a3aef20db47aae9a4
--- /dev/null
+++ b/packages/kokkos/example/feint/ElemFunctor.hpp
@@ -0,0 +1,485 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_FEINT_FUNCTORS_HPP
+#define KOKKOS_EXAMPLE_FEINT_FUNCTORS_HPP
+
+#include <cstdio>
+#include <Kokkos_Core.hpp>
+#include <BoxElemFixture.hpp>
+
+namespace Kokkos {
+namespace Example {
+
+/** \brief  Numerically integrate a function on a finite element mesh and
+ *          project the integrated values to nodes.
+ */
+template< class FixtureType ,
+          class FunctionType ,
+          bool PerformScatterAddWithAtomic >
+struct FiniteElementIntegration ;
+
+// Specialized for an 'Example::BoxElemFixture' finite element mesh
+template< class Device , BoxElemPart::ElemOrder ElemOrder , class GridMap ,
+          class FunctionType ,
+          bool PerformScatterAddWithAtomic >
+struct FiniteElementIntegration<
+  Kokkos::Example::BoxElemFixture< Device , ElemOrder , GridMap > ,
+  FunctionType ,
+  PerformScatterAddWithAtomic >
+{
+  // Element mesh types:
+  typedef Kokkos::Example::BoxElemFixture< Device , ElemOrder >
+    BoxFixtureType ;
+
+  typedef Kokkos::Example::HexElement_Data< BoxFixtureType::ElemNode >
+    HexElemDataType ;
+
+  enum { ElemNodeCount    = HexElemDataType::element_node_count  };
+  enum { IntegrationCount = HexElemDataType::integration_count };
+  enum { ValueCount       = FunctionType::value_count };
+
+  // Dictionary of view types:
+  typedef View<int*,                              Device> ElemErrorType ;
+  typedef View<double*[ElemNodeCount][ValueCount],Device> ElemValueType ;
+  typedef View<double*[ValueCount],               Device> NodeValueType ;
+
+  // Data members for this Functor:
+  const HexElemDataType  m_hex_elem_data ; ///< Master element
+  const BoxFixtureType   m_box_fixture ;   ///< Unstructured mesh data
+  const FunctionType     m_function ;      ///< Function to integrate
+  const ElemErrorType    m_elem_error ;    ///< Flags for element errors
+  const ElemValueType    m_elem_integral ; ///< Per-element quantities
+  const NodeValueType    m_node_lumped ;   ///< Quantities lumped to nodes
+
+  //----------------------------------------
+
+  FiniteElementIntegration(
+    const BoxFixtureType & box_fixture ,
+    const FunctionType   & function )
+    : m_hex_elem_data()
+    , m_box_fixture( box_fixture ) // Shallow copy of the mesh fixture
+    , m_function( function )
+    , m_elem_error(    "elem_error"    , box_fixture.elem_count() )
+    , m_elem_integral( "elem_integral" , box_fixture.elem_count() )
+    , m_node_lumped(   "node_lumped"   , box_fixture.node_count() )
+    {}
+
+  //----------------------------------------
+  // Device for parallel dispatch.
+  typedef typename Device::execution_space execution_space;
+
+  // Value type for global parallel reduction.
+  struct value_type {
+    double value[ ValueCount ]; ///< Integrated quantities
+    int    error ;              ///< Element inversion flag
+  };
+
+  //----------------------------------------
+  // Transform element interpolation function gradients and
+  // compute determinant of spatial jacobian.
+  KOKKOS_INLINE_FUNCTION
+  float transform_gradients(
+    const float  grad[][  ElemNodeCount ] , // Gradient of bases master element
+    const double coord[][ ElemNodeCount ] ,
+          float  dpsi[][  ElemNodeCount ] ) const
+  {
+    enum { TensorDim = 9 };
+    enum { j11 = 0 , j12 = 1 , j13 = 2 ,
+           j21 = 3 , j22 = 4 , j23 = 5 ,
+           j31 = 6 , j32 = 7 , j33 = 8 };
+
+    // Temporary for jacobian accumulation is double for summation accuracy.
+    double J[ TensorDim ] = { 0, 0, 0,  0, 0, 0,  0, 0, 0 };
+
+    for( int i = 0; i < ElemNodeCount ; ++i ) {
+      J[j11] += grad[0][i] * coord[0][i] ;
+      J[j12] += grad[0][i] * coord[1][i] ;
+      J[j13] += grad[0][i] * coord[2][i] ;
+
+      J[j21] += grad[1][i] * coord[0][i] ;
+      J[j22] += grad[1][i] * coord[1][i] ;
+      J[j23] += grad[1][i] * coord[2][i] ;
+
+      J[j31] += grad[2][i] * coord[0][i] ;
+      J[j32] += grad[2][i] * coord[1][i] ;
+      J[j33] += grad[2][i] * coord[2][i] ;
+    }
+
+    // Inverse jacobian, compute as double and store as float.
+    float invJ[ TensorDim ] = {
+      float( J[j22] * J[j33] - J[j23] * J[j32] ) ,
+      float( J[j13] * J[j32] - J[j12] * J[j33] ) ,
+      float( J[j12] * J[j23] - J[j13] * J[j22] ) ,
+
+      float( J[j23] * J[j31] - J[j21] * J[j33] ) ,
+      float( J[j11] * J[j33] - J[j13] * J[j31] ) ,
+      float( J[j13] * J[j21] - J[j11] * J[j23] ) ,
+
+      float( J[j21] * J[j32] - J[j22] * J[j31] ) ,
+      float( J[j12] * J[j31] - J[j11] * J[j32] ) ,
+      float( J[j11] * J[j22] - J[j12] * J[j21] ) };
+
+    const float detJ = J[j11] * invJ[j11] +
+                       J[j21] * invJ[j12] +
+                       J[j31] * invJ[j13] ;
+
+    {
+      const float detJinv = 1.0 / detJ ;
+      for ( int i = 0 ; i < TensorDim ; ++i ) { invJ[i] *= detJinv ; }
+    }
+
+    // Transform gradients:
+    for ( int i = 0; i < ElemNodeCount ; ++i ) {
+      dpsi[0][i] = grad[0][i] * invJ[j11] +
+                   grad[1][i] * invJ[j12] +
+                   grad[2][i] * invJ[j13];
+      dpsi[1][i] = grad[0][i] * invJ[j21] +
+                   grad[1][i] * invJ[j22] +
+                   grad[2][i] * invJ[j23];
+      dpsi[2][i] = grad[0][i] * invJ[j31] +
+                   grad[1][i] * invJ[j32] +
+                   grad[2][i] * invJ[j33];
+    }
+
+    return detJ ;
+  }
+
+  // Functor's function called for each element in the mesh
+  // to numerically integrate the function and add element quantities
+  // to the global integral.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int ielem , value_type & update ) const
+  {
+    // Local temporaries for gathering nodal data.
+    double node_coord[3][ ElemNodeCount ];
+
+    int inode[ ElemNodeCount ] ;
+
+    // Gather indices of element's node from global memory to local memory.
+    for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
+      inode[i] = m_box_fixture.elem_node( ielem , i );
+    }
+
+    // Gather coordinates of element's nodes from global memory to local memory.
+    for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
+      node_coord[0][i] = m_box_fixture.node_coord( inode[i] , 0 );
+      node_coord[1][i] = m_box_fixture.node_coord( inode[i] , 1 );
+      node_coord[2][i] = m_box_fixture.node_coord( inode[i] , 2 );
+    }
+
+    // Local temporary to accumulate numerical integration
+    // of vector valued function.
+    double accum[ ValueCount ];
+
+    for ( int j = 0 ; j < ValueCount ; ++j ) { accum[j] = 0 ; }
+
+    int error = 0 ;
+
+    // Numerical integration loop for this element:
+    for ( int k = 0 ; k < IntegrationCount ; ++k ) {
+
+      // Integration point in space as interpolated from nodal coordinates:
+      double point[3] = { 0 , 0 , 0 };
+      for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
+        point[0] += node_coord[0][i] * m_hex_elem_data.values[k][i] ;
+        point[1] += node_coord[1][i] * m_hex_elem_data.values[k][i] ;
+        point[2] += node_coord[2][i] * m_hex_elem_data.values[k][i] ;
+      }
+
+      // Example function vector value at cubature point:
+      double val_at_pt[ ValueCount ];
+      m_function( point , val_at_pt );
+
+      // Temporary array for transformed element basis functions' gradient.
+      // Not used in this example, but computed anyway by the more general
+      // deformation function.
+      float dpsi[3][ ElemNodeCount ];
+
+      // Compute deformation jacobian, transform basis function gradient,
+      // and return determinant of deformation jacobian.
+      float detJ = transform_gradients( m_hex_elem_data.gradients[k] ,
+                                        node_coord , dpsi );
+
+      // Check for inverted spatial jacobian
+      if ( detJ <= 0 ) { error = 1 ; detJ = 0 ; }
+
+      // Integration weight.
+      const float w = m_hex_elem_data.weights[k] * detJ ;
+
+      // Cubature of function.
+      for ( int j = 0 ; j < ValueCount ; ++j ) {
+        accum[j] += val_at_pt[j] * w ;
+      }
+    }
+
+    m_elem_error(ielem) = error ;
+
+
+    // Element contribution to global integral:
+
+    if ( error ) { update.error = 1 ; }
+
+    for ( int j = 0 ; j < ValueCount ; ++j ) { update.value[j] += accum[j] ; }
+
+    // Element-node quantity for lumping to nodes:
+    for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
+      for ( int j = 0 ; j < ValueCount ; ++j ) {
+        // Save element's integral apportionment to nodes to global memory
+        m_elem_integral( ielem , i , j ) = accum[j] / ElemNodeCount ;
+      }
+    }
+
+    if ( PerformScatterAddWithAtomic ) {
+      // Option to immediately scatter-add the integrated quantities to nodes.
+      // This is a race condition as two or more threads could attempt
+      // concurrent update of nodal values.  The atomic_fetch_add (+=)
+      // function guarantees that the summation will occur correctly;
+      // however, there can be no guarantee for the order of summation.
+      // Due to non-associativity of floating point arithmetic the result
+      // is non-deterministic within bounds of floating point round-off.
+
+      for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
+        for ( int j = 0 ; j < ValueCount ; ++j ) {
+          Kokkos::atomic_fetch_add( & m_node_lumped( inode[i] , j ) ,
+                                    m_elem_integral( ielem , i , j ) );
+        }
+      }
+    }
+  }
+  //--------------------------------------------------------------------------
+
+  // Initialization of the global reduction value.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & update ) const
+  {
+    for ( int j = 0 ; j < ValueCount ; ++j ) update.value[j] = 0 ;
+    update.error = 0 ;
+  }
+
+  // Join two contributions to global reduction value.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile       value_type & update ,
+             volatile const value_type & input ) const
+  {
+    for ( int j = 0 ; j < ValueCount ; ++j ) update.value[j] += input.value[j] ;
+    if ( input.error ) update.error = 1 ;
+  }
+};
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+template< class ViewElemNode ,
+          class ViewNodeScan ,
+          class ViewNodeElem >
+void map_node_to_elem( const ViewElemNode & elem_node ,
+                       const ViewNodeScan & node_scan ,
+                       const ViewNodeElem & node_elem );
+
+/** \brief  Functor to gather-sum elements' per-node quantities
+ *          to element nodes.  Gather-sum is thread safe and
+ *          does not require atomic updates.
+ */
+template< class ViewNodeValue ,
+          class ViewElemValue ,
+          bool  AlreadyUsedAtomic >
+struct LumpElemToNode {
+
+  typedef typename ViewElemValue::execution_space execution_space ;
+
+  // In this example we know that the ViewElemValue
+  // array specification is < double*[nNode][nValue] >
+
+  enum { value_count = ViewElemValue::dimension::N2 };
+
+  ViewNodeValue             m_node_value ; ///< Integrated values at nodes
+  ViewElemValue             m_elem_value ; ///< Values apportioned to nodes
+  View<int*,   execution_space> m_node_scan ;  ///< Offsets for nodes->element
+  View<int*[2],execution_space> m_node_elem ;  ///< Node->element connectivity
+
+  // Only allocate node->element connectivity if we have
+  // not already used atomic updates for the nodes.
+  template< class ViewElemNode >
+  LumpElemToNode( const ViewNodeValue & node_value ,
+                  const ViewElemValue & elem_value ,
+                  const ViewElemNode  & elem_node )
+    : m_node_value( node_value )
+    , m_elem_value( elem_value )
+    , m_node_scan( "node_scan" ,
+                   AlreadyUsedAtomic ? 0 : node_value.extent(0) + 1 )
+    , m_node_elem( "node_elem" ,
+                   AlreadyUsedAtomic ? 0 : elem_node.extent(0) *
+                                           elem_node.extent(1) )
+    {
+      if ( ! AlreadyUsedAtomic ) {
+        map_node_to_elem( elem_node , m_node_scan , m_node_elem );
+      }
+    }
+
+  //----------------------------------------
+
+  struct value_type { double value[ value_count ]; };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int inode , value_type & update ) const
+  {
+    if ( ! AlreadyUsedAtomic ) {
+      // Sum element quantities to a local variable.
+      value_type local ;
+      for ( int j = 0 ; j < value_count ; ++j ) { local.value[j] = 0 ; }
+
+      {
+        // nodes' element ids span [i,end)
+        int i = m_node_scan(inode);
+        const int end = m_node_scan(inode+1);
+
+        for ( ; i < end ; ++i ) {
+          // element #ielem , local node #ielem_node is this node:
+          const int ielem      = m_node_elem(i,0);
+          const int ielem_node = m_node_elem(i,1);
+          // Sum the vector-valued quantity
+          for ( int j = 0 ; j < value_count ; ++j ) {
+            local.value[j] += m_elem_value( ielem , ielem_node , j );
+          }
+        }
+      }
+
+      // Assign nodal quantity (no race condition).
+      // Sum global value.
+      for ( int j = 0 ; j < value_count ; ++j ) {
+        m_node_value( inode , j ) = local.value[j] ;
+        update.value[j] += local.value[j] ;
+      }
+    }
+    else {
+      // Already used atomic update of the nodal quantity,
+      // query and sum the value.
+      for ( int j = 0 ; j < value_count ; ++j ) {
+        update.value[j] += m_node_value( inode , j );
+      }
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & update ) const
+    { for ( int j = 0 ; j < value_count ; ++j ) { update.value[j] = 0 ; } }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile       value_type & update ,
+             volatile const value_type & input ) const
+    {
+      for ( int j = 0 ; j < value_count ; ++j ) {
+        update.value[j] += input.value[j] ;
+      }
+    }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
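+// Build node->element connectivity in compressed (CSR-like) form on the host:
+// count the elements attached to each node, scan the counts into the
+// node_scan offsets, then fill node_elem with (element, local node) pairs
+// before copying the results back to the device views.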
+template< class ViewElemNode ,
+          class ViewNodeScan ,
+          class ViewNodeElem >
+void map_node_to_elem( const ViewElemNode & elem_node ,
+                       const ViewNodeScan & node_scan ,
+                       const ViewNodeElem & node_elem )
+{
+  typedef typename ViewElemNode::host_mirror_space host_mirror_space ;
+
+  const typename ViewElemNode::HostMirror host_elem_node =
+    Kokkos::create_mirror_view(elem_node);
+
+  const typename ViewNodeScan::HostMirror host_node_scan =
+    Kokkos::create_mirror_view(node_scan);
+
+  const typename ViewNodeElem::HostMirror host_node_elem =
+    Kokkos::create_mirror_view(node_elem);
+
+  const int elem_count      = host_elem_node.extent(0);
+  const int elem_node_count = host_elem_node.extent(1);
+  const int node_count      = host_node_scan.extent(0) - 1 ;
+
+  const View<int*, host_mirror_space >
+    node_elem_count( "node_elem_count" , node_count );
+
+  Kokkos::deep_copy( host_elem_node , elem_node );
+
+  for ( int i = 0 ; i < elem_count ; ++i ) {
+    for ( int j = 0 ; j < elem_node_count ; ++j ) {
+      ++node_elem_count( host_elem_node(i,j) );
+    }
+  }
+
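+  // Prefix-sum the per-node counts into CSR-style offsets (node_scan),
+  // then reset the counts for the fill pass.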
+  for ( int i = 0 ; i < node_count ; ++i ) {
+    host_node_scan(i+1) += host_node_scan(i) + node_elem_count(i);
+    node_elem_count(i) = 0 ;
+  }
+
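+  // Second pass: scatter each (element, local node) pair into the slot
+  // reserved for its node.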
+  for ( int i = 0 ; i < elem_count ; ++i ) {
+    for ( int j = 0 ; j < elem_node_count ; ++j ) {
+      const int inode  = host_elem_node(i,j);
+      const int offset = host_node_scan(inode) + node_elem_count(inode);
+
+      host_node_elem( offset , 0 ) = i ;
+      host_node_elem( offset , 1 ) = j ;
+
+      ++node_elem_count(inode);
+    }
+  }
+
+  Kokkos::deep_copy( node_scan , host_node_scan );
+  Kokkos::deep_copy( node_elem , host_node_elem );
+}
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_EXAMPLE_FEINT_FUNCTORS_HPP */
+
diff --git a/packages/kokkos/example/feint/Makefile b/packages/kokkos/example/feint/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..3f68c1c1cc02a9f851e391de29ab5b74ae49a77b
--- /dev/null
+++ b/packages/kokkos/example/feint/Makefile
@@ -0,0 +1,63 @@
+KOKKOS_PATH = ../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+vpath %.cpp ${KOKKOS_SRC_PATH}/example/fixture ${KOKKOS_SRC_PATH}/example/feint
+
+EXAMPLE_HEADERS = $(wildcard $(KOKKOS_SRC_PATH)/example/common/*.hpp ${KOKKOS_SRC_PATH}/example/fixture/*.hpp ${KOKKOS_SRC_PATH}/example/feint/*.hpp)
+
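+# Typical invocation (assuming the standard Kokkos GNU Make build via
+# Makefile.kokkos), e.g.:
+#   make KOKKOS_DEVICES=OpenMP -j
+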
+default: build_all
+	echo "End Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
+else
+  CXX = g++
+endif
+
+CXXFLAGS = -O3
+LINK ?= $(CXX)
+LDFLAGS ?= 
+
+include $(KOKKOS_PATH)/Makefile.kokkos        
+
+KOKKOS_CXXFLAGS +=	\
+	-I${KOKKOS_SRC_PATH}/example/common	\
+	-I${KOKKOS_SRC_PATH}/example/fixture	\
+	-I${KOKKOS_SRC_PATH}/example/feint
+
+EXE_EXAMPLE_FEINT = KokkosExample_Feint
+OBJ_EXAMPLE_FEINT = BoxElemPart.o main.o
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+  OBJ_EXAMPLE_FEINT += feint_cuda.o
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+  OBJ_EXAMPLE_FEINT += feint_rocm.o
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+  OBJ_EXAMPLE_FEINT += feint_threads.o
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+  OBJ_EXAMPLE_FEINT += feint_openmp.o
+endif
+
+TARGETS = $(EXE_EXAMPLE_FEINT)
+
+#TEST_TARGETS =
+
+$(EXE_EXAMPLE_FEINT) : $(OBJ_EXAMPLE_FEINT) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_EXAMPLE_FEINT) $(KOKKOS_LIBS) $(LIB) -o $(EXE_EXAMPLE_FEINT)
+
+build_all : $(TARGETS)
+
+test : build_all
+
+clean: kokkos-clean
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(EXAMPLE_HEADERS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/packages/kokkos/example/feint/feint.hpp b/packages/kokkos/example/feint/feint.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1be8762a85bd74e789ebadc7b810a5cddfb73183
--- /dev/null
+++ b/packages/kokkos/example/feint/feint.hpp
@@ -0,0 +1,165 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_FEINT_HPP
+#define KOKKOS_EXAMPLE_FEINT_HPP
+
+#include <iostream>
+#include <BoxElemFixture.hpp>
+#include <ElemFunctor.hpp>
+#include <feint_fwd.hpp>
+
+namespace Kokkos {
+namespace Example {
+
+/** \brief  Vector-valued function to numerically integrate.
+ *
+ *  F(X) = { 1 , x , y , z , x*y , y*z , z*x , x*y*z }
+ *
+ *  Integrates on a unit cube to:
+ *    { 1 , 1/2 , 1/2 , 1/2 , 1/4 , 1/4 , 1/4 , 1/8 }
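+ *
+ *  (Each coordinate factor integrates to 1/2 over [0,1], so e.g. the
+ *   x*y*z component integrates to (1/2)^3 = 1/8.)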
+ */
+struct MyFunctionType {
+
+  enum { value_count = 8 };
+
+  // Evaluate function at coordinate.
+  template< typename CoordType , typename ValueType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const CoordType point[] , ValueType value[] ) const
+    {
+      value[0] = 1 ;
+      value[1] = point[0] ;
+      value[2] = point[1] ;
+      value[3] = point[2] ;
+      value[4] = point[0] * point[1] ;
+      value[5] = point[1] * point[2] ;
+      value[6] = point[2] * point[0] ;
+      value[7] = point[0] * point[1] * point[2] ;
+    }
+};
+
+template < class Device , bool UseAtomic >
+void feint(
+  const unsigned global_elem_nx ,
+  const unsigned global_elem_ny ,
+  const unsigned global_elem_nz )
+{
+  //----------------------------------------
+  // Create the unstructured finite element mesh box fixture on the device:
+
+  typedef Kokkos::Example::
+    BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear >
+    // BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemQuadratic >
+      BoxFixtureType ;
+
+  // MPI distributed parallel domain decomposition of the fixture.
+  // Either by element (DecomposeElem) or by node (DecomposeNode)
+  // with ghosted elements.
+
+  static const Kokkos::Example::BoxElemPart::Decompose
+    decompose = Kokkos::Example::BoxElemPart:: DecomposeElem ;
+    // decompose = Kokkos::Example::BoxElemPart:: DecomposeNode ;
+
+  // Not using MPI in this example.
+  const unsigned mpi_rank = 0 ;
+  const unsigned mpi_size = 1 ;
+
+  const BoxFixtureType fixture( decompose , mpi_size , mpi_rank ,
+                                global_elem_nx ,
+                                global_elem_ny ,
+                                global_elem_nz );
+
+  //----------------------------------------
+  // Create and execute the numerical integration functor on the device:
+
+  typedef Kokkos::Example::
+    FiniteElementIntegration< BoxFixtureType , MyFunctionType , UseAtomic >
+      FeintType ;
+
+  const FeintType feint( fixture , MyFunctionType() );
+
+  typename FeintType::value_type elem_integral ;
+
+  // A reduction for the global integral:
+  Kokkos::parallel_reduce( fixture.elem_count() , feint , elem_integral );
+
+  if ( elem_integral.error ) {
+    std::cout << "An element had a spatial jacobian error" << std::endl ;
+    return ;
+  }
+
+  std::cout << "Elem integral =" ;
+  for ( int i = 0 ; i < MyFunctionType::value_count ; ++i ) {
+    std::cout << " " << elem_integral.value[i] ;
+  }
+  std::cout << std::endl ;
+ 
+  //----------------------------------------
+  // Create and execute the nodal lumped value projection and reduction functor:
+
+  typedef Kokkos::Example::
+    LumpElemToNode< typename FeintType::NodeValueType ,
+                    typename FeintType::ElemValueType ,
+                    UseAtomic > LumpType ;
+
+  const LumpType lump( feint.m_node_lumped ,
+                       feint.m_elem_integral ,
+                       fixture.elem_node() );
+
+  typename LumpType ::value_type node_sum ;
+
+  Kokkos::parallel_reduce( fixture.node_count() , lump , node_sum );
+
+  std::cout << "Node lumped sum =" ;
+  for ( int i = 0 ; i < MyFunctionType::value_count ; ++i ) {
+    std::cout << " " << node_sum.value[i] ;
+  }
+  std::cout << std::endl ;
+}
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_EXAMPLE_FEINT_HPP */
+
diff --git a/packages/kokkos/example/feint/feint_cuda.cpp b/packages/kokkos/example/feint/feint_cuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c2f4a68e42aabef38d03761b2af01ebec043cfe0
--- /dev/null
+++ b/packages/kokkos/example/feint/feint_cuda.cpp
@@ -0,0 +1,67 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_ENABLE_CUDA )
+
+#include <feint.hpp>
+
+namespace Kokkos {
+namespace Example {
+
+template void feint<Kokkos::Cuda,false>(
+  const unsigned global_elem_nx ,
+  const unsigned global_elem_ny ,
+  const unsigned global_elem_nz );
+
+template void feint<Kokkos::Cuda,true>(
+  const unsigned global_elem_nx ,
+  const unsigned global_elem_ny ,
+  const unsigned global_elem_nz );
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif
+
diff --git a/packages/kokkos/example/feint/feint_fwd.hpp b/packages/kokkos/example/feint/feint_fwd.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cc969ae572cce07890a8a0012ec4c707d3186c14
--- /dev/null
+++ b/packages/kokkos/example/feint/feint_fwd.hpp
@@ -0,0 +1,60 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_FEINT_FWD_HPP
+#define KOKKOS_EXAMPLE_FEINT_FWD_HPP
+
+namespace Kokkos {
+namespace Example {
+
+template < class Device , bool UseAtomic >
+void feint(
+  const unsigned global_elem_nx = 100 ,
+  const unsigned global_elem_ny = 115 ,
+  const unsigned global_elem_nz = 130 );
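+
+// Explicit instantiations live in the per-backend feint_*.cpp translation
+// units, so a caller only needs this forward declaration, e.g. (see main.cpp):
+//   Kokkos::Example::feint< Kokkos::OpenMP , false >();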
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_EXAMPLE_FEINT_FWD_HPP */
+
diff --git a/packages/kokkos/example/feint/feint_openmp.cpp b/packages/kokkos/example/feint/feint_openmp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8c7e0b6a08364541bc926e32585591baca1de449
--- /dev/null
+++ b/packages/kokkos/example/feint/feint_openmp.cpp
@@ -0,0 +1,67 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#ifdef KOKKOS_ENABLE_OPENMP
+
+#include <feint.hpp>
+
+namespace Kokkos {
+namespace Example {
+
+template void feint<Kokkos::OpenMP,false>(
+  const unsigned global_elem_nx ,
+  const unsigned global_elem_ny ,
+  const unsigned global_elem_nz );
+
+template void feint<Kokkos::OpenMP,true>(
+  const unsigned global_elem_nx ,
+  const unsigned global_elem_ny ,
+  const unsigned global_elem_nz );
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif
+
diff --git a/packages/kokkos/example/feint/feint_rocm.cpp b/packages/kokkos/example/feint/feint_rocm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..65debe44d11e3ff3b51031326f9ca061985a9017
--- /dev/null
+++ b/packages/kokkos/example/feint/feint_rocm.cpp
@@ -0,0 +1,67 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_ENABLE_ROCM )
+
+#include <feint.hpp>
+
+namespace Kokkos {
+namespace Example {
+
+template void feint<Kokkos::Experimental::ROCm,false>(
+  const unsigned global_elem_nx ,
+  const unsigned global_elem_ny ,
+  const unsigned global_elem_nz );
+
+template void feint<Kokkos::Experimental::ROCm,true>(
+  const unsigned global_elem_nx ,
+  const unsigned global_elem_ny ,
+  const unsigned global_elem_nz );
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif
+
diff --git a/packages/kokkos/example/feint/feint_threads.cpp b/packages/kokkos/example/feint/feint_threads.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..560dbf8874ed670840529878ffd4948f42404c6a
--- /dev/null
+++ b/packages/kokkos/example/feint/feint_threads.cpp
@@ -0,0 +1,67 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_ENABLE_THREADS )
+
+#include <feint.hpp>
+
+namespace Kokkos {
+namespace Example {
+
+template void feint< Kokkos::Threads ,false>(
+  const unsigned global_elem_nx ,
+  const unsigned global_elem_ny ,
+  const unsigned global_elem_nz );
+
+template void feint< Kokkos::Threads ,true>(
+  const unsigned global_elem_nx ,
+  const unsigned global_elem_ny ,
+  const unsigned global_elem_nz );
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif /* #if defined( KOKKOS_ENABLE_THREADS ) */
+
diff --git a/packages/kokkos/example/feint/main.cpp b/packages/kokkos/example/feint/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8e8b57cca972aaad80e14e103ea03c1cddeaf67c
--- /dev/null
+++ b/packages/kokkos/example/feint/main.cpp
@@ -0,0 +1,124 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#include <utility>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+#include <feint_fwd.hpp>
+
+int main()
+{
+#if defined( KOKKOS_ENABLE_THREADS )
+  {
+    // Use 4 cores per NUMA region, unless fewer available
+
+    const unsigned use_numa_count     = Kokkos::hwloc::get_available_numa_count();
+    const unsigned use_cores_per_numa = std::min( 4u , Kokkos::hwloc::get_available_cores_per_numa() );
+
+    Kokkos::Threads::initialize( use_numa_count * use_cores_per_numa );
+
+    std::cout << "feint< Threads , NotUsingAtomic >" << std::endl ;
+    Kokkos::Example::feint< Kokkos::Threads , false >();
+
+    std::cout << "feint< Threads , Usingtomic >" << std::endl ;
+    Kokkos::Example::feint< Kokkos::Threads , true  >();
+
+    Kokkos::Threads::finalize();
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_OPENMP )
+  {
+
+    int num_threads  = 0;
+    if ( Kokkos::hwloc::available() ) {
+      // Use 4 cores per NUMA region, unless fewer available
+      const unsigned use_numa_count     = Kokkos::hwloc::get_available_numa_count();
+      const unsigned use_cores_per_numa = std::min( 4u , Kokkos::hwloc::get_available_cores_per_numa() );
+      num_threads = use_numa_count * use_cores_per_numa;
+
+    }
+    else {
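+      // Without hwloc, probe the OpenMP default thread count and then
+      // use a quarter of it, but no fewer than 4 threads.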
+      #pragma omp parallel
+      {
+        #pragma omp atomic
+        ++num_threads;
+      }
+      num_threads = std::max(4, num_threads/4);
+    }
+
+
+    Kokkos::OpenMP::initialize( num_threads );
+
+    std::cout << "feint< OpenMP , NotUsingAtomic >" << std::endl ;
+    Kokkos::Example::feint< Kokkos::OpenMP , false >();
+
+    std::cout << "feint< OpenMP , Usingtomic >" << std::endl ;
+    Kokkos::Example::feint< Kokkos::OpenMP , true  >();
+
+    Kokkos::OpenMP::finalize();
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_CUDA )
+  {
+    // Initialize Host mirror device
+    Kokkos::HostSpace::execution_space::initialize(1);
+    const unsigned device_count = Kokkos::Cuda::detect_device_count();
+
+    // Use the last device:
+    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(device_count-1) );
+
+    std::cout << "feint< Cuda , NotUsingAtomic >" << std::endl ;
+    Kokkos::Example::feint< Kokkos::Cuda , false >();
+
+    std::cout << "feint< Cuda , UsingAtomic >" << std::endl ;
+    Kokkos::Example::feint< Kokkos::Cuda , true  >();
+
+    Kokkos::Cuda::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+
+  }
+#endif
+}
+
diff --git a/packages/kokkos/example/fenl/CGSolve.hpp b/packages/kokkos/example/fenl/CGSolve.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c3d3b0bc6f7ca3bdda69481f5cdfac3bfb950988
--- /dev/null
+++ b/packages/kokkos/example/fenl/CGSolve.hpp
@@ -0,0 +1,296 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_CG_SOLVE
+#define KOKKOS_EXAMPLE_CG_SOLVE
+
+#include <cmath>
+#include <limits>
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Timer.hpp>
+
+#include <WrapMPI.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+template< typename ValueType , class Space >
+struct CrsMatrix {
+  typedef Kokkos::StaticCrsGraph< unsigned , Space , void , unsigned >  StaticCrsGraphType ;
+  typedef View< ValueType * , Space > coeff_type ;
+
+  StaticCrsGraphType  graph ;
+  coeff_type          coeff ;
+
+  CrsMatrix() : graph(), coeff() {}
+
+  CrsMatrix( const StaticCrsGraphType & arg_graph )
+    : graph( arg_graph )
+    , coeff( "crs_matrix_coeff" , arg_graph.entries.extent(0) )
+    {}
+};
+
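+// Sparse matrix-vector product y = A * x for the CrsMatrix above;
+// each parallel_for work item computes one row of the product.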
+template< typename MScalar 
+        , typename VScalar
+        , class Space >
+struct Multiply {
+
+  const Example::CrsMatrix< MScalar , Space >    m_A ;
+  const Kokkos::View< const VScalar * , Space > m_x ;
+  const Kokkos::View<       VScalar * , Space > m_y ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int iRow ) const
+    {
+      const int iEntryBegin = m_A.graph.row_map[iRow];
+      const int iEntryEnd   = m_A.graph.row_map[iRow+1];
+
+      double sum = 0 ;
+
+      for ( int iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
+        sum += m_A.coeff(iEntry) * m_x( m_A.graph.entries(iEntry) );
+      }
+
+      m_y(iRow) = sum ;
+    }
+
+  Multiply( const View<       VScalar * , Space > & y 
+          , const CrsMatrix< MScalar , Space >    & A 
+          , const View< const VScalar * , Space > & x 
+          )
+  : m_A( A ), m_x( x ), m_y( y )
+  {}
+};
+
+template< typename MScalar
+        , typename VScalar
+        , class Space >
+inline
+void multiply( const int nrow
+             , const Kokkos::View< VScalar * , Space >    & y
+             , const Example::CrsMatrix< MScalar , Space > & A
+             , const Kokkos::View< VScalar * , Space >    & x
+             )
+{
+  Kokkos::parallel_for( Kokkos::RangePolicy<Space>(0,nrow), Multiply<MScalar,VScalar,Space>( y , A , x ) );
+}
+
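+// Vector update w(i) = alpha * x(i) + beta * y(i).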
+template< typename ValueType , class Space >
+struct WAXPBY {
+  const Kokkos::View< const ValueType * , Space >  m_x ;
+  const Kokkos::View< const ValueType * , Space >  m_y ;
+  const Kokkos::View<       ValueType * , Space >  m_w ;
+  const double m_alpha ;
+  const double m_beta ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i ) const
+    { m_w(i) = m_alpha * m_x(i) + m_beta * m_y(i); }
+
+  WAXPBY( const View< ValueType * , Space >  & arg_w
+        , const double arg_alpha
+        , const View< ValueType * , Space >  & arg_x
+        , const double arg_beta
+        , const View< ValueType * , Space >  & arg_y
+        )
+    : m_x( arg_x )
+    , m_y( arg_y )
+    , m_w( arg_w )
+    , m_alpha( arg_alpha )
+    , m_beta( arg_beta )
+    {}
+};
+
+template< typename VScalar , class Space >
+void waxpby( const int n
+           , const Kokkos::View< VScalar * , Space > & arg_w
+           , const double                      arg_alpha
+           , const Kokkos::View< VScalar * , Space > & arg_x
+           , const double                      arg_beta
+           , const Kokkos::View< VScalar * , Space > & arg_y
+           )
+{
+  Kokkos::parallel_for( Kokkos::RangePolicy<Space>(0,n), WAXPBY<VScalar,Space>(arg_w,arg_alpha,arg_x,arg_beta,arg_y) );
+}
+
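+// Dot-product reduction: accumulates x(i) * y(i) over the index range.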
+template< typename VScalar , class Space >
+struct Dot {
+  typedef double value_type ;
+
+  const Kokkos::View< const VScalar * , Space >  m_x ;
+  const Kokkos::View< const VScalar * , Space >  m_y ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i , value_type & update ) const
+    { update += m_x(i) * m_y(i); }
+
+  Dot( const Kokkos::View< VScalar * , Space >  & arg_x
+     , const Kokkos::View< VScalar * , Space >  & arg_y
+     )
+    : m_x(arg_x), m_y(arg_y) {}
+};
+
+template< typename VScalar , class Space >
+double dot( const int n
+          , const Kokkos::View< VScalar * , Space > & arg_x
+          , const Kokkos::View< VScalar * , Space > & arg_y
+          )
+{
+  double result = 0 ;
+  Kokkos::parallel_reduce( Kokkos::RangePolicy<Space>(0,n) , Dot<VScalar,Space>( arg_x , arg_y ) , result );
+  return result ;
+}
+
+} // namespace Example
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+struct CGSolveResult {
+  size_t  iteration ;
+  double  iter_time ;
+  double  matvec_time ;
+  double  norm_res ;
+};
+
+template< class ImportType
+        , typename MScalar
+        , typename VScalar
+        , class Space
+        >
+inline
+void cgsolve( const ImportType & import
+            , const CrsMatrix< MScalar , Space >      & A
+            , const Kokkos::View< VScalar * , Space > & b
+            , const Kokkos::View< VScalar * , Space > & x
+            , const size_t  maximum_iteration = 200
+            , const double  tolerance = std::numeric_limits<double>::epsilon()
+            , CGSolveResult * result = 0
+            )
+{
+  typedef View< VScalar * , Space >  VectorType ;
+
+  const size_t count_owned = import.count_owned ;
+  const size_t count_total = import.count_owned + import.count_receive;
+
+  size_t  iteration = 0 ;
+  double  iter_time = 0 ;
+  double  matvec_time = 0 ;
+  double  norm_res = 0 ;
+
+  // Need input vector to matvec to be owned + received
+  VectorType pAll ( "cg::p" , count_total );
+
+  VectorType p = Kokkos::subview( pAll , std::pair<size_t,size_t>(0,count_owned) );
+  VectorType r ( "cg::r" , count_owned );
+  VectorType Ap( "cg::Ap", count_owned );
+
+  /* r = b - A * x ; */
+
+  /* p  = x       */  Kokkos::deep_copy( p , x );
+  /* import p     */  import( pAll );
+  /* Ap = A * p   */  multiply( count_owned , Ap , A , pAll );
+  /* r = b - Ap   */  waxpby( count_owned , r , 1.0 , b , -1.0 , Ap );
+  /* p  = r       */  Kokkos::deep_copy( p , r );
+
+  double old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm );
+
+  norm_res  = std::sqrt( old_rdot );
+  iteration = 0 ;
+
+  Kokkos::Timer wall_clock ;
+  Kokkos::Timer timer;
+
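+  // Unpreconditioned conjugate gradient iteration:
+  //   alpha = (r,r) / (p,Ap) ,  x += alpha*p ,  r -= alpha*Ap ,
+  //   beta  = (r_new,r_new) / (r_old,r_old) ,  p = r + beta*p .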
+  while ( tolerance < norm_res && iteration < maximum_iteration ) {
+
+    /* pAp_dot = dot( p , Ap = A * p ) */
+
+    timer.reset();
+    /* import p    */  import( pAll );
+    /* Ap = A * p  */  multiply( count_owned , Ap , A , pAll );
+    Space::fence();
+    matvec_time += timer.seconds();
+
+    const double pAp_dot = Kokkos::Example::all_reduce( dot( count_owned , p , Ap ) , import.comm );
+    const double alpha   = old_rdot / pAp_dot ;
+
+    /* x +=  alpha * p ;  */ waxpby( count_owned , x ,  alpha, p  , 1.0 , x );
+    /* r += -alpha * Ap ; */ waxpby( count_owned , r , -alpha, Ap , 1.0 , r );
+
+    const double r_dot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm );
+    const double beta  = r_dot / old_rdot ;
+
+    /* p = r + beta * p ; */ waxpby( count_owned , p , 1.0 , r , beta , p );
+
+    norm_res = std::sqrt( old_rdot = r_dot );
+
+    ++iteration ;
+  }
+
+  Space::fence();
+  iter_time = wall_clock.seconds();
+
+  if ( 0 != result ) {
+    result->iteration   = iteration ;
+    result->iter_time   = iter_time ;
+    result->matvec_time = matvec_time ;
+    result->norm_res    = norm_res ;
+  }
+}
+
+} // namespace Example
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_EXAMPLE_CG_SOLVE */
+
+
diff --git a/packages/kokkos/example/fenl/CMakeLists.txt b/packages/kokkos/example/fenl/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..150656b16e13d4977c6ea975b87a785103cc7d48
--- /dev/null
+++ b/packages/kokkos/example/fenl/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../common)
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../fixture)
+
+SET(SOURCES "")
+
+FILE( GLOB SOURCES *.cpp )
+
+LIST( APPEND SOURCES ../fixture/BoxElemPart.cpp )
+
+TRIBITS_ADD_EXECUTABLE(
+  fenl
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  )
diff --git a/packages/kokkos/example/fenl/Makefile b/packages/kokkos/example/fenl/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..24a0e61c18c4ce9efa1568534cfb4ad8bfccde9a
--- /dev/null
+++ b/packages/kokkos/example/fenl/Makefile
@@ -0,0 +1,50 @@
+KOKKOS_PATH ?= ../..
+
+MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+SRC_DIR := $(dir $(MAKEFILE_PATH))
+
+vpath %.cpp ${SRC_DIR}/../fixture ${SRC_DIR}
+
+EXAMPLE_HEADERS = $(wildcard $(SRC_DIR)/../common/*.hpp ${SRC_DIR}/../fixture/*.hpp ${SRC_DIR}/*.hpp)
+
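+# KOKKOS_PATH and KOKKOS_DEVICES may be overridden on the command line,
+# e.g. (assuming the standard Makefile.kokkos build):
+#   make KOKKOS_PATH=/path/to/kokkos KOKKOS_DEVICES=Cuda
+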
+default: build_all
+	echo "End Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
+else
+  CXX = g++
+endif
+
+CXXFLAGS = -O3
+LINK ?= $(CXX)
+LDFLAGS ?=
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+KOKKOS_CXXFLAGS +=	\
+	-I${SRC_DIR}/../common	\
+	-I${SRC_DIR}/../fixture	\
+	-I${SRC_DIR}
+
+EXE_EXAMPLE_FENL = KokkosExample_Fenl
+OBJ_EXAMPLE_FENL = BoxElemPart.o main.o fenl.o
+
+TARGETS = $(EXE_EXAMPLE_FENL)
+
+#TEST_TARGETS =
+
+$(EXE_EXAMPLE_FENL) : $(OBJ_EXAMPLE_FENL) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_EXAMPLE_FENL) $(KOKKOS_LIBS) $(LIB) -o $(EXE_EXAMPLE_FENL)
+
+build_all : $(TARGETS)
+
+test : build_all
+
+clean: kokkos-clean
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(EXAMPLE_HEADERS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/packages/kokkos/example/fenl/fenl.cpp b/packages/kokkos/example/fenl/fenl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1178ec6b9c9c551c5d6effda50b3ac7922fe1807
--- /dev/null
+++ b/packages/kokkos/example/fenl/fenl.cpp
@@ -0,0 +1,138 @@
+/*
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+*/
+
+#include <HexElement.hpp>
+#include <fenl_impl.hpp>
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+
+#if defined( KOKKOS_ENABLE_THREADS )
+
+template
+Perf fenl< Kokkos::Threads , Kokkos::Example::BoxElemPart::ElemLinear >(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+
+template
+Perf fenl< Kokkos::Threads , Kokkos::Example::BoxElemPart::ElemQuadratic >(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+#endif
+
+
+#if defined (KOKKOS_ENABLE_OPENMP)
+
+template
+Perf fenl< Kokkos::OpenMP , Kokkos::Example::BoxElemPart::ElemLinear >(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+
+template
+Perf fenl< Kokkos::OpenMP , Kokkos::Example::BoxElemPart::ElemQuadratic >(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+#endif
+
+#if defined( KOKKOS_ENABLE_CUDA )
+
+template
+Perf fenl< Kokkos::Cuda , Kokkos::Example::BoxElemPart::ElemLinear >(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+
+template
+Perf fenl< Kokkos::Cuda , Kokkos::Example::BoxElemPart::ElemQuadratic >(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+#endif
+
+#if defined( KOKKOS_ENABLE_ROCM )
+
+template
+Perf fenl< Kokkos::Experimental::ROCm , Kokkos::Example::BoxElemPart::ElemLinear >(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+
+template
+Perf fenl< Kokkos::Experimental::ROCm , Kokkos::Example::BoxElemPart::ElemQuadratic >(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+#endif
+
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos */
+
diff --git a/packages/kokkos/example/fenl/fenl.hpp b/packages/kokkos/example/fenl/fenl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f97a4c76b38ce3b9aa1528cd6665ce44e672ea67
--- /dev/null
+++ b/packages/kokkos/example/fenl/fenl.hpp
@@ -0,0 +1,89 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_FENL_HPP
+#define KOKKOS_EXAMPLE_FENL_HPP
+
+#include <cstdlib>
+#include <BoxElemPart.hpp>
+#include <WrapMPI.hpp>
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+
+struct Perf {
+  size_t global_elem_count ;
+  size_t global_node_count ;
+  size_t newton_iter_count ;
+  size_t cg_iter_count ;
+  double map_ratio ;
+  double fill_node_set ;
+  double scan_node_count ;
+  double fill_graph_entries ;
+  double sort_graph_entries ;
+  double fill_element_graph ;
+  double create_sparse_matrix ;
+  double fill_time ;
+  double bc_time ;
+  double matvec_time ;
+  double cg_time ;
+  double newton_residual ;
+  double error_max ;
+
+};
+
+template < class Device , BoxElemPart::ElemOrder ElemOrder >
+Perf fenl(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_EXAMPLE_FENL_HPP */
+
diff --git a/packages/kokkos/example/fenl/fenl_functors.hpp b/packages/kokkos/example/fenl/fenl_functors.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d0b484b5fb8fc31bec626cf8e1a943f0dd2deac5
--- /dev/null
+++ b/packages/kokkos/example/fenl/fenl_functors.hpp
@@ -0,0 +1,1173 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_FENLFUNCTORS_HPP
+#define KOKKOS_EXAMPLE_FENLFUNCTORS_HPP
+
+#include <cstdio>
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <cstdlib>
+#include <cmath>
+#include <limits>
+
+#include <Kokkos_Pair.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <impl/Kokkos_Timer.hpp>
+
+#include <BoxElemFixture.hpp>
+#include <HexElement.hpp>
+#include <CGSolve.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+
+template< class ElemNodeIdView , class CrsGraphType , unsigned ElemNode >
+class NodeNodeGraph {
+public:
+
+  typedef typename ElemNodeIdView::execution_space  execution_space ;
+  typedef pair<unsigned,unsigned> key_type ;
+
+  typedef Kokkos::UnorderedMap< key_type, void , execution_space >  SetType ;
+  typedef typename CrsGraphType::row_map_type::non_const_type       RowMapType ;
+  typedef Kokkos::View< unsigned ,  execution_space >               UnsignedValue ;
+
+  // Static dimensions of 0 generate compiler warnings or errors.
+  typedef Kokkos::View< unsigned*[ElemNode][ElemNode] , execution_space >
+    ElemGraphType ;
+
+  struct TagFillNodeSet {};
+  struct TagScanNodeCount {};
+  struct TagFillGraphEntries {};
+  struct TagSortGraphEntries {};
+  struct TagFillElementGraph {};
+
+private:
+
+  enum PhaseType { FILL_NODE_SET ,
+                   SCAN_NODE_COUNT ,
+                   FILL_GRAPH_ENTRIES ,
+                   SORT_GRAPH_ENTRIES ,
+                   FILL_ELEMENT_GRAPH };
+
+  const unsigned        node_count ;
+  const ElemNodeIdView  elem_node_id ;
+  UnsignedValue         row_total ;
+  RowMapType            row_count ;
+  RowMapType            row_map ;
+  SetType               node_node_set ;
+  PhaseType             phase ;
+
+public:
+
+  CrsGraphType          graph ;
+  ElemGraphType         elem_graph ;
+
+  struct Times
+  {
+    double ratio;
+    double fill_node_set;
+    double scan_node_count;
+    double fill_graph_entries;
+    double sort_graph_entries;
+    double fill_element_graph;
+  };
+
+  NodeNodeGraph( const ElemNodeIdView & arg_elem_node_id ,
+                 const unsigned         arg_node_count,
+                 Times & results
+               )
+    : node_count(arg_node_count)
+    , elem_node_id( arg_elem_node_id )
+    , row_total( "row_total" )
+    , row_count(Kokkos::ViewAllocateWithoutInitializing("row_count") , node_count ) // will deep_copy to 0 inside loop
+    , row_map( "graph_row_map" , node_count + 1 )
+    , node_node_set()
+    , phase( FILL_NODE_SET )
+    , graph()
+    , elem_graph()
+   {
+      //--------------------------------
+      // Guess at capacity required for the map:
+
+      Kokkos::Timer wall_clock ;
+
+      wall_clock.reset();
+      phase = FILL_NODE_SET ;
+
+      // upper bound on the capacity
+      size_t set_capacity = (28ull * node_count) / 2;
+      unsigned failed_insert_count = 0 ;
+
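+      // Insert every (row,col) node pair into the set; if the capacity
+      // guess was too small, grow it by the failed-insert count and retry.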
+      do {
+        // Zero the row count to restart the fill
+        Kokkos::deep_copy( row_count , 0u );
+
+        node_node_set = SetType( ( set_capacity += failed_insert_count ) );
+
+        // May be larger than requested:
+        set_capacity = node_node_set.capacity();
+
+        Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space,TagFillNodeSet>(0,elem_node_id.extent(0))
+                               , *this
+                               , failed_insert_count );
+
+      } while ( failed_insert_count );
+
+      execution_space::fence();
+      results.ratio = (double)node_node_set.size() / (double)node_node_set.capacity();
+      results.fill_node_set = wall_clock.seconds();
+      //--------------------------------
+
+      wall_clock.reset();
+      phase = SCAN_NODE_COUNT ;
+
+      // Exclusive scan of row_count into row_map
+      // including the final total in the 'node_count + 1' position.
+      // Zero the 'row_count' values.
+      Kokkos::parallel_scan( node_count , *this );
+
+      // Zero the row count for the fill:
+      Kokkos::deep_copy( row_count , 0u );
+
+      unsigned graph_entry_count = 0 ;
+
+      Kokkos::deep_copy( graph_entry_count , row_total );
+
+      // Assign graph's row_map and allocate graph's entries
+      graph.row_map = row_map ;
+      graph.entries = typename CrsGraphType::entries_type( "graph_entries" , graph_entry_count );
+
+      //--------------------------------
+      // Fill graph's entries from the (node,node) set.
+
+      execution_space::fence();
+      results.scan_node_count = wall_clock.seconds();
+
+      wall_clock.reset();
+      phase = FILL_GRAPH_ENTRIES ;
+      Kokkos::parallel_for( node_node_set.capacity() , *this );
+
+      execution_space::fence();
+      results.fill_graph_entries = wall_clock.seconds();
+
+      //--------------------------------
+      // Done with the temporary sets and arrays
+      wall_clock.reset();
+      phase = SORT_GRAPH_ENTRIES ;
+
+      row_total = UnsignedValue();
+      row_count = RowMapType();
+      row_map   = RowMapType();
+      node_node_set.clear();
+
+      //--------------------------------
+
+      Kokkos::parallel_for( node_count , *this );
+
+      execution_space::fence();
+      results.sort_graph_entries = wall_clock.seconds();
+
+      //--------------------------------
+      // Element-to-graph mapping:
+      wall_clock.reset();
+      phase = FILL_ELEMENT_GRAPH ;
+      elem_graph = ElemGraphType("elem_graph", elem_node_id.extent(0) );
+      Kokkos::parallel_for( elem_node_id.extent(0) , *this );
+
+      execution_space::fence();
+      results.fill_element_graph = wall_clock.seconds();
+    }
+
+  //------------------------------------
+  // parallel_for: create map and count row length
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagFillNodeSet & , unsigned ielem , unsigned & count ) const
+  {
+    // Loop over element's (row_local_node,col_local_node) pairs:
+    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) {
+
+      const unsigned row_node = elem_node_id( ielem , row_local_node );
+
+      for ( unsigned col_local_node = row_local_node ; col_local_node < elem_node_id.extent(1) ; ++col_local_node ) {
+
+        const unsigned col_node = elem_node_id( ielem , col_local_node );
+
+        // If either node is locally owned then insert the pair into the unordered map:
+
+        if ( row_node < row_count.extent(0) || col_node < row_count.extent(0) ) {
+
+          const key_type key = (row_node < col_node) ? make_pair( row_node, col_node ) : make_pair( col_node, row_node ) ;
+
+          const typename SetType::insert_result result = node_node_set.insert( key );
+
+          // A successful insert: the first time this pair was added
+          if ( result.success() ) {
+
+            // If row node is owned then increment count
+            if ( row_node < row_count.extent(0) ) { atomic_fetch_add( & row_count( row_node ) , 1 ); }
+
+            // If column node is owned and not equal to row node then increment count
+            if ( col_node < row_count.extent(0) && col_node != row_node ) { atomic_fetch_add( & row_count( col_node ) , 1 ); }
+          }
+          else if ( result.failed() ) {
+            ++count ;
+          }
+        }
+      }
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void fill_graph_entries( const unsigned iset ) const
+  {
+    if ( node_node_set.valid_at(iset) ) {
+      // Add each entry to the graph entries.
+
+      const key_type key = node_node_set.key_at(iset) ;
+      const unsigned row_node = key.first ;
+      const unsigned col_node = key.second ;
+
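+      // row_count (zeroed after the scan) is reused as a per-row fill cursor:
+      // the atomic increment returns the next free slot within each row.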
+      if ( row_node < row_count.extent(0) ) {
+        const unsigned offset = graph.row_map( row_node ) + atomic_fetch_add( & row_count( row_node ) , 1 );
+        graph.entries( offset ) = col_node ;
+      }
+
+      if ( col_node < row_count.extent(0) && col_node != row_node ) {
+        const unsigned offset = graph.row_map( col_node ) + atomic_fetch_add( & row_count( col_node ) , 1 );
+        graph.entries( offset ) = row_node ;
+      }
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void sort_graph_entries( const unsigned irow ) const
+  {
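+    // Insertion sort of the column indices within this row; rows are short,
+    // so the quadratic cost is negligible.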
+    const unsigned row_beg = graph.row_map( irow );
+    const unsigned row_end = graph.row_map( irow + 1 );
+    for ( unsigned i = row_beg + 1 ; i < row_end ; ++i ) {
+      const unsigned col = graph.entries(i);
+      unsigned j = i ;
+      for ( ; row_beg < j && col < graph.entries(j-1) ; --j ) {
+        graph.entries(j) = graph.entries(j-1);
+      }
+      graph.entries(j) = col ;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void fill_elem_graph_map( const unsigned ielem ) const
+  {
+    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) {
+
+      const unsigned row_node = elem_node_id( ielem , row_local_node );
+
+      for ( unsigned col_local_node = 0 ; col_local_node < elem_node_id.extent(1) ; ++col_local_node ) {
+
+        const unsigned col_node = elem_node_id( ielem , col_local_node );
+
+        unsigned entry = ~0u ;
+
+        if ( row_node + 1 < graph.row_map.extent(0) ) {
+
+          const unsigned entry_end = graph.row_map( row_node + 1 );
+
+          entry = graph.row_map( row_node );
+
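+          // Linear search of the row for col_node; reset to ~0u below if not found.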
+          for ( ; entry < entry_end && graph.entries(entry) != col_node ; ++entry );
+
+          if ( entry == entry_end ) entry = ~0u ;
+        }
+
+        elem_graph( ielem , row_local_node , col_local_node ) = entry ;
+      }
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned iwork ) const
+  {
+/*
+    if ( phase == FILL_NODE_SET ) {
+      operator()( TagFillNodeSet() , iwork );
+    }
+    else */
+    if ( phase == FILL_GRAPH_ENTRIES ) {
+      fill_graph_entries( iwork );
+    }
+    else if ( phase == SORT_GRAPH_ENTRIES ) {
+      sort_graph_entries( iwork );
+    }
+    else if ( phase == FILL_ELEMENT_GRAPH ) {
+      fill_elem_graph_map( iwork );
+    }
+  }
+
+  //------------------------------------
+  // parallel_scan: row offsets
+
+  typedef unsigned value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned irow , unsigned & update , const bool final ) const
+  {
+    // exclusive scan
+    if ( final ) { row_map( irow ) = update ; }
+
+    update += row_count( irow );
+
+    if ( final ) {
+      if ( irow + 1 == row_count.extent(0) ) {
+        row_map( irow + 1 ) = update ;
+        row_total()         = update ;
+      }
+    }
+  }
+
+  // For the reduce phase:
+  KOKKOS_INLINE_FUNCTION
+  void init( const TagFillNodeSet & , unsigned & update ) const { update = 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( const TagFillNodeSet &
+           , volatile       unsigned & update
+           , volatile const unsigned & input ) const { update += input ; }
+
+  // For the scan phase:
+  KOKKOS_INLINE_FUNCTION
+  void init( unsigned & update ) const { update = 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile       unsigned & update
+           , volatile const unsigned & input ) const { update += input ; }
+
+  //------------------------------------
+};
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos  */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+
+template< class ElemCompType >
+class NodeElemGatherFill {
+public:
+
+  typedef typename ElemCompType::execution_space         execution_space ;
+  typedef typename ElemCompType::vector_type         vector_type ;
+  typedef typename ElemCompType::sparse_matrix_type  sparse_matrix_type ;
+  typedef typename ElemCompType::elem_node_type      elem_node_type ;
+  typedef typename ElemCompType::elem_vectors_type   elem_vectors_type ;
+  typedef typename ElemCompType::elem_matrices_type  elem_matrices_type ;
+  typedef typename ElemCompType::elem_graph_type     elem_graph_type ;
+
+  static const unsigned ElemNodeCount = ElemCompType::ElemNodeCount ;
+
+  //------------------------------------
+
+private:
+
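+  // Node->element adjacency: each entry stores the element id and this node's
+  // local index within that element (hence the unsigned[2] entry type).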
+  typedef Kokkos::StaticCrsGraph< unsigned[2] , execution_space >  CrsGraphType ;
+  typedef typename CrsGraphType::row_map_type::non_const_type  RowMapType ;
+  typedef Kokkos::View< unsigned ,  execution_space >              UnsignedValue ;
+
+  enum PhaseType { FILL_NODE_COUNT ,
+                   SCAN_NODE_COUNT ,
+                   FILL_GRAPH_ENTRIES ,
+                   SORT_GRAPH_ENTRIES ,
+                   GATHER_FILL };
+
+  const elem_node_type  elem_node_id ;
+  const elem_graph_type elem_graph ;
+  UnsignedValue         row_total ;
+  RowMapType            row_count ;
+  RowMapType            row_map ;
+  CrsGraphType          graph ;
+  vector_type           residual ;
+  sparse_matrix_type    jacobian ;
+  elem_vectors_type     elem_residual ;
+  elem_matrices_type    elem_jacobian ;
+  PhaseType             phase ;
+
+public:
+
+  NodeElemGatherFill()
+    : elem_node_id()
+    , elem_graph()
+    , row_total()
+    , row_count()
+    , row_map()
+    , graph()
+    , residual()
+    , jacobian()
+    , elem_residual()
+    , elem_jacobian()
+    , phase( FILL_NODE_COUNT )
+    {}
+
+  NodeElemGatherFill( const NodeElemGatherFill & rhs )
+    : elem_node_id(  rhs.elem_node_id )
+    , elem_graph(    rhs.elem_graph )
+    , row_total(     rhs.row_total )
+    , row_count(     rhs.row_count )
+    , row_map(       rhs.row_map )
+    , graph(         rhs.graph )
+    , residual(      rhs.residual )
+    , jacobian(      rhs.jacobian )
+    , elem_residual( rhs.elem_residual )
+    , elem_jacobian( rhs.elem_jacobian )
+    , phase(         rhs.phase )
+    {}
+
+  NodeElemGatherFill( const elem_node_type     & arg_elem_node_id ,
+                      const elem_graph_type    & arg_elem_graph ,
+                      const vector_type        & arg_residual ,
+                      const sparse_matrix_type & arg_jacobian ,
+                      const elem_vectors_type  & arg_elem_residual ,
+                      const elem_matrices_type & arg_elem_jacobian )
+    : elem_node_id( arg_elem_node_id )
+    , elem_graph( arg_elem_graph )
+    , row_total( "row_total" )
+    , row_count( "row_count" , arg_residual.extent(0) )
+    , row_map( "graph_row_map" , arg_residual.extent(0) + 1 )
+    , graph()
+    , residual( arg_residual )
+    , jacobian( arg_jacobian )
+    , elem_residual( arg_elem_residual )
+    , elem_jacobian( arg_elem_jacobian )
+    , phase( FILL_NODE_COUNT )
+    {
+      //--------------------------------
+      // Count node->element relations
+
+      phase = FILL_NODE_COUNT ;
+
+      Kokkos::parallel_for( elem_node_id.extent(0) , *this );
+
+      //--------------------------------
+
+      phase = SCAN_NODE_COUNT ;
+
+      // Exclusive scan of row_count into row_map,
+      // storing the grand total in the final entry.
+      // The 'row_count' values are zeroed afterwards for reuse.
+      Kokkos::parallel_scan( residual.extent(0) , *this );
+
+      // Zero the row count for the fill:
+      Kokkos::deep_copy( row_count , typename RowMapType::value_type(0) );
+
+      unsigned graph_entry_count = 0 ;
+
+      Kokkos::deep_copy( graph_entry_count , row_total );
+
+      // Assign graph's row_map and allocate graph's entries
+      graph.row_map = row_map ;
+
+      typedef typename CrsGraphType::entries_type graph_entries_type ;
+
+      graph.entries = graph_entries_type( "graph_entries" , graph_entry_count );
+
+      //--------------------------------
+      // Fill graph's entries from the (node,node) set.
+
+      phase = FILL_GRAPH_ENTRIES ;
+
+      Kokkos::deep_copy( row_count , 0u );
+      Kokkos::parallel_for( elem_node_id.extent(0) , *this );
+
+      execution_space::fence();
+
+      //--------------------------------
+      // Done with the temporary sets and arrays
+
+      row_total = UnsignedValue();
+      row_count = RowMapType();
+      row_map   = RowMapType();
+
+      //--------------------------------
+
+      phase = SORT_GRAPH_ENTRIES ;
+      Kokkos::parallel_for( residual.extent(0) , *this );
+
+      execution_space::fence();
+
+      phase = GATHER_FILL ;
+    }
+
+  void apply() const
+  {
+    Kokkos::parallel_for( residual.extent(0) , *this );
+  }
+
+  //------------------------------------
+  //------------------------------------
+  // parallel_for: Count node->element pairs
+
+  KOKKOS_INLINE_FUNCTION
+  void fill_node_count( const unsigned ielem ) const
+  {
+    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) {
+
+      const unsigned row_node = elem_node_id( ielem , row_local_node );
+
+      if ( row_node < row_count.extent(0) ) {
+        atomic_fetch_add( & row_count( row_node ) , 1 );
+      }
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void fill_graph_entries( const unsigned ielem ) const
+  {
+    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) {
+
+      const unsigned row_node = elem_node_id( ielem , row_local_node );
+
+      if ( row_node < row_count.extent(0) ) {
+
+        const unsigned offset = graph.row_map( row_node ) + atomic_fetch_add( & row_count( row_node ) , 1 );
+
+        graph.entries( offset , 0 ) = ielem ;
+        graph.entries( offset , 1 ) = row_local_node ;
+      }
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void sort_graph_entries( const unsigned irow ) const
+  {
+    const unsigned row_beg = graph.row_map( irow );
+    const unsigned row_end = graph.row_map( irow + 1 );
+    for ( unsigned i = row_beg + 1 ; i < row_end ; ++i ) {
+      const unsigned elem  = graph.entries(i,0);
+      const unsigned local = graph.entries(i,1);
+      unsigned j = i ;
+      for ( ; row_beg < j && elem < graph.entries(j-1,0) ; --j ) {
+        graph.entries(j,0) = graph.entries(j-1,0);
+        graph.entries(j,1) = graph.entries(j-1,1);
+      }
+      graph.entries(j,0) = elem ;
+      graph.entries(j,1) = local ;
+    }
+  }
+
+  //------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  void gather_fill( const unsigned irow ) const
+  {
+    const unsigned node_elem_begin = graph.row_map(irow);
+    const unsigned node_elem_end   = graph.row_map(irow+1);
+
+    //  for each element that a node belongs to
+
+    for ( unsigned i = node_elem_begin ; i < node_elem_end ; i++ ) {
+
+      const unsigned elem_id   = graph.entries( i, 0);
+      const unsigned row_index = graph.entries( i, 1);
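+      // elem_id: an element containing node irow; row_index: the node's local index in that element.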
+
+      residual(irow) += elem_residual(elem_id, row_index);
+
+      //  for each node in a particular related element
+      //  gather the contents of the element stiffness
+      //  matrix that belong in irow
+
+      for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) {
+        const unsigned A_index = elem_graph( elem_id , row_index , j );
+
+        jacobian.coeff( A_index ) += elem_jacobian( elem_id, row_index, j );
+      }
+    }
+  }
+
+  //------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned iwork ) const
+  {
+    if ( phase == FILL_NODE_COUNT ) {
+      fill_node_count( iwork );
+    }
+    else if ( phase == FILL_GRAPH_ENTRIES ) {
+      fill_graph_entries( iwork );
+    }
+    else if ( phase == SORT_GRAPH_ENTRIES ) {
+      sort_graph_entries( iwork );
+    }
+    else if ( phase == GATHER_FILL ) {
+      gather_fill( iwork );
+    }
+  }
+
+  //------------------------------------
+  // parallel_scan: row offsets
+
+  typedef unsigned value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned irow , unsigned & update , const bool final ) const
+  {
+    // exclusive scan
+    if ( final ) { row_map( irow ) = update ; }
+
+    update += row_count( irow );
+
+    if ( final ) {
+      if ( irow + 1 == row_count.extent(0) ) {
+        row_map( irow + 1 ) = update ;
+        row_total()         = update ;
+      }
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( unsigned & update ) const { update = 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile unsigned & update , const volatile unsigned & input ) const { update += input ; }
+};
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos  */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+
+template< class FiniteElementMeshType , class SparseMatrixType >
+class ElementComputation ;
+
+
+template< class ExecSpace , BoxElemPart::ElemOrder Order , class CoordinateMap , typename ScalarType >
+class ElementComputation<
+  Kokkos::Example::BoxElemFixture< ExecSpace , Order , CoordinateMap > ,
+  Kokkos::Example::CrsMatrix< ScalarType , ExecSpace > >
+{
+public:
+
+  typedef Kokkos::Example::BoxElemFixture< ExecSpace, Order, CoordinateMap >  mesh_type ;
+  typedef Kokkos::Example::HexElement_Data< mesh_type::ElemNode >             element_data_type ;
+
+  typedef Kokkos::Example::CrsMatrix< ScalarType , ExecSpace >  sparse_matrix_type ;
+  typedef typename sparse_matrix_type::StaticCrsGraphType       sparse_graph_type ;
+
+  typedef ExecSpace   execution_space ;
+  typedef ScalarType  scalar_type ;
+
+  static const unsigned SpatialDim       = element_data_type::spatial_dimension ;
+  static const unsigned TensorDim        = SpatialDim * SpatialDim ;
+  static const unsigned ElemNodeCount    = element_data_type::element_node_count ;
+  static const unsigned FunctionCount    = element_data_type::function_count ;
+  static const unsigned IntegrationCount = element_data_type::integration_count ;
+
+  //------------------------------------
+
+  typedef typename mesh_type::node_coord_type                                      node_coord_type ;
+  typedef typename mesh_type::elem_node_type                                       elem_node_type ;
+  typedef Kokkos::View< scalar_type*[FunctionCount][FunctionCount] , execution_space > elem_matrices_type ;
+  typedef Kokkos::View< scalar_type*[FunctionCount] ,                execution_space > elem_vectors_type ;
+  typedef Kokkos::View< scalar_type* ,                               execution_space > vector_type ;
+
+  typedef typename NodeNodeGraph< elem_node_type , sparse_graph_type , ElemNodeCount >::ElemGraphType elem_graph_type ;
+
+  //------------------------------------
+
+
+  //------------------------------------
+  // Computational data:
+
+  const element_data_type   elem_data ;
+  const elem_node_type      elem_node_ids ;
+  const node_coord_type     node_coords ;
+  const elem_graph_type     elem_graph ;
+  const elem_matrices_type  elem_jacobians ;
+  const elem_vectors_type   elem_residuals ;
+  const vector_type         solution ;
+  const vector_type         residual ;
+  const sparse_matrix_type  jacobian ;
+  const scalar_type         coeff_K ;
+
+  ElementComputation( const ElementComputation & rhs )
+    : elem_data()
+    , elem_node_ids( rhs.elem_node_ids )
+    , node_coords(   rhs.node_coords )
+    , elem_graph(    rhs.elem_graph )
+    , elem_jacobians( rhs.elem_jacobians )
+    , elem_residuals( rhs.elem_residuals )
+    , solution( rhs.solution )
+    , residual( rhs.residual )
+    , jacobian( rhs.jacobian )
+    , coeff_K( rhs.coeff_K )
+    {}
+
+  // If the element->sparse_matrix graph is provided then perform atomic updates.
+  // Otherwise fill per-element contributions for a subsequent gather-add into the residual and jacobian.
+  ElementComputation( const mesh_type          & arg_mesh ,
+	              const scalar_type          arg_coeff_K ,
+                      const vector_type        & arg_solution ,
+                      const elem_graph_type    & arg_elem_graph ,
+                      const sparse_matrix_type & arg_jacobian ,
+                      const vector_type        & arg_residual )
+    : elem_data()
+    , elem_node_ids( arg_mesh.elem_node() )
+    , node_coords(   arg_mesh.node_coord() )
+    , elem_graph(    arg_elem_graph )
+    , elem_jacobians()
+    , elem_residuals()
+    , solution( arg_solution )
+    , residual( arg_residual )
+    , jacobian( arg_jacobian )
+    , coeff_K( arg_coeff_K )
+    {}
+
+  ElementComputation( const mesh_type    & arg_mesh ,
+	              const scalar_type    arg_coeff_K ,
+                      const vector_type  & arg_solution )
+    : elem_data()
+    , elem_node_ids( arg_mesh.elem_node() )
+    , node_coords(   arg_mesh.node_coord() )
+    , elem_graph()
+    , elem_jacobians( "elem_jacobians" , arg_mesh.elem_count() )
+    , elem_residuals( "elem_residuals" , arg_mesh.elem_count() )
+    , solution( arg_solution )
+    , residual()
+    , jacobian()
+    , coeff_K( arg_coeff_K )
+    {}
+
+  //------------------------------------
+
+  void apply() const
+  {
+    parallel_for( elem_node_ids.extent(0) , *this );
+  }
+
+  //------------------------------------
+
+  static const unsigned FLOPS_transform_gradients =
+     /* Jacobian */           FunctionCount * TensorDim * 2 +
+     /* Inverse jacobian */   TensorDim * 6 + 6 +
+     /* Gradient transform */ FunctionCount * 15 ;
+
+  KOKKOS_INLINE_FUNCTION
+  float transform_gradients(
+    const float grad[][ FunctionCount ] , // Gradient of bases master element
+    const double x[] ,
+    const double y[] ,
+    const double z[] ,
+    float dpsidx[] ,
+    float dpsidy[] ,
+    float dpsidz[] ) const
+  {
+    enum { j11 = 0 , j12 = 1 , j13 = 2 ,
+           j21 = 3 , j22 = 4 , j23 = 5 ,
+           j31 = 6 , j32 = 7 , j33 = 8 };
+
+    // Jacobian accumulation:
+
+    double J[ TensorDim ] = { 0, 0, 0,  0, 0, 0,  0, 0, 0 };
+
+    for( unsigned i = 0; i < FunctionCount ; ++i ) {
+      const double x1 = x[i] ;
+      const double x2 = y[i] ;
+      const double x3 = z[i] ;
+
+      const float g1 = grad[0][i] ;
+      const float g2 = grad[1][i] ;
+      const float g3 = grad[2][i] ;
+
+      J[j11] += g1 * x1 ;
+      J[j12] += g1 * x2 ;
+      J[j13] += g1 * x3 ;
+
+      J[j21] += g2 * x1 ;
+      J[j22] += g2 * x2 ;
+      J[j23] += g2 * x3 ;
+
+      J[j31] += g3 * x1 ;
+      J[j32] += g3 * x2 ;
+      J[j33] += g3 * x3 ;
+    }
+
+    // Inverse jacobian:
+
+    float invJ[ TensorDim ] = {
+      static_cast<float>( J[j22] * J[j33] - J[j23] * J[j32] ) ,
+      static_cast<float>( J[j13] * J[j32] - J[j12] * J[j33] ) ,
+      static_cast<float>( J[j12] * J[j23] - J[j13] * J[j22] ) ,
+
+      static_cast<float>( J[j23] * J[j31] - J[j21] * J[j33] ) ,
+      static_cast<float>( J[j11] * J[j33] - J[j13] * J[j31] ) ,
+      static_cast<float>( J[j13] * J[j21] - J[j11] * J[j23] ) ,
+
+      static_cast<float>( J[j21] * J[j32] - J[j22] * J[j31] ) ,
+      static_cast<float>( J[j12] * J[j31] - J[j11] * J[j32] ) ,
+      static_cast<float>( J[j11] * J[j22] - J[j12] * J[j21] ) };
+
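+    // Determinant by cofactor expansion along the first column; invJ still holds the unscaled cofactors.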
+    const float detJ = J[j11] * invJ[j11] +
+                       J[j21] * invJ[j12] +
+                       J[j31] * invJ[j13] ;
+
+    const float detJinv = 1.0 / detJ ;
+
+    for ( unsigned i = 0 ; i < TensorDim ; ++i ) { invJ[i] *= detJinv ; }
+
+    // Transform gradients:
+
+    for( unsigned i = 0; i < FunctionCount ; ++i ) {
+      const float g0 = grad[0][i];
+      const float g1 = grad[1][i];
+      const float g2 = grad[2][i];
+
+      dpsidx[i] = g0 * invJ[j11] + g1 * invJ[j12] + g2 * invJ[j13];
+      dpsidy[i] = g0 * invJ[j21] + g1 * invJ[j22] + g2 * invJ[j23];
+      dpsidz[i] = g0 * invJ[j31] + g1 * invJ[j32] + g2 * invJ[j33];
+    }
+
+    return detJ ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void contributeResidualJacobian(
+    const float coeff_k ,
+    const double dof_values[] ,
+    const float dpsidx[] ,
+    const float dpsidy[] ,
+    const float dpsidz[] ,
+    const float detJ ,
+    const float integ_weight ,
+    const float bases_vals[] ,
+    double elem_res[] ,
+    double elem_mat[][ FunctionCount ] ) const
+  {
+    double value_at_pt = 0 ;
+    double gradx_at_pt = 0 ;
+    double grady_at_pt = 0 ;
+    double gradz_at_pt = 0 ;
+
+    for ( unsigned m = 0 ; m < FunctionCount ; m++ ) {
+      value_at_pt += dof_values[m] * bases_vals[m] ;
+      gradx_at_pt += dof_values[m] * dpsidx[m] ;
+      grady_at_pt += dof_values[m] * dpsidy[m] ;
+      gradz_at_pt += dof_values[m] * dpsidz[m] ;
+    }
+
+    const scalar_type k_detJ_weight = coeff_k        * detJ * integ_weight ;
+    const double res_val = value_at_pt * value_at_pt * detJ * integ_weight ;
+    const double mat_val = 2.0 * value_at_pt         * detJ * integ_weight ;
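+    // res_val is the T^2 source term and mat_val its linearization 2*T, each times detJ and the weight;
+    // k_detJ_weight scales the diffusion contribution.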
+
+    // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$
+    // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$
+
+    for ( unsigned m = 0; m < FunctionCount; ++m) {
+      double * const mat = elem_mat[m] ;
+      const float bases_val_m = bases_vals[m];
+      const float dpsidx_m    = dpsidx[m] ;
+      const float dpsidy_m    = dpsidy[m] ;
+      const float dpsidz_m    = dpsidz[m] ;
+
+      elem_res[m] += k_detJ_weight * ( dpsidx_m * gradx_at_pt +
+                                       dpsidy_m * grady_at_pt +
+                                       dpsidz_m * gradz_at_pt ) +
+                     res_val * bases_val_m ;
+
+      for( unsigned n = 0; n < FunctionCount; n++) {
+
+        mat[n] += k_detJ_weight * ( dpsidx_m * dpsidx[n] +
+                                    dpsidy_m * dpsidy[n] +
+                                    dpsidz_m * dpsidz[n] ) +
+                  mat_val * bases_val_m * bases_vals[n];
+      }
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned ielem ) const
+  {
+    // Gather nodal coordinates and solution vector:
+
+    double x[ FunctionCount ] ;
+    double y[ FunctionCount ] ;
+    double z[ FunctionCount ] ;
+    double val[ FunctionCount ] ;
+    unsigned node_index[ ElemNodeCount ];
+
+    for ( unsigned i = 0 ; i < ElemNodeCount ; ++i ) {
+      const unsigned ni = elem_node_ids( ielem , i );
+
+      node_index[i] = ni ;
+
+      x[i] = node_coords( ni , 0 );
+      y[i] = node_coords( ni , 1 );
+      z[i] = node_coords( ni , 2 );
+
+      val[i] = solution( ni );
+    }
+
+
+    double elem_vec[ FunctionCount ] ;
+    double elem_mat[ FunctionCount ][ FunctionCount ] ;
+
+    for( unsigned i = 0; i < FunctionCount ; i++ ) {
+      elem_vec[i] = 0 ;
+      for( unsigned j = 0; j < FunctionCount ; j++){
+        elem_mat[i][j] = 0 ;
+      }
+    }
+
+
+    for ( unsigned i = 0 ; i < IntegrationCount ; ++i ) {
+      float dpsidx[ FunctionCount ] ;
+      float dpsidy[ FunctionCount ] ;
+      float dpsidz[ FunctionCount ] ;
+
+      const float detJ =
+        transform_gradients( elem_data.gradients[i] , x , y , z ,
+                             dpsidx , dpsidy , dpsidz );
+
+      contributeResidualJacobian( coeff_K ,
+                                  val , dpsidx , dpsidy , dpsidz ,
+                                  detJ ,
+                                  elem_data.weights[i] ,
+                                  elem_data.values[i] ,
+                                  elem_vec , elem_mat );
+    }
+
+#if 0
+
+if ( 1 == ielem ) {
+  printf("ElemResidual { %f %f %f %f %f %f %f %f }\n",
+         elem_vec[0], elem_vec[1], elem_vec[2], elem_vec[3],
+         elem_vec[4], elem_vec[5], elem_vec[6], elem_vec[7]);
+
+  printf("ElemJacobian {\n");
+
+  for ( unsigned j = 0 ; j < FunctionCount ; ++j ) {
+  printf("  { %f %f %f %f %f %f %f %f }\n",
+         elem_mat[j][0], elem_mat[j][1], elem_mat[j][2], elem_mat[j][3],
+         elem_mat[j][4], elem_mat[j][5], elem_mat[j][6], elem_mat[j][7]);
+  }
+  printf("}\n");
+}
+
+#endif
+
+    if ( ! residual.extent(0) ) {
+      for( unsigned i = 0; i < FunctionCount ; i++){
+        elem_residuals(ielem, i) = elem_vec[i] ;
+        for( unsigned j = 0; j < FunctionCount ; j++){
+          elem_jacobians(ielem, i, j) = elem_mat[i][j] ;
+        }
+      }
+    }
+    else {
+      for( unsigned i = 0 ; i < FunctionCount ; i++ ) {
+        const unsigned row = node_index[i] ;
+        if ( row < residual.extent(0) ) {
+          atomic_fetch_add( & residual( row ) , elem_vec[i] );
+
+          for( unsigned j = 0 ; j < FunctionCount ; j++ ) {
+            const unsigned entry = elem_graph( ielem , i , j );
+            if ( entry != ~0u ) {
+              atomic_fetch_add( & jacobian.coeff( entry ) , elem_mat[i][j] );
+            }
+          }
+        }
+      }
+    }
+  }
+}; /* ElementComputation */
+
+//----------------------------------------------------------------------------
+
+template< class FixtureType , class SparseMatrixType >
+class DirichletComputation ;
+
+template< class ExecSpace , BoxElemPart::ElemOrder Order , class CoordinateMap , typename ScalarType >
+class DirichletComputation<
+  Kokkos::Example::BoxElemFixture< ExecSpace , Order , CoordinateMap > ,
+  Kokkos::Example::CrsMatrix< ScalarType , ExecSpace > >
+{
+public:
+
+  typedef Kokkos::Example::BoxElemFixture< ExecSpace, Order, CoordinateMap >  mesh_type ;
+  typedef typename mesh_type::node_coord_type                                 node_coord_type ;
+  typedef typename node_coord_type::value_type                                scalar_coord_type ;
+
+  typedef Kokkos::Example::CrsMatrix< ScalarType , ExecSpace >  sparse_matrix_type ;
+  typedef typename sparse_matrix_type::StaticCrsGraphType       sparse_graph_type ;
+
+  typedef ExecSpace   execution_space ;
+  typedef ScalarType  scalar_type ;
+
+  //------------------------------------
+
+  typedef Kokkos::View< scalar_type* , execution_space > vector_type ;
+
+  //------------------------------------
+  // Computational data:
+
+  const node_coord_type     node_coords ;
+  const vector_type         solution ;
+  const sparse_matrix_type  jacobian ;
+  const vector_type         residual ;
+  const scalar_type         bc_lower_value ;
+  const scalar_type         bc_upper_value ;
+  const scalar_coord_type   bc_lower_limit ;
+  const scalar_coord_type   bc_upper_limit ;
+  const unsigned            bc_plane ;
+  const unsigned            node_count ;
+        bool                init ;
+
+
+  DirichletComputation( const mesh_type          & arg_mesh ,
+                        const vector_type        & arg_solution ,
+                        const sparse_matrix_type & arg_jacobian ,
+                        const vector_type        & arg_residual ,
+                        const unsigned             arg_bc_plane ,
+                        const scalar_type          arg_bc_lower_value ,
+                        const scalar_type          arg_bc_upper_value )
+    : node_coords( arg_mesh.node_coord() )
+    , solution(    arg_solution )
+    , jacobian(    arg_jacobian )
+    , residual(    arg_residual )
+    , bc_lower_value( arg_bc_lower_value )
+    , bc_upper_value( arg_bc_upper_value )
+    , bc_lower_limit( std::numeric_limits<scalar_coord_type>::epsilon() )
+    , bc_upper_limit( scalar_coord_type(1) - std::numeric_limits<scalar_coord_type>::epsilon() )
+    , bc_plane(       arg_bc_plane )
+    , node_count( arg_mesh.node_count_owned() )
+    , init( false )
+    {
+      parallel_for( node_count , *this );
+      init = true ;
+    }
+
+  void apply() const
+  {
+    parallel_for( node_count , *this );
+  }
+
+  //------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned inode ) const
+  {
+    //  Apply the Dirichlet boundary condition on the solution and residual vectors.
+    //  To maintain the symmetry of the original global stiffness matrix,
+    //  zero out the columns that correspond to boundary conditions and
+    //  update the residual vector accordingly.
+
+    const unsigned iBeg = jacobian.graph.row_map[inode];
+    const unsigned iEnd = jacobian.graph.row_map[inode+1];
+
+    const scalar_coord_type c = node_coords(inode,bc_plane);
+    const bool bc_lower = c <= bc_lower_limit ;
+    const bool bc_upper = bc_upper_limit <= c ;
+
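+    // First pass (constructor, init == false): seed the solution with the boundary values.
+    // Subsequent apply() calls modify the residual and jacobian instead.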
+    if ( ! init ) {
+      solution(inode) = bc_lower ? bc_lower_value : (
+                        bc_upper ? bc_upper_value : 0 );
+    }
+    else {
+      if ( bc_lower || bc_upper ) {
+
+        residual(inode) = 0 ;
+
+        //  zero each value on the row, and leave a one
+        //  on the diagonal
+
+        for( unsigned i = iBeg ; i < iEnd ; ++i ) {
+          jacobian.coeff(i) = int(inode) == int(jacobian.graph.entries(i)) ? 1 : 0 ;
+        }
+      }
+      else {
+
+        //  Find any columns that are boundary conditions.
+        //  Clear them and adjust the residual vector
+
+        for( unsigned i = iBeg ; i < iEnd ; ++i ) {
+          const unsigned       cnode = jacobian.graph.entries(i) ;
+          const scalar_coord_type cc = node_coords(cnode,bc_plane);
+
+          if ( ( cc <= bc_lower_limit ) || ( bc_upper_limit <= cc ) ) {
+            jacobian.coeff(i) = 0 ;
+          }
+        }
+      }
+    }
+  }
+};
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos  */
+
+//----------------------------------------------------------------------------
+
+/* A Cuda-specific specialization for the element computation functor. */
+#if defined( __CUDACC__ )
+// #include <NonlinearElement_Cuda.hpp>
+#endif
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_EXAMPLE_FENLFUNCTORS_HPP */
+
diff --git a/packages/kokkos/example/fenl/fenl_impl.hpp b/packages/kokkos/example/fenl/fenl_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1996a79729a72004a5ea0d3d7b04d9d3e9d5243d
--- /dev/null
+++ b/packages/kokkos/example/fenl/fenl_impl.hpp
@@ -0,0 +1,598 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_FENL_IMPL_HPP
+#define KOKKOS_EXAMPLE_FENL_IMPL_HPP
+
+#include <cmath>
+
+// Kokkos libraries' headers:
+
+#include <Kokkos_UnorderedMap.hpp>
+#include <Kokkos_StaticCrsGraph.hpp>
+#include <impl/Kokkos_Timer.hpp>
+
+// Examples headers:
+
+#include <BoxElemFixture.hpp>
+#include <VectorImport.hpp>
+#include <CGSolve.hpp>
+
+#include <fenl.hpp>
+#include <fenl_functors.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+
+inline
+double maximum( MPI_Comm comm , double local )
+{
+  double global = local ;
+#if defined( KOKKOS_ENABLE_MPI )
+  MPI_Allreduce( & local , & global , 1 , MPI_DOUBLE , MPI_MAX , comm );
+#endif
+  return global ;
+}
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+
+class ManufacturedSolution {
+public:
+
+  // Manufactured solution for one dimensional nonlinear PDE
+  //
+  //  -K T_zz + T^2 = 0 ; T(zmin) = T_zmin ; T(zmax) = T_zmax
+  //
+  //  Has an analytic solution of the form:
+  //
+  //    T(z) = ( a ( z - zmin ) + b )^(-2) where K = 1 / ( 6 a^2 )
+  //
+  //  Given T_0 and T_L compute K for this analytic solution.
+  //
+  //  Two analytic solutions:
+  //
+  //    Solution with singularity:
+  //    , a( ( 1.0 / sqrt(T_zmax) + 1.0 / sqrt(T_zmin) ) / ( zmax - zmin ) )
+  //    , b( -1.0 / sqrt(T_zmin) )
+  //
+  //    Solution without singularity:
+  //    , a( ( 1.0 / sqrt(T_zmax) - 1.0 / sqrt(T_zmin) ) / ( zmax - zmin ) )
+  //    , b( 1.0 / sqrt(T_zmin) )
+
+  const double zmin ;
+  const double zmax ;
+  const double T_zmin ;
+  const double T_zmax ;
+  const double a ;
+  const double b ;
+  const double K ;
+
+  ManufacturedSolution( const double arg_zmin ,
+                        const double arg_zmax ,
+                        const double arg_T_zmin ,
+                        const double arg_T_zmax )
+    : zmin( arg_zmin )
+    , zmax( arg_zmax )
+    , T_zmin( arg_T_zmin )
+    , T_zmax( arg_T_zmax )
+    , a( ( 1.0 / std::sqrt(T_zmax) - 1.0 / std::sqrt(T_zmin) ) / ( zmax - zmin ) )
+    , b( 1.0 / std::sqrt(T_zmin) )
+    , K( 1.0 / ( 6.0 * a * a ) )
+    {}
+
+  double operator()( const double z ) const
+  {
+    const double tmp = a * ( z - zmin ) + b ;
+    return 1.0 / ( tmp * tmp );
+  }
+};
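+
+// Worked example: fenl() below constructs ManufacturedSolution( 0 , 1 , 1 , 2 ),
+// i.e. zmin = 0 , zmax = 1 , T_zmin = 1 , T_zmax = 2 , which gives
+//   a = 1/sqrt(2) - 1 , b = 1 , K = 1 / ( 6 a^2 ) ~= 1.94 ,
+// so T(z) = 1 / ( a z + 1 )^2 interpolates T(0) = 1 and T(1) = 2.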
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+
+template < class Space , BoxElemPart::ElemOrder ElemOrder >
+Perf fenl(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int use_elems[] )
+{
+  typedef Kokkos::Example::BoxElemFixture< Space , ElemOrder > FixtureType ;
+
+  typedef Kokkos::Example::CrsMatrix< double , Space >
+    SparseMatrixType ;
+
+  typedef typename SparseMatrixType::StaticCrsGraphType
+    SparseGraphType ;
+
+  typedef Kokkos::Example::FENL::NodeNodeGraph< typename FixtureType::elem_node_type , SparseGraphType , FixtureType::ElemNode >
+     NodeNodeGraphType ;
+
+  typedef Kokkos::Example::FENL::ElementComputation< FixtureType , SparseMatrixType >
+    ElementComputationType ;
+
+  typedef Kokkos::Example::FENL::DirichletComputation< FixtureType , SparseMatrixType >
+    DirichletComputationType ;
+
+  typedef NodeElemGatherFill< ElementComputationType >
+    NodeElemGatherFillType ;
+
+  typedef typename ElementComputationType::vector_type VectorType ;
+
+  typedef Kokkos::Example::VectorImport<
+     typename FixtureType::comm_list_type ,
+     typename FixtureType::send_nodeid_type ,
+     VectorType > ImportType ;
+
+  //------------------------------------
+
+  const unsigned newton_iteration_limit     = 10 ;
+  const double   newton_iteration_tolerance = 1e-7 ;
+  const unsigned cg_iteration_limit         = 200 ;
+  const double   cg_iteration_tolerance     = 1e-7 ;
+
+  //------------------------------------
+
+  const int print_flag = use_print && std::is_same< Kokkos::HostSpace , typename Space::memory_space >::value ;
+
+  int comm_rank ;
+  int comm_size ;
+
+  MPI_Comm_rank( comm , & comm_rank );
+  MPI_Comm_size( comm , & comm_size );
+
+  // Decompose by node to avoid MPI communication during assembly
+
+  const float bubble_x = 1.0 ;
+  const float bubble_y = 1.0 ;
+  const float bubble_z = 1.0 ;
+
+  const FixtureType fixture( BoxElemPart::DecomposeNode , comm_size , comm_rank ,
+                             use_elems[0] , use_elems[1] , use_elems[2] ,
+                             bubble_x , bubble_y , bubble_z );
+
+
+  {
+    int global_error = ! fixture.ok();
+
+#if defined( KOKKOS_ENABLE_MPI )
+    int local_error = global_error ;
+    global_error = 0 ;
+    MPI_Allreduce( & local_error , & global_error , 1 , MPI_INT , MPI_SUM , comm );
+#endif
+
+    if ( global_error ) {
+      throw std::runtime_error(std::string("Error generating finite element fixture"));
+    }
+  }
+
+  //------------------------------------
+
+  const ImportType comm_nodal_import(
+    comm ,
+    fixture.recv_node() ,
+    fixture.send_node() ,
+    fixture.send_nodeid() ,
+    fixture.node_count_owned() ,
+    fixture.node_count() - fixture.node_count_owned() );
+
+  //------------------------------------
+
+  const double bc_lower_value = 1 ;
+  const double bc_upper_value = 2 ;
+
+  const Kokkos::Example::FENL::ManufacturedSolution
+    manufactured_solution( 0 , 1 , bc_lower_value , bc_upper_value  );
+
+  //------------------------------------
+
+  for ( int k = 0 ; k < comm_size && use_print ; ++k ) {
+    if ( k == comm_rank ) {
+      typename FixtureType::node_grid_type::HostMirror
+        h_node_grid = Kokkos::create_mirror_view( fixture.node_grid() );
+
+      typename FixtureType::node_coord_type::HostMirror
+        h_node_coord = Kokkos::create_mirror_view( fixture.node_coord() );
+
+      typename FixtureType::elem_node_type::HostMirror
+        h_elem_node = Kokkos::create_mirror_view( fixture.elem_node() );
+
+      Kokkos::deep_copy( h_node_grid , fixture.node_grid() );
+      Kokkos::deep_copy( h_node_coord , fixture.node_coord() );
+      Kokkos::deep_copy( h_elem_node , fixture.elem_node() );
+
+      std::cout << "MPI[" << comm_rank << "]" << std::endl ;
+      std::cout << "Node grid {" ;
+      for ( unsigned inode = 0 ; inode < fixture.node_count() ; ++inode ) {
+        std::cout << " (" << h_node_grid(inode,0)
+                  << "," << h_node_grid(inode,1)
+                  << "," << h_node_grid(inode,2)
+                  << ")" ;
+      }
+      std::cout << " }" << std::endl ;
+
+      std::cout << "Node coord {" ;
+      for ( unsigned inode = 0 ; inode < fixture.node_count() ; ++inode ) {
+        std::cout << " (" << h_node_coord(inode,0)
+                  << "," << h_node_coord(inode,1)
+                  << "," << h_node_coord(inode,2)
+                  << ")" ;
+      }
+      std::cout << " }" << std::endl ;
+
+      std::cout << "Manufactured solution"
+                << " a[" << manufactured_solution.a << "]"
+                << " b[" << manufactured_solution.b << "]"
+                << " K[" << manufactured_solution.K << "]"
+                << " {" ;
+      for ( unsigned inode = 0 ; inode < fixture.node_count() ; ++inode ) {
+        std::cout << " " << manufactured_solution( h_node_coord( inode , 2 ) );
+      }
+      std::cout << " }" << std::endl ;
+
+      std::cout << "ElemNode {" << std::endl ;
+      for ( unsigned ielem = 0 ; ielem < fixture.elem_count() ; ++ielem ) {
+        std::cout << "  elem[" << ielem << "]{" ;
+        for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) {
+          std::cout << " " << h_elem_node(ielem,inode);
+        }
+        std::cout << " }{" ;
+        for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) {
+          std::cout << " (" << h_node_grid(h_elem_node(ielem,inode),0)
+                    << "," << h_node_grid(h_elem_node(ielem,inode),1)
+                    << "," << h_node_grid(h_elem_node(ielem,inode),2)
+                    << ")" ;
+        }
+        std::cout << " }" << std::endl ;
+      }
+      std::cout << "}" << std::endl ;
+    }
+    std::cout.flush();
+    MPI_Barrier( comm );
+  }
+
+  //------------------------------------
+
+  Kokkos::Timer wall_clock ;
+
+  Perf perf_stats = Perf() ;
+
+  for ( int itrial = 0 ; itrial < use_trials ; ++itrial ) {
+
+    Perf perf = Perf() ;
+
+    perf.global_elem_count = fixture.elem_count_global();
+    perf.global_node_count = fixture.node_count_global();
+
+    //----------------------------------
+    // Create the sparse matrix graph and element-to-graph map
+    // from the element->to->node identifier array.
+    // The graph only has rows for the owned nodes.
+
+    typename NodeNodeGraphType::Times graph_times;
+
+    const NodeNodeGraphType
+      mesh_to_graph( fixture.elem_node() , fixture.node_count_owned(), graph_times );
+
+    perf.map_ratio          = maximum(comm, graph_times.ratio);
+    perf.fill_node_set      = maximum(comm, graph_times.fill_node_set);
+    perf.scan_node_count    = maximum(comm, graph_times.scan_node_count);
+    perf.fill_graph_entries = maximum(comm, graph_times.fill_graph_entries);
+    perf.sort_graph_entries = maximum(comm, graph_times.sort_graph_entries);
+    perf.fill_element_graph = maximum(comm, graph_times.fill_element_graph);
+
+    wall_clock.reset();
+    // Create the sparse matrix from the graph:
+
+    SparseMatrixType jacobian( mesh_to_graph.graph );
+
+    Space::fence();
+
+    perf.create_sparse_matrix = maximum( comm , wall_clock.seconds() );
+
+    //----------------------------------
+
+    for ( int k = 0 ; k < comm_size && print_flag ; ++k ) {
+      if ( k == comm_rank ) {
+        const unsigned nrow = jacobian.graph.numRows();
+        std::cout << "MPI[" << comm_rank << "]" << std::endl ;
+        std::cout << "JacobianGraph {" << std::endl ;
+        for ( unsigned irow = 0 ; irow < nrow ; ++irow ) {
+          std::cout << "  row[" << irow << "]{" ;
+          const unsigned entry_end = jacobian.graph.row_map(irow+1);
+          for ( unsigned entry = jacobian.graph.row_map(irow) ; entry < entry_end ; ++entry ) {
+            std::cout << " " << jacobian.graph.entries(entry);
+          }
+          std::cout << " }" << std::endl ;
+        }
+        std::cout << "}" << std::endl ;
+
+        std::cout << "ElemGraph {" << std::endl ;
+        for ( unsigned ielem = 0 ; ielem < mesh_to_graph.elem_graph.extent(0) ; ++ielem ) {
+          std::cout << "  elem[" << ielem << "]{" ;
+          for ( unsigned irow = 0 ; irow < mesh_to_graph.elem_graph.extent(1) ; ++irow ) {
+            std::cout << " {" ;
+            for ( unsigned icol = 0 ; icol < mesh_to_graph.elem_graph.extent(2) ; ++icol ) {
+              std::cout << " " << mesh_to_graph.elem_graph(ielem,irow,icol);
+            }
+            std::cout << " }" ;
+          }
+          std::cout << " }" << std::endl ;
+        }
+        std::cout << "}" << std::endl ;
+      }
+      std::cout.flush();
+      MPI_Barrier( comm );
+    }
+
+    //----------------------------------
+
+    // Allocate solution vector for each node in the mesh and residual vector for each owned node
+    const VectorType nodal_solution( "nodal_solution" , fixture.node_count() );
+    const VectorType nodal_residual( "nodal_residual" , fixture.node_count_owned() );
+    const VectorType nodal_delta(    "nodal_delta" ,    fixture.node_count_owned() );
+
+    // Create element computation functor
+    const ElementComputationType elemcomp(
+      use_atomic ? ElementComputationType( fixture , manufactured_solution.K , nodal_solution ,
+                                           mesh_to_graph.elem_graph , jacobian , nodal_residual )
+                 : ElementComputationType( fixture , manufactured_solution.K , nodal_solution ) );
+
+    const NodeElemGatherFillType gatherfill(
+      use_atomic ? NodeElemGatherFillType()
+                 : NodeElemGatherFillType( fixture.elem_node() ,
+                                           mesh_to_graph.elem_graph ,
+                                           nodal_residual ,
+                                           jacobian ,
+                                           elemcomp.elem_residuals ,
+                                           elemcomp.elem_jacobians ) );
+
+    // Create boundary condition functor
+    const DirichletComputationType dirichlet(
+      fixture , nodal_solution , jacobian , nodal_residual ,
+      2 /* apply at 'z' ends */ ,
+      manufactured_solution.T_zmin ,
+      manufactured_solution.T_zmax );
+
+    //----------------------------------
+    // Nonlinear Newton iteration:
+
+    double residual_norm_init = 0 ;
+
+    for ( perf.newton_iter_count = 0 ;
+          perf.newton_iter_count < newton_iteration_limit ;
+          ++perf.newton_iter_count ) {
+
+      //--------------------------------
+
+      comm_nodal_import( nodal_solution );
+
+      //--------------------------------
+      // Element contributions to residual and jacobian
+
+      wall_clock.reset();
+
+      Kokkos::deep_copy( nodal_residual , double(0) );
+      Kokkos::deep_copy( jacobian.coeff , double(0) );
+
+      elemcomp.apply();
+
+      if ( ! use_atomic ) {
+        gatherfill.apply();
+      }
+
+      Space::fence();
+      perf.fill_time = maximum( comm , wall_clock.seconds() );
+
+      //--------------------------------
+      // Apply boundary conditions
+
+      wall_clock.reset();
+
+      dirichlet.apply();
+
+      Space::fence();
+      perf.bc_time = maximum( comm , wall_clock.seconds() );
+
+      //--------------------------------
+      // Evaluate convergence
+
+      const double residual_norm =
+        std::sqrt(
+          Kokkos::Example::all_reduce(
+            Kokkos::Example::dot( fixture.node_count_owned() , nodal_residual, nodal_residual ) , comm ) );
+
+      perf.newton_residual = residual_norm ;
+
+      if ( 0 == perf.newton_iter_count ) { residual_norm_init = residual_norm ; }
+
+      if ( residual_norm < residual_norm_init * newton_iteration_tolerance ) { break ; }
+
+      //--------------------------------
+      // Solve for nonlinear update
+
+      CGSolveResult cg_result ;
+
+      Kokkos::Example::cgsolve( comm_nodal_import
+                              , jacobian
+                              , nodal_residual
+                              , nodal_delta
+                              , cg_iteration_limit
+                              , cg_iteration_tolerance
+                              , & cg_result
+                              );
+
+      // Update solution vector
+
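+      // nodal_solution <- nodal_solution - nodal_delta , where the CG solve above
+      // computed nodal_delta from jacobian * nodal_delta = nodal_residual.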
+      Kokkos::Example::waxpby( fixture.node_count_owned() , nodal_solution , -1.0 , nodal_delta , 1.0 , nodal_solution );
+
+      perf.cg_iter_count += cg_result.iteration ;
+      perf.matvec_time   += cg_result.matvec_time ;
+      perf.cg_time       += cg_result.iter_time ;
+
+      //--------------------------------
+
+      if ( print_flag ) {
+        const double delta_norm =
+          std::sqrt(
+            Kokkos::Example::all_reduce(
+              Kokkos::Example::dot( fixture.node_count_owned() , nodal_delta, nodal_delta ) , comm ) );
+
+        if ( 0 == comm_rank ) {
+          std::cout << "Newton iteration[" << perf.newton_iter_count << "]"
+                    << " residual[" << perf.newton_residual << "]"
+                    << " update[" << delta_norm << "]"
+                    << " cg_iteration[" << cg_result.iteration << "]"
+                    << " cg_residual[" << cg_result.norm_res << "]"
+                    << std::endl ;
+        }
+
+        for ( int k = 0 ; k < comm_size ; ++k ) {
+          if ( k == comm_rank ) {
+            const unsigned nrow = jacobian.graph.numRows();
+
+            std::cout << "MPI[" << comm_rank << "]" << std::endl ;
+            std::cout << "Residual {" ;
+            for ( unsigned irow = 0 ; irow < nrow ; ++irow ) {
+              std::cout << " " << nodal_residual(irow);
+            }
+            std::cout << " }" << std::endl ;
+
+            std::cout << "Delta {" ;
+            for ( unsigned irow = 0 ; irow < nrow ; ++irow ) {
+              std::cout << " " << nodal_delta(irow);
+            }
+            std::cout << " }" << std::endl ;
+
+            std::cout << "Solution {" ;
+            for ( unsigned irow = 0 ; irow < nrow ; ++irow ) {
+              std::cout << " " << nodal_solution(irow);
+            }
+            std::cout << " }" << std::endl ;
+
+            std::cout << "Jacobian[ "
+                      << jacobian.graph.numRows() << " x " << Kokkos::maximum_entry( jacobian.graph )
+                      << " ] {" << std::endl ;
+            for ( unsigned irow = 0 ; irow < nrow ; ++irow ) {
+              std::cout << "  {" ;
+              const unsigned entry_end = jacobian.graph.row_map(irow+1);
+              for ( unsigned entry = jacobian.graph.row_map(irow) ; entry < entry_end ; ++entry ) {
+                std::cout << " (" << jacobian.graph.entries(entry)
+                          << "," << jacobian.coeff(entry)
+                          << ")" ;
+              }
+              std::cout << " }" << std::endl ;
+            }
+            std::cout << "}" << std::endl ;
+          }
+          std::cout.flush();
+          MPI_Barrier( comm );
+        }
+      }
+      //--------------------------------
+    }
+
+    // Evaluate solution error
+
+    if ( 0 == itrial ) {
+      const typename FixtureType::node_coord_type::HostMirror
+        h_node_coord = Kokkos::create_mirror_view( fixture.node_coord() );
+
+      const typename VectorType::HostMirror
+        h_nodal_solution = Kokkos::create_mirror_view( nodal_solution );
+
+      Kokkos::deep_copy( h_node_coord , fixture.node_coord() );
+      Kokkos::deep_copy( h_nodal_solution , nodal_solution );
+
+      double error_max = 0 ;
+      for ( unsigned inode = 0 ; inode < fixture.node_count_owned() ; ++inode ) {
+        const double answer = manufactured_solution( h_node_coord( inode , 2 ) );
+        const double error = ( h_nodal_solution(inode) - answer ) / answer ;
+        if ( error_max < fabs( error ) ) { error_max = fabs( error ); }
+      }
+
+      perf.error_max = std::sqrt( Kokkos::Example::all_reduce_max( error_max , comm ) );
+
+      perf_stats = perf ;
+    }
+    else {
+      perf_stats.fill_node_set = std::min( perf_stats.fill_node_set , perf.fill_node_set );
+      perf_stats.scan_node_count = std::min( perf_stats.scan_node_count , perf.scan_node_count );
+      perf_stats.fill_graph_entries = std::min( perf_stats.fill_graph_entries , perf.fill_graph_entries );
+      perf_stats.sort_graph_entries = std::min( perf_stats.sort_graph_entries , perf.sort_graph_entries );
+      perf_stats.fill_element_graph = std::min( perf_stats.fill_element_graph , perf.fill_element_graph );
+      perf_stats.create_sparse_matrix = std::min( perf_stats.create_sparse_matrix , perf.create_sparse_matrix );
+      perf_stats.fill_time = std::min( perf_stats.fill_time , perf.fill_time );
+      perf_stats.bc_time = std::min( perf_stats.bc_time , perf.bc_time );
+      perf_stats.cg_time = std::min( perf_stats.cg_time , perf.cg_time );
+    }
+  }
+
+  return perf_stats ;
+}
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_EXAMPLE_FENL_IMPL_HPP */
+
diff --git a/packages/kokkos/example/fenl/main.cpp b/packages/kokkos/example/fenl/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..086bd4d13f38744337e0341dd194149cca159226
--- /dev/null
+++ b/packages/kokkos/example/fenl/main.cpp
@@ -0,0 +1,446 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#include <cmath>
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include <utility>
+#include <string>
+#include <vector>
+#include <sstream>
+#include <iostream>
+#include <iomanip>
+
+#include <Kokkos_Core.hpp>
+
+#include <WrapMPI.hpp>
+#include <fenl.hpp>
+
+// For vtune
+#include <sys/types.h>
+#include <unistd.h>
+
+//----------------------------------------------------------------------------
+
+enum { CMD_USE_THREADS = 0
+     , CMD_USE_NUMA
+     , CMD_USE_CORE_PER_NUMA
+     , CMD_USE_CUDA
+     , CMD_USE_ROCM
+     , CMD_USE_OPENMP
+     , CMD_USE_CUDA_DEV
+     , CMD_USE_FIXTURE_X
+     , CMD_USE_FIXTURE_Y
+     , CMD_USE_FIXTURE_Z
+     , CMD_USE_FIXTURE_BEGIN
+     , CMD_USE_FIXTURE_END
+     , CMD_USE_FIXTURE_QUADRATIC
+     , CMD_USE_ATOMIC
+     , CMD_USE_TRIALS
+     , CMD_VTUNE
+     , CMD_PRINT
+     , CMD_ECHO
+     , CMD_ERROR
+     , CMD_COUNT };
+
+void print_cmdline( std::ostream & s , const int cmd[] )
+{
+  if ( cmd[ CMD_USE_THREADS ] ) {
+    s << " Threads(" << cmd[ CMD_USE_THREADS ]
+      << ") NUMA(" << cmd[ CMD_USE_NUMA ]
+      << ") CORE_PER_NUMA(" << cmd[ CMD_USE_CORE_PER_NUMA ]
+      << ")" ;
+  }
+  if ( cmd[ CMD_USE_OPENMP ] ) {
+    s << " OpenMP(" << cmd[ CMD_USE_OPENMP ]
+      << ") NUMA(" << cmd[ CMD_USE_NUMA ]
+      << ") CORE_PER_NUMA(" << cmd[ CMD_USE_CORE_PER_NUMA ]
+      << ")" ;
+  }
+  if ( cmd[ CMD_USE_FIXTURE_X ] ) {
+    s << " Fixture(" << cmd[ CMD_USE_FIXTURE_X ]
+      << "x" << cmd[ CMD_USE_FIXTURE_Y ]
+      << "x" << cmd[ CMD_USE_FIXTURE_Z ]
+      << ")" ;
+  }
+  if ( cmd[ CMD_USE_FIXTURE_BEGIN ] ) {
+    s << " Fixture( " << cmd[ CMD_USE_FIXTURE_BEGIN ]
+      << " .. " << cmd[ CMD_USE_FIXTURE_END ]
+      << " )" ;
+  }
+  if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) {
+    s << " Quadratic-Element" ;
+  }
+  if ( cmd[ CMD_USE_CUDA ] ) {
+    s << " CUDA(" << cmd[ CMD_USE_CUDA_DEV ] << ")" ;
+  }
+  if ( cmd[ CMD_USE_ROCM ] ) {
+    s << " ROCM" ;
+  }
+  if ( cmd[ CMD_USE_ATOMIC ] ) {
+    s << " ATOMIC" ;
+  }
+  if ( cmd[ CMD_USE_TRIALS ] ) {
+    s << " TRIALS(" << cmd[ CMD_USE_TRIALS ] << ")" ;
+  }
+  if ( cmd[ CMD_VTUNE ] ) {
+    s << " VTUNE" ;
+  }
+  if ( cmd[ CMD_PRINT ] ) {
+    s << " PRINT" ;
+  }
+  s << std::endl ;
+}
+
+void print_perf_value( std::ostream & s , const std::vector<size_t> & widths,  const Kokkos::Example::FENL::Perf & perf )
+{
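+  // Times are reported in milliseconds per global node; matvec and cg are
+  // additionally normalized by the CG iteration count.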
+  int i=0;
+  s << std::setw(widths[i++]) << perf.global_elem_count << " ,";
+  s << std::setw(widths[i++]) << perf.global_node_count << " ,";
+  s << std::setw(widths[i++]) << perf.newton_iter_count << " ,";
+  s << std::setw(widths[i++]) << perf.cg_iter_count << " ,";
+  s << std::setw(widths[i++]) << perf.map_ratio << " ,";
+  s << std::setw(widths[i++]) << ( perf.fill_node_set * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw(widths[i++]) << ( perf.scan_node_count * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw(widths[i++]) << ( perf.fill_graph_entries * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw(widths[i++]) << ( perf.sort_graph_entries * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw(widths[i++]) << ( perf.fill_element_graph * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw(widths[i++]) << ( perf.create_sparse_matrix * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw(widths[i++]) << ( perf.fill_time * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw(widths[i++]) << ( perf.bc_time * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw(widths[i++]) << ( ( perf.matvec_time * 1000.0 ) / perf.cg_iter_count ) / perf.global_node_count << " ,";
+  s << std::setw(widths[i++]) << ( ( perf.cg_time * 1000.0 ) / perf.cg_iter_count ) / perf.global_node_count << " ,";
+  s << std::setw(widths[i])   << perf.error_max;
+  s << std::endl ;
+}
+
+template< class Device , Kokkos::Example::BoxElemPart::ElemOrder ElemOrder >
+void run( MPI_Comm comm , const int cmd[] )
+{
+  int comm_rank = 0 ;
+
+#if defined( KOKKOS_ENABLE_MPI )
+  MPI_Comm_rank( comm , & comm_rank );
+#else
+  comm = 0 ;
+#endif
+
+
+  if ( 0 == comm_rank ) {
+    if ( cmd[ CMD_USE_THREADS ] ) { std::cout << "THREADS , " << cmd[ CMD_USE_THREADS ] ; }
+    else if ( cmd[ CMD_USE_OPENMP ] ) { std::cout << "OPENMP , " << cmd[ CMD_USE_OPENMP ] ; }
+    else if ( cmd[ CMD_USE_CUDA ] ) { std::cout << "CUDA" ; }
+    else if ( cmd[ CMD_USE_ROCM ] ) { std::cout << "ROCM" ; }
+
+    if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) { std::cout << " , QUADRATIC-ELEMENT" ; }
+    else { std::cout << " , LINEAR-ELEMENT" ; }
+
+    if ( cmd[ CMD_USE_ATOMIC ] ) { std::cout << " , USING ATOMICS" ; }
+  }
+
+  std::vector< std::pair<std::string,std::string> > headers;
+
+
+  headers.push_back(std::make_pair("ELEMS","count"));
+  headers.push_back(std::make_pair("NODES","count"));
+  headers.push_back(std::make_pair("NEWTON","iter"));
+  headers.push_back(std::make_pair("CG","iter"));
+  headers.push_back(std::make_pair("MAP_RATIO","ratio"));
+  headers.push_back(std::make_pair("SET_FILL/NODE","millisec"));
+  headers.push_back(std::make_pair("SCAN/NODE","millisec"));
+  headers.push_back(std::make_pair("GRAPH_FILL/NODE","millisec"));
+  headers.push_back(std::make_pair("SORT/NODE","millisec"));
+  headers.push_back(std::make_pair("ELEM_GRAPH_FILL/NODE","millisec"));
+  headers.push_back(std::make_pair("MATRIX_CREATE/NODE","millisec"));
+  headers.push_back(std::make_pair("MATRIX_FILL/NODE","millisec"));
+  headers.push_back(std::make_pair("BOUNDARY/NODE","millisec"));
+  headers.push_back(std::make_pair("MAT_VEC/ITER/ROW","millisec"));
+  headers.push_back(std::make_pair("CG/ITER/ROW","millisec"));
+  headers.push_back(std::make_pair("ERROR","ratio"));
+
+  // find print widths
+  size_t min_width = 10;
+  std::vector< size_t > widths(headers.size());
+  for (size_t i=0, ie=headers.size(); i<ie; ++i)
+    widths[i] = std::max(min_width, headers[i].first.size()+1);
+
+  // print column headers
+  if ( 0 == comm_rank ) {
+    std::cout << std::endl ;
+    for (size_t i=0; i<headers.size(); ++i)
+      std::cout << std::setw(widths[i]) << headers[i].first << " ,";
+    std::cout << "\b\b  " << std::endl;
+    for (size_t i=0; i<headers.size(); ++i)
+      std::cout << std::setw(widths[i]) << headers[i].second << " ,";
+    std::cout << "\b\b  " << std::endl;
+
+    std::cout << std::scientific;
+    std::cout.precision(3);
+  }
+
+  if ( cmd[ CMD_USE_FIXTURE_BEGIN ] ) {
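+    // Sweep the requested range of element counts, doubling each step; the
+    // n x (n+1) x 2n box keeps the total element count close to i.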
+    for ( int i = cmd[CMD_USE_FIXTURE_BEGIN] ; i < cmd[CMD_USE_FIXTURE_END] * 2 ; i *= 2 ) {
+      int nelem[3] ;
+      nelem[0] = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
+      nelem[1] = 1 + nelem[0] ;
+      nelem[2] = 2 * nelem[0] ;
+
+      const Kokkos::Example::FENL::Perf perf =
+        cmd[ CMD_USE_FIXTURE_QUADRATIC ]
+        ? Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemQuadratic >
+            ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
+        : Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemLinear >
+            ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
+        ;
+
+      if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf );
+    }
+  }
+  else {
+    int nelem[3] = { cmd[ CMD_USE_FIXTURE_X ] ,
+                     cmd[ CMD_USE_FIXTURE_Y ] ,
+                     cmd[ CMD_USE_FIXTURE_Z ] };
+
+    const Kokkos::Example::FENL::Perf perf =
+      cmd[ CMD_USE_FIXTURE_QUADRATIC ]
+      ? Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemQuadratic >
+          ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
+      : Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemLinear >
+          ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
+      ;
+
+    if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+int main( int argc , char ** argv )
+{
+  int comm_rank = 0 ;
+
+#if defined( KOKKOS_ENABLE_MPI )
+  MPI_Init( & argc , & argv );
+  MPI_Comm comm = MPI_COMM_WORLD ;
+  MPI_Comm_rank( comm , & comm_rank );
+#else
+  MPI_Comm comm = 0 ;
+  (void) comm ; // suppress warning
+#endif
+
+  int cmdline[ CMD_COUNT ] ;
+
+  for ( int i = 0 ; i < CMD_COUNT ; ++i ) cmdline[i] = 0 ;
+
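+  // Only rank 0 parses argv; the parsed options are broadcast to all ranks
+  // below.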
+  if ( 0 == comm_rank ) {
+    for ( int i = 1 ; i < argc ; ++i ) {
+      if ( 0 == strcasecmp( argv[i] , "threads" ) ) {
+        cmdline[ CMD_USE_THREADS ] = atoi( argv[++i] );
+      }
+      else if ( 0 == strcasecmp( argv[i] , "openmp" ) ) {
+        cmdline[ CMD_USE_OPENMP ] = atoi( argv[++i] );
+      }
+      else if ( 0 == strcasecmp( argv[i] , "cores" ) ) {
+        sscanf( argv[++i] , "%dx%d" ,
+                cmdline + CMD_USE_NUMA ,
+                cmdline + CMD_USE_CORE_PER_NUMA );
+      }
+      else if ( 0 == strcasecmp( argv[i] , "cuda" ) ) {
+        cmdline[ CMD_USE_CUDA ] = 1 ;
+      }
+      else if ( 0 == strcasecmp( argv[i] , "cuda-dev" ) ) {
+        cmdline[ CMD_USE_CUDA ] = 1 ;
+        cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ;
+      }
+      else if ( 0 == strcasecmp( argv[i] , "rocm" ) ) {
+        cmdline[ CMD_USE_ROCM ] = 1 ;
+      }
+      else if ( 0 == strcasecmp( argv[i] , "fixture" ) ) {
+        sscanf( argv[++i] , "%dx%dx%d" ,
+                cmdline + CMD_USE_FIXTURE_X ,
+                cmdline + CMD_USE_FIXTURE_Y ,
+                cmdline + CMD_USE_FIXTURE_Z );
+      }
+      else if ( 0 == strcasecmp( argv[i] , "fixture-range" ) ) {
+        sscanf( argv[++i] , "%d..%d" ,
+                cmdline + CMD_USE_FIXTURE_BEGIN ,
+                cmdline + CMD_USE_FIXTURE_END );
+      }
+      else if ( 0 == strcasecmp( argv[i] , "fixture-quadratic" ) ) {
+        cmdline[ CMD_USE_FIXTURE_QUADRATIC ] = 1 ;
+      }
+      else if ( 0 == strcasecmp( argv[i] , "atomic" ) ) {
+        cmdline[ CMD_USE_ATOMIC ] = 1 ;
+      }
+      else if ( 0 == strcasecmp( argv[i] , "trials" ) ) {
+        cmdline[ CMD_USE_TRIALS ] = atoi( argv[++i] ) ;
+      }
+      else if ( 0 == strcasecmp( argv[i] , "vtune" ) ) {
+        cmdline[ CMD_VTUNE ] = 1 ;
+      }
+      else if ( 0 == strcasecmp( argv[i] , "print" ) ) {
+        cmdline[ CMD_PRINT ] = 1 ;
+      }
+      else if ( 0 == strcasecmp( argv[i] , "echo" ) ) {
+        cmdline[ CMD_ECHO ] = 1 ;
+      }
+      else {
+        cmdline[ CMD_ERROR ] = 1 ;
+
+        std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ;
+      }
+    }
+
+    if ( cmdline[ CMD_ECHO ] && 0 == comm_rank ) { print_cmdline( std::cout , cmdline ); }
+  }
+
+#if defined( KOKKOS_ENABLE_MPI )
+  MPI_Bcast( cmdline , CMD_COUNT , MPI_INT , 0 , comm );
+#endif
+
+  if ( cmdline[ CMD_VTUNE ] ) {
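+    // Launch amplxe-cl in the background and attach it to this process by
+    // pid; the VTune install path below is site-specific.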
+    std::stringstream cmd;
+    pid_t my_os_pid=getpid();
+    const std::string vtune_loc =
+      "/usr/local/intel/vtune_amplifier_xe_2013/bin64/amplxe-cl";
+    const std::string output_dir = "./vtune/vtune.";
+    const int p_rank = comm_rank;
+    cmd << vtune_loc
+        << " -collect hotspots -result-dir " << output_dir << p_rank
+        << " -target-pid " << my_os_pid << " &";
+    if (p_rank == 0)
+      std::cout << cmd.str() << std::endl;
+    system(cmd.str().c_str());
+    system("sleep 10");
+  }
+
+  if ( ! cmdline[ CMD_ERROR ] && ! cmdline[ CMD_ECHO ] ) {
+
+    if ( ! cmdline[ CMD_USE_TRIALS ] ) { cmdline[ CMD_USE_TRIALS ] = 1 ; }
+
+    if ( ! cmdline[ CMD_USE_FIXTURE_X ] && ! cmdline[ CMD_USE_FIXTURE_BEGIN ] ) {
+      cmdline[ CMD_USE_FIXTURE_X ] = 2 ;
+      cmdline[ CMD_USE_FIXTURE_Y ] = 2 ;
+      cmdline[ CMD_USE_FIXTURE_Z ] = 2 ;
+    }
+
+#if defined( KOKKOS_ENABLE_THREADS )
+
+    if ( cmdline[ CMD_USE_THREADS ] ) {
+
+      if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) {
+        Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] ,
+                                     cmdline[ CMD_USE_NUMA ] ,
+                                     cmdline[ CMD_USE_CORE_PER_NUMA ] );
+      }
+      else {
+        Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] );
+      }
+
+      run< Kokkos::Threads , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
+
+      Kokkos::Threads::finalize();
+    }
+
+#endif
+
+#if defined( KOKKOS_ENABLE_OPENMP )
+
+    if ( cmdline[ CMD_USE_OPENMP ] ) {
+
+      if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) {
+        Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] ,
+                                     cmdline[ CMD_USE_NUMA ] ,
+                                     cmdline[ CMD_USE_CORE_PER_NUMA ] );
+      }
+      else {
+        Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] );
+      }
+
+      run< Kokkos::OpenMP , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
+
+      Kokkos::OpenMP::finalize();
+    }
+
+#endif
+
+#if defined( KOKKOS_ENABLE_CUDA )
+    if ( cmdline[ CMD_USE_CUDA ] ) {
+      // Use the requested CUDA device (default 0):
+
+      Kokkos::HostSpace::execution_space::initialize();
+      Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cmdline[ CMD_USE_CUDA_DEV ] ) );
+
+      run< Kokkos::Cuda , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
+
+      Kokkos::Cuda::finalize();
+      Kokkos::HostSpace::execution_space::finalize();
+    }
+
+#endif
+
+#if defined( KOKKOS_ENABLE_ROCM )
+    if ( cmdline[ CMD_USE_ROCM ] ) {
+      // Initialize the ROCm device:
+
+      Kokkos::HostSpace::execution_space::initialize();
+      Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice( cmdline[ CMD_USE_ROCM ] ) );
+
+      run< Kokkos::Experimental::ROCm , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
+
+      Kokkos::Experimental::ROCm::finalize();
+      Kokkos::HostSpace::execution_space::finalize();
+    }
+
+#endif
+
+  }
+
+#if defined( KOKKOS_ENABLE_MPI )
+  MPI_Finalize();
+#endif
+
+  return cmdline[ CMD_ERROR ] ? -1 : 0 ;
+}
+
diff --git a/packages/kokkos/example/fixture/BoxElemFixture.hpp b/packages/kokkos/example/fixture/BoxElemFixture.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..be2f03a27d5f52c0fe9acf0b55f736779679562a
--- /dev/null
+++ b/packages/kokkos/example/fixture/BoxElemFixture.hpp
@@ -0,0 +1,355 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_BOXELEMFIXTURE_HPP
+#define KOKKOS_EXAMPLE_BOXELEMFIXTURE_HPP
+
+#include <cstdio>
+#include <utility>
+
+#include <Kokkos_Core.hpp>
+
+#include <HexElement.hpp>
+#include <BoxElemPart.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+/** \brief  Map a grid onto a unit cube with smooth nonlinear grading
+ *          of the map.
+ */
+struct MapGridUnitCube {
+
+  const float m_a ;
+  const float m_b ;
+  const float m_c ;
+  const size_t m_max_x ;
+  const size_t m_max_y ;
+  const size_t m_max_z ;
+
+  MapGridUnitCube( const size_t grid_max_x ,
+                   const size_t grid_max_y ,
+                   const size_t grid_max_z ,
+                   const float bubble_x ,
+                   const float bubble_y ,
+                   const float bubble_z )
+    : m_a( bubble_x )
+    , m_b( bubble_y )
+    , m_c( bubble_z )
+    , m_max_x( grid_max_x )
+    , m_max_y( grid_max_y )
+    , m_max_z( grid_max_z )
+    {}
+
+  template< typename Scalar >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int grid_x ,
+                   int grid_y ,
+                   int grid_z ,
+                   Scalar & coord_x ,
+                   Scalar & coord_y ,
+                   Scalar & coord_z ) const
+    {
+      // Map to a unit cube [0,1]^3
+
+      const double x = double(grid_x) / double(m_max_x);
+      const double y = double(grid_y) / double(m_max_y);
+      const double z = double(grid_z) / double(m_max_z);
+
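+      // The bubble term x*x*(x-1)*(x-1) vanishes at both endpoints, so the
+      // cube boundary is preserved while interior grid lines are graded.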
+      coord_x = x + x * x * ( x - 1 ) * ( x - 1 ) * m_a ;
+      coord_y = y + y * y * ( y - 1 ) * ( y - 1 ) * m_b ;
+      coord_z = z + z * z * ( z - 1 ) * ( z - 1 ) * m_c ;
+    }
+};
+
+} // namespace Example
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+/** \brief  Generate a distributed unstructured finite element mesh
+ *          from a partitioned NX*NY*NZ box of elements.
+ *
+ *  Owned nodes are ordered first, followed by off-process nodes
+ *  grouped by owning process.
+ */
+template< class Device ,
+          BoxElemPart::ElemOrder Order ,
+          class CoordinateMap = MapGridUnitCube >
+class BoxElemFixture {
+public:
+
+  typedef Device execution_space ;
+
+  enum { SpaceDim = 3 };
+  enum { ElemNode = Order == BoxElemPart::ElemLinear ? 8 :
+                    Order == BoxElemPart::ElemQuadratic ? 27 : 0 };
+
+private:
+
+  typedef Kokkos::Example::HexElement_TensorData< ElemNode > hex_data ;
+
+  Kokkos::Example::BoxElemPart m_box_part ;
+  CoordinateMap                m_coord_map ;
+
+  Kokkos::View< double *[SpaceDim] , Device > m_node_coord ;
+  Kokkos::View< size_t *[SpaceDim] , Device > m_node_grid ;
+  Kokkos::View< size_t *[ElemNode] , Device > m_elem_node ;
+  Kokkos::View< size_t *[2] ,        Device > m_recv_node ;
+  Kokkos::View< size_t *[2] ,        Device > m_send_node ;
+  Kokkos::View< size_t * ,           Device > m_send_node_id ;
+
+  unsigned char m_elem_node_local[ ElemNode ][4] ;
+
+public:
+
+  typedef Kokkos::View< const size_t  * [ElemNode], Device > elem_node_type ;
+  typedef Kokkos::View< const double  * [SpaceDim], Device > node_coord_type ;
+  typedef Kokkos::View< const size_t  * [SpaceDim], Device > node_grid_type ;
+  typedef Kokkos::View< const size_t  * [2] , Device > comm_list_type ;
+  typedef Kokkos::View< const size_t  *     , Device > send_nodeid_type ;
+
+  inline bool ok() const { return m_box_part.ok(); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t node_count() const { return m_node_grid.extent(0); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t node_count_owned() const { return m_box_part.owns_node_count(); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t node_count_global() const { return m_box_part.global_node_count(); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t elem_count() const { return m_elem_node.extent(0); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t elem_count_global() const { return m_box_part.global_elem_count(); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t elem_node_local( size_t inode , int k ) const
+    { return m_elem_node_local[inode][k] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t node_grid( size_t inode , int iaxis ) const
+    { return m_node_grid(inode,iaxis); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t node_global_index( size_t local ) const
+    {
+      const size_t tmp_node_grid[SpaceDim] =
+        { m_node_grid(local,0) , m_node_grid(local,1) , m_node_grid(local,2) };
+      return m_box_part.global_node_id( tmp_node_grid );
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  double node_coord( size_t inode , int iaxis ) const
+    { return m_node_coord(inode,iaxis); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t node_grid_max( int iaxis ) const
+    { return m_box_part.global_coord_max(iaxis); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t elem_node( size_t ielem , size_t inode ) const
+    { return m_elem_node(ielem,inode); }
+
+  elem_node_type   elem_node()   const { return m_elem_node ; }
+  node_coord_type  node_coord()  const { return m_node_coord ; }
+  node_grid_type   node_grid()   const { return m_node_grid ; }
+  comm_list_type   recv_node()   const { return m_recv_node ; }
+  comm_list_type   send_node()   const { return m_send_node ; }
+  send_nodeid_type send_nodeid() const { return m_send_node_id ; }
+
+  KOKKOS_INLINE_FUNCTION
+  BoxElemFixture( const BoxElemFixture & rhs )
+    : m_box_part(   rhs.m_box_part )
+    , m_coord_map(  rhs.m_coord_map )
+    , m_node_coord( rhs.m_node_coord )
+    , m_node_grid(  rhs.m_node_grid )
+    , m_elem_node(  rhs.m_elem_node )
+    , m_recv_node(  rhs.m_recv_node )
+    , m_send_node(  rhs.m_send_node )
+    , m_send_node_id( rhs.m_send_node_id )
+    {
+      for ( int i = 0 ; i < ElemNode ; ++i ) {
+        m_elem_node_local[i][0] = rhs.m_elem_node_local[i][0] ;
+        m_elem_node_local[i][1] = rhs.m_elem_node_local[i][1] ;
+        m_elem_node_local[i][2] = rhs.m_elem_node_local[i][2] ;
+        m_elem_node_local[i][3] = 0 ;
+      }
+    }
+
+  BoxElemFixture & operator = ( const BoxElemFixture & rhs )
+    {
+      m_box_part      = rhs.m_box_part ;
+      m_coord_map     = rhs.m_coord_map ;
+      m_node_coord    = rhs.m_node_coord ;
+      m_node_grid     = rhs.m_node_grid ;
+      m_elem_node     = rhs.m_elem_node ;
+      m_recv_node     = rhs.m_recv_node ;
+      m_send_node     = rhs.m_send_node ;
+      m_send_node_id  = rhs.m_send_node_id ;
+
+      for ( int i = 0 ; i < ElemNode ; ++i ) {
+        m_elem_node_local[i][0] = rhs.m_elem_node_local[i][0] ;
+        m_elem_node_local[i][1] = rhs.m_elem_node_local[i][1] ;
+        m_elem_node_local[i][2] = rhs.m_elem_node_local[i][2] ;
+        m_elem_node_local[i][3] = 0 ;
+      }
+      return *this ;
+    }
+
+  BoxElemFixture( const BoxElemPart::Decompose decompose ,
+                  const size_t global_size ,
+                  const size_t global_rank ,
+                  const size_t elem_nx ,
+                  const size_t elem_ny ,
+                  const size_t elem_nz ,
+                  const float bubble_x = 1.1f ,
+                  const float bubble_y = 1.2f ,
+                  const float bubble_z = 1.3f )
+  : m_box_part( Order , decompose , global_size , global_rank , elem_nx , elem_ny , elem_nz )
+  , m_coord_map( m_box_part.global_coord_max(0) ,
+                 m_box_part.global_coord_max(1) ,
+                 m_box_part.global_coord_max(2) ,
+                 bubble_x ,
+                 bubble_y ,
+                 bubble_z )
+  , m_node_coord( "fixture_node_coord" , m_box_part.uses_node_count() )
+  , m_node_grid(  "fixture_node_grid" , m_box_part.uses_node_count() )
+  , m_elem_node(  "fixture_elem_node" , m_box_part.uses_elem_count() )
+  , m_recv_node(  "fixture_recv_node" , m_box_part.recv_node_msg_count() )
+  , m_send_node(  "fixture_send_node" , m_box_part.send_node_msg_count() )
+  , m_send_node_id( "fixture_send_node_id" , m_box_part.send_node_id_count() )
+  {
+    {
+      const hex_data elem_data ;
+
+      for ( int i = 0 ; i < ElemNode ; ++i ) {
+        m_elem_node_local[i][0] = elem_data.eval_map[i][0] ;
+        m_elem_node_local[i][1] = elem_data.eval_map[i][1] ;
+        m_elem_node_local[i][2] = elem_data.eval_map[i][2] ;
+        m_elem_node_local[i][3] = 0 ;
+      }
+    }
+
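+    // A single parallel_for initializes every view: nwork is the largest
+    // extent among them and operator() guards each section by index range.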
+    const size_t nwork =
+      std::max( m_recv_node.extent(0) ,
+      std::max( m_send_node.extent(0) ,
+      std::max( m_send_node_id.extent(0) ,
+      std::max( m_node_grid.extent(0) ,
+                m_elem_node.extent(0) * m_elem_node.extent(1) ))));
+
+    Kokkos::parallel_for( nwork , *this );
+  }
+
+
+  // Initialization:
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_t i ) const
+  {
+    if ( i < m_elem_node.extent(0) * m_elem_node.extent(1) ) {
+
+      const size_t ielem = i / ElemNode ;
+      const size_t inode = i % ElemNode ;
+
+      size_t elem_grid[SpaceDim] ;
+      size_t tmp_node_grid[SpaceDim] ;
+
+      m_box_part.uses_elem_coord( ielem , elem_grid );
+
+      enum { elem_node_scale = Order == BoxElemPart::ElemLinear ? 1 :
+                               Order == BoxElemPart::ElemQuadratic ? 2 : 0 };
+
+      tmp_node_grid[0] = elem_node_scale * elem_grid[0] + m_elem_node_local[inode][0] ;
+      tmp_node_grid[1] = elem_node_scale * elem_grid[1] + m_elem_node_local[inode][1] ;
+      tmp_node_grid[2] = elem_node_scale * elem_grid[2] + m_elem_node_local[inode][2] ;
+
+      m_elem_node(ielem,inode) = m_box_part.local_node_id( tmp_node_grid );
+    }
+
+    if ( i < m_node_grid.extent(0) ) {
+      size_t tmp_node_grid[SpaceDim] ;
+      m_box_part.local_node_coord( i , tmp_node_grid );
+      m_node_grid(i,0) = tmp_node_grid[0] ;
+      m_node_grid(i,1) = tmp_node_grid[1] ;
+      m_node_grid(i,2) = tmp_node_grid[2] ;
+
+      m_coord_map( tmp_node_grid[0] ,
+                   tmp_node_grid[1] ,
+                   tmp_node_grid[2] ,
+                   m_node_coord(i,0) ,
+                   m_node_coord(i,1) ,
+                   m_node_coord(i,2) );
+    }
+
+    if ( i < m_recv_node.extent(0) ) {
+      m_recv_node(i,0) = m_box_part.recv_node_rank(i);
+      m_recv_node(i,1) = m_box_part.recv_node_count(i);
+    }
+
+    if ( i < m_send_node.extent(0) ) {
+      m_send_node(i,0) = m_box_part.send_node_rank(i);
+      m_send_node(i,1) = m_box_part.send_node_count(i);
+    }
+
+    if ( i < m_send_node_id.extent(0) ) {
+      m_send_node_id(i) = m_box_part.send_node_id(i);
+    }
+  }
+};
+
+} // namespace Example
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_EXAMPLE_BOXELEMFIXTURE_HPP */
+
diff --git a/packages/kokkos/example/fixture/BoxElemPart.cpp b/packages/kokkos/example/fixture/BoxElemPart.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7b85936da7f01b61fc4e6223df986442d3631c97
--- /dev/null
+++ b/packages/kokkos/example/fixture/BoxElemPart.cpp
@@ -0,0 +1,413 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <utility>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <limits>
+#include <BoxElemPart.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+void box_partition( const size_t global_size ,
+                    const size_t global_rank ,
+                    const size_t global_box[][2] ,
+                          size_t box[][2] )
+{
+  box[0][0] = global_box[0][0] ; box[0][1] = global_box[0][1] ;
+  box[1][0] = global_box[1][0] ; box[1][1] = global_box[1][1] ;
+  box[2][0] = global_box[2][0] ; box[2][1] = global_box[2][1] ;
+
+  size_t ip = 0 ;
+  size_t np = global_size ;
+
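+  // Recursively split the rank range [ip,ip+np): choose a split count
+  // (5, 3, or 2), slice the longest axis of the box proportionally, and
+  // keep the slice containing global_rank until only this rank remains.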
+  while ( 1 < np ) {
+
+    // P = [ ip + j * portion , ip + ( j + 1 ) * portion )
+
+    size_t jip , jup ;
+
+    {
+      const size_t part = ( 0 == ( np % 5 ) ) ? 5 : (
+                          ( 0 == ( np % 3 ) ) ? 3 : 2 );
+
+      const size_t portion = np / part ;
+
+      if ( 2 < part || global_rank < ip + portion ) {
+        jip = portion * size_t( double( global_rank - ip ) / double(portion) );
+        jup = jip + portion ;
+      }
+      else {
+        jip = portion ;
+        jup = np ;
+      }
+    }
+
+    // Choose axis with largest count:
+
+    const size_t nb[3] = {
+      box[0][1] - box[0][0] ,
+      box[1][1] - box[1][0] ,
+      box[2][1] - box[2][0] };
+
+    const int axis = nb[2] > nb[1] ? ( nb[2] > nb[0] ? 2 : 0 )
+                                   : ( nb[1] > nb[0] ? 1 : 0 );
+
+    box[ axis ][1] = box[ axis ][0] + size_t( double(nb[axis]) * ( double(jup) / double(np) ));
+    box[ axis ][0] = box[ axis ][0] + size_t( double(nb[axis]) * ( double(jip) / double(np) ));
+
+    np = jup - jip ;
+    ip = ip + jip ;
+  }
+}
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+void BoxElemPart::local( const size_t  rank ,
+                               size_t  uses_elem[][2] ,
+                               size_t  owns_node[][2] ,
+                               size_t  uses_node[][2] ) const
+{
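+  // DecomposeElem partitions the global element box and derives the owned
+  // nodes; otherwise the global vertex box is partitioned and the used
+  // elements are derived from the owned nodes.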
+  if ( BoxElemPart::DecomposeElem == m_decompose ) {
+
+    Kokkos::Example::box_partition( m_global_size , rank , m_global_elem_box , uses_elem );
+
+    for ( int i = 0 ; i < 3 ; ++i ) {
+      owns_node[i][0] = uses_elem[i][0] ;
+      owns_node[i][1] = uses_elem[i][1] + ( m_global_elem_box[i][1] == uses_elem[i][1] ? 1 : 0 );
+    }
+  }
+  else {
+
+    const size_t global_vert[3][2] =
+      { { 0 , m_global_elem_box[0][1] + 1 },
+        { 0 , m_global_elem_box[1][1] + 1 },
+        { 0 , m_global_elem_box[2][1] + 1 } };
+
+    Kokkos::Example::box_partition( m_global_size , rank , global_vert , owns_node );
+
+    for ( int i = 0 ; i < 3 ; ++i ) {
+      uses_elem[i][0] = global_vert[i][0] == owns_node[i][0] ? owns_node[i][0] : owns_node[i][0] - 1 ;
+      uses_elem[i][1] = global_vert[i][1] == owns_node[i][1] ? owns_node[i][1] - 1 : owns_node[i][1] ;
+    }
+  }
+
+  for ( int i = 0 ; i < 3 ; ++i ) {
+    uses_node[i][0] = uses_elem[i][0] ;
+    uses_node[i][1] = uses_elem[i][1] + 1 ;
+  }
+
+  if ( BoxElemPart::ElemQuadratic == m_elem_order ) {
+    for ( int i = 0 ; i < 3 ; ++i ) {
+      owns_node[i][0] = 2 * owns_node[i][0] ;
+      uses_node[i][0] = 2 * uses_node[i][0] ;
+      owns_node[i][1] = 2 * owns_node[i][1] - 1 ;
+      uses_node[i][1] = 2 * uses_node[i][1] - 1 ;
+    }
+  }
+}
+
+BoxElemPart::BoxElemPart(
+  const BoxElemPart::ElemOrder elem_order ,
+  const BoxElemPart::Decompose decompose ,
+  const size_t global_size ,
+  const size_t global_rank ,
+  const size_t elem_nx ,
+  const size_t elem_ny ,
+  const size_t elem_nz )
+{
+  m_global_size = global_size ;
+  m_global_rank = global_rank ;
+  m_decompose   = decompose ;
+  m_elem_order  = elem_order ;
+
+  m_global_elem_box[0][0] = 0 ; m_global_elem_box[0][1] = elem_nx ;
+  m_global_elem_box[1][0] = 0 ; m_global_elem_box[1][1] = elem_ny ;
+  m_global_elem_box[2][0] = 0 ; m_global_elem_box[2][1] = elem_nz ;
+
+  m_global_node_box[0][0] = 0 ; m_global_node_box[0][1] = 0 ;
+  m_global_node_box[1][0] = 0 ; m_global_node_box[1][1] = 0 ;
+  m_global_node_box[2][0] = 0 ; m_global_node_box[2][1] = 0 ;
+
+  m_owns_node_count = 0 ;
+  m_send_node_count = 0 ;
+
+  m_ok = true ;
+
+  //----------------------------------------
+
+  if ( ElemLinear == elem_order ) {
+    m_global_node_box[0][1] = elem_nx + 1 ;
+    m_global_node_box[1][1] = elem_ny + 1 ;
+    m_global_node_box[2][1] = elem_nz + 1 ;
+  }
+  else if ( ElemQuadratic == elem_order ) {
+    m_global_node_box[0][1] = 2 * elem_nx + 1 ;
+    m_global_node_box[1][1] = 2 * elem_ny + 1 ;
+    m_global_node_box[2][1] = 2 * elem_nz + 1 ;
+  }
+
+  //----------------------------------------
+
+  local( m_global_rank , m_uses_elem_box , m_owns_node_box[0] , m_uses_node_box );
+
+  const size_t global_node_count_ = Kokkos::Example::box_count( m_global_node_box );
+  const size_t global_elem_count_ = Kokkos::Example::box_count( m_global_elem_box );
+
+  //----------------------------------------
+
+  size_t elem_count = Kokkos::Example::box_count( m_uses_elem_box );
+  size_t node_count = Kokkos::Example::box_count( m_owns_node_box[0] );
+
+  m_owns_node[0][0] = global_rank ;
+  m_owns_node[0][1] = node_count ;
+  m_owns_node_count = 1 ;
+  m_send_node_count = 0 ;
+
+  for ( size_t rr = 1 ; rr < m_global_size && m_ok ; ++rr ) {
+
+    const size_t rank = ( m_global_rank + rr ) % m_global_size ;
+
+    size_t elem_box[3][2] , o_node_box[3][2] , u_node_box[3][2] ;
+
+    // Boxes for process 'rank'
+    local( rank , elem_box , o_node_box , u_node_box );
+
+    // Box that this process uses but is owned by process 'rank'
+    Kokkos::Example::box_intersect( m_owns_node_box[ m_owns_node_count ] , m_uses_node_box , o_node_box );
+
+    m_owns_node[ m_owns_node_count ][1] = Kokkos::Example::box_count( m_owns_node_box[ m_owns_node_count ] );
+
+    if ( m_owns_node[ m_owns_node_count ][1] ) {
+
+      if ( ( PROC_NEIGH_MAX - 1 ) <= m_owns_node_count ) {
+        std::cout << "BoxElemPart exceeded maximum neighbor count" << std::endl ;
+        m_ok = false ;
+        break ;
+      }
+
+      m_owns_node[ m_owns_node_count ][0] = rank ;
+
+      ++m_owns_node_count ;
+    }
+
+    // Box that this process owns and is used by process 'rank'
+    Kokkos::Example::box_intersect( m_send_node_box[ m_send_node_count ] , m_owns_node_box[0] , u_node_box );
+
+    m_send_node[ m_send_node_count ][1] = Kokkos::Example::box_count( m_send_node_box[ m_send_node_count ] );
+
+    if ( m_send_node[ m_send_node_count ][1] ) {
+
+      if ( ( PROC_NEIGH_MAX - 1 ) <= m_send_node_count ) {
+        std::cout << "BoxElemPart exceeded maximum neighbor count" << std::endl ;
+        m_ok = false ;
+        break ;
+      }
+
+      m_send_node[ m_send_node_count ][0] = rank ;
+      ++m_send_node_count ;
+    }
+
+    // Error checking:
+
+    size_t test_box[3][2] ;
+
+    elem_count += Kokkos::Example::box_count( elem_box );
+    node_count += Kokkos::Example::box_count( o_node_box );
+
+    {
+      Kokkos::Example::box_intersect( test_box , m_owns_node_box[0] , o_node_box );
+
+      if ( Kokkos::Example::box_count( test_box ) ) {
+        std::cout << "Box partitioning error" << std::endl ;
+        std::cout << "owns_node[" << m_global_rank << "]{"
+                  << " [" << m_owns_node_box[0][0][0] << "," << m_owns_node_box[0][0][1] << ")"
+                  << " [" << m_owns_node_box[0][1][0] << "," << m_owns_node_box[0][1][1] << ")"
+                  << " [" << m_owns_node_box[0][2][0] << "," << m_owns_node_box[0][2][1] << ")"
+                  << "} intersects"
+                  << " owns_node[" << rank << "]{"
+                  << " [" << o_node_box[0][0] << "," << o_node_box[0][1] << ")"
+                  << " [" << o_node_box[1][0] << "," << o_node_box[1][1] << ")"
+                  << " [" << o_node_box[2][0] << "," << o_node_box[2][1] << ")"
+                  << "}" << std::endl ;
+        m_ok = false ;
+        break ;
+      }
+    }
+
+    if ( DecomposeElem == decompose ) {
+
+      Kokkos::Example::box_intersect( test_box , m_uses_elem_box , elem_box );
+
+      if ( Kokkos::Example::box_count( test_box ) ) {
+        std::cout << "Box partitioning error" << std::endl ;
+        std::cout << "ElemBox[" << m_global_rank << "]{"
+                  << " [" << m_uses_elem_box[0][0] << "," << m_uses_elem_box[0][1] << ")"
+                  << " [" << m_uses_elem_box[1][0] << "," << m_uses_elem_box[1][1] << ")"
+                  << " [" << m_uses_elem_box[2][0] << "," << m_uses_elem_box[2][1] << ")"
+                  << "} intersects"
+                  << " ElemBox[" << rank << "]{"
+                  << " [" << elem_box[0][0] << "," << elem_box[0][1] << ")"
+                  << " [" << elem_box[1][0] << "," << elem_box[1][1] << ")"
+                  << " [" << elem_box[2][0] << "," << elem_box[2][1] << ")"
+                  << "}" << std::endl ;
+        m_ok = false ;
+        break ;
+      }
+    }
+  }
+
+  // Sentinel values at the end of the owns and send lists:
+
+  m_owns_node[ m_owns_node_count ][0] = ~0u ;
+  m_owns_node[ m_owns_node_count ][1] = ~0u ;
+  m_owns_node_box[ m_owns_node_count ][0][0] = 0u ; m_owns_node_box[ m_owns_node_count ][0][1] = ~0u ;
+  m_owns_node_box[ m_owns_node_count ][1][0] = 0u ; m_owns_node_box[ m_owns_node_count ][1][1] = ~0u ;
+  m_owns_node_box[ m_owns_node_count ][2][0] = 0u ; m_owns_node_box[ m_owns_node_count ][2][1] = ~0u ;
+
+  m_send_node[ m_send_node_count ][0] = ~0u ;
+  m_send_node[ m_send_node_count ][1] = ~0u ;
+  m_send_node_box[ m_send_node_count ][0][0] = 0u ; m_send_node_box[ m_send_node_count ][0][1] = ~0u ;
+  m_send_node_box[ m_send_node_count ][1][0] = 0u ; m_send_node_box[ m_send_node_count ][1][1] = ~0u ;
+  m_send_node_box[ m_send_node_count ][2][0] = 0u ; m_send_node_box[ m_send_node_count ][2][1] = ~0u ;
+
+  {
+    size_t count = 0 ;
+    for ( size_t i = 0 ; i < m_owns_node_count ; ++i ) {
+      count += m_owns_node[i][1] ;
+    }
+    if ( count != Kokkos::Example::box_count( m_uses_node_box ) ) {
+      std::cout << "Node uses count = " << Kokkos::Example::box_count( m_uses_node_box )
+                << " error count = " << count << std::endl ;
+      m_ok = false ;
+    }
+  }
+
+  if ( global_node_count_ != node_count ) {
+    std::cout << "Node count = " << global_node_count_ << " overlap error count = " << node_count << std::endl ;
+    m_ok = false ;
+  }
+
+  if ( DecomposeElem == decompose && global_elem_count_ != elem_count ) {
+    std::cout << "Elem count = " << global_elem_count_ << " overlap error count = " << elem_count << std::endl ;
+    m_ok = false ;
+  }
+
+  if ( ! m_ok ) {
+    for ( int i = 0 ; i < 3 ; ++i ) { for ( int j = 0 ; j < 2 ; ++j ) {
+      m_global_elem_box[i][j] = 0 ;
+      m_global_node_box[i][j] = 0 ;
+      m_uses_elem_box[i][j] = 0 ;
+      m_uses_node_box[i][j] = 0 ;
+    }}
+    m_owns_node_count = 0 ;
+    m_send_node_count = 0 ;
+  }
+}
+
+void BoxElemPart::print( std::ostream & s ) const
+{
+  s << "BoxElemPart P[" << m_global_rank << ":" << m_global_size << "]"
+    << std::endl
+    << "  elem_box {"
+    << " [" << m_uses_elem_box[0][0] << "," << m_uses_elem_box[0][1] << ")"
+    << " [" << m_uses_elem_box[1][0] << "," << m_uses_elem_box[1][1] << ")"
+    << " [" << m_uses_elem_box[2][0] << "," << m_uses_elem_box[2][1] << ")"
+    << " } / {"
+    << " [" << m_global_elem_box[0][0] << "," << m_global_elem_box[0][1] << ")"
+    << " [" << m_global_elem_box[1][0] << "," << m_global_elem_box[1][1] << ")"
+    << " [" << m_global_elem_box[2][0] << "," << m_global_elem_box[2][1] << ")"
+    << " }"
+    << std::endl
+    << "  node_box {"
+    << " [" << m_owns_node_box[0][0][0] << "," << m_owns_node_box[0][0][1] << ")"
+    << " [" << m_owns_node_box[0][1][0] << "," << m_owns_node_box[0][1][1] << ")"
+    << " [" << m_owns_node_box[0][2][0] << "," << m_owns_node_box[0][2][1] << ")"
+    << " } / {"
+    << " [" << m_uses_node_box[0][0] << "," << m_uses_node_box[0][1] << ")"
+    << " [" << m_uses_node_box[1][0] << "," << m_uses_node_box[1][1] << ")"
+    << " [" << m_uses_node_box[2][0] << "," << m_uses_node_box[2][1] << ")"
+    << " } / {"
+    << " [" << m_global_node_box[0][0] << "," << m_global_node_box[0][1] << ")"
+    << " [" << m_global_node_box[1][0] << "," << m_global_node_box[1][1] << ")"
+    << " [" << m_global_node_box[2][0] << "," << m_global_node_box[2][1] << ")"
+    << " }"
+    << std::endl ;
+
+  for ( size_t i = 1 ; i < m_owns_node_count ; ++i ) {
+    s << "  P[" << m_owns_node[i][0] << "]"
+      << " recv node_box {"
+      << " [" << m_owns_node_box[i][0][0] << "," << m_owns_node_box[i][0][1] << ")"
+      << " [" << m_owns_node_box[i][1][0] << "," << m_owns_node_box[i][1][1] << ")"
+      << " [" << m_owns_node_box[i][2][0] << "," << m_owns_node_box[i][2][1] << ")"
+      << " }"
+      << std::endl ;
+  }
+
+  for ( size_t i = 0 ; i < m_send_node_count ; ++i ) {
+    s << "  P[" << m_send_node[i][0] << "]"
+      << " send node_box {"
+      << " [" << m_send_node_box[i][0][0] << "," << m_send_node_box[i][0][1] << ")"
+      << " [" << m_send_node_box[i][1][0] << "," << m_send_node_box[i][1][1] << ")"
+      << " [" << m_send_node_box[i][2][0] << "," << m_send_node_box[i][2][1] << ")"
+      << " }"
+      << std::endl ;
+  }
+}
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+
diff --git a/packages/kokkos/example/fixture/BoxElemPart.hpp b/packages/kokkos/example/fixture/BoxElemPart.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6cd91c6f6f6e5b5063c8f819d9d0b71350a148d3
--- /dev/null
+++ b/packages/kokkos/example/fixture/BoxElemPart.hpp
@@ -0,0 +1,320 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BOXELEMPART_HPP
+#define KOKKOS_BOXELEMPART_HPP
+
+#include <utility>
+#include <ostream>
+#include <Kokkos_Macros.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+KOKKOS_INLINE_FUNCTION
+void box_intersect( size_t box[][2] ,
+                    const size_t boxA[][2] ,
+                    const size_t boxB[][2] )
+{
+  for ( int i = 0 ; i < 3 ; ++i ) {
+    box[i][0] = boxA[i][0] > boxB[i][0] ? boxA[i][0] : boxB[i][0] ;
+    box[i][1] = boxA[i][1] < boxB[i][1] ? boxA[i][1] : boxB[i][1] ;
+    if ( box[i][0] > box[i][1] ) box[i][1] = box[i][0] ;
+  }
+}
+
+KOKKOS_INLINE_FUNCTION
+size_t box_count( const size_t box[][2] )
+{
+  return size_t( box[0][1] - box[0][0] ) *
+         size_t( box[1][1] - box[1][0] ) *
+         size_t( box[2][1] - box[2][0] );
+}
+
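+// Expand local_box by ghost_layer cells in each direction, clamped to global_box.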
+KOKKOS_INLINE_FUNCTION
+void box_ghost_layer( const size_t global_box[][2] ,
+                      const size_t local_box[][2] ,
+                      const size_t ghost_layer ,
+                            size_t ghost_box[][2] )
+{
+  for ( int i = 0 ; i < 3 ; ++i ) {
+    ghost_box[i][0] = global_box[i][0] + ghost_layer > local_box[i][0] ? global_box[i][0] : local_box[i][0] - ghost_layer ;
+    ghost_box[i][1] = global_box[i][1] < local_box[i][1] + ghost_layer ? global_box[i][1] : local_box[i][1] + ghost_layer ;
+  }
+}
+
+void box_partition( const size_t global_size ,
+                    const size_t global_rank ,
+                    const size_t global_box[][2] ,
+                          size_t box[][2] );
+
+} // namespace Example
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+/** \brief Partition a box of hexahedral elements among subdomains.
+ *
+ *  Nodes are ordered locally as follows:
+ *    { owned_by[ this_process ] ,
+ *      owned_by[ neighbor_process[0] ] ,
+ *      owned_by[ neighbor_process[1] ] ,
+ *      owned_by[ neighbor_process[2] ] ,
+ *      ... };
+ */
+class BoxElemPart {
+public:
+
+  enum Decompose { DecomposeNode , DecomposeElem };
+  enum ElemOrder { ElemLinear , ElemQuadratic };
+
+  bool ok() const { return m_ok ; }
+
+  BoxElemPart( const ElemOrder elem_order ,
+               const Decompose decompose ,
+               const size_t global_size ,
+               const size_t global_rank ,
+               const size_t elem_nx ,
+               const size_t elem_ny ,
+               const size_t elem_nz );
+
+  KOKKOS_INLINE_FUNCTION
+  size_t global_elem_count() const
+    { return Kokkos::Example::box_count( m_global_elem_box ); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t global_node_count() const
+    { return Kokkos::Example::box_count( m_global_node_box ); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t uses_elem_count() const
+    { return Kokkos::Example::box_count( m_uses_elem_box ); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t owns_node_count() const
+    { return Kokkos::Example::box_count( m_owns_node_box[0] ); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t uses_node_count() const
+    { return Kokkos::Example::box_count( m_uses_node_box ); }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  size_t uses_elem_offset( const size_t ix ,
+                           const size_t iy ,
+                           const size_t iz ) const
+  {
+    return size_t( ix - m_uses_elem_box[0][0] ) + size_t( m_uses_elem_box[0][1] - m_uses_elem_box[0][0] ) * (
+           size_t( iy - m_uses_elem_box[1][0] ) + size_t( m_uses_elem_box[1][1] - m_uses_elem_box[1][0] ) * (
+           size_t( iz - m_uses_elem_box[2][0] ) ) );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void uses_elem_coord( size_t lid , size_t c[] ) const
+  {
+    const size_t nx = m_uses_elem_box[0][1] - m_uses_elem_box[0][0] ;
+    const size_t ny = m_uses_elem_box[1][1] - m_uses_elem_box[1][0] ;
+
+    c[0] = m_uses_elem_box[0][0] + lid % nx ; lid /= nx ;
+    c[1] = m_uses_elem_box[1][0] + lid % ny ; lid /= ny ;
+    c[2] = m_uses_elem_box[2][0] + lid ;
+  }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  size_t global_coord_max( size_t axis ) const
+  { return m_global_node_box[axis][1] - 1 ; }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  void local_node_coord( size_t lid , size_t coord[] ) const
+  {
+    // Local id within an 'owns' block (has sentinel)
+    size_t j = 0 ;
+    while ( m_owns_node[j][1] <= lid ) { lid -= m_owns_node[j][1] ; ++j ; }
+
+    // Map to global coordinates:
+    const size_t nx = m_owns_node_box[j][0][1] - m_owns_node_box[j][0][0] ;
+    const size_t ny = m_owns_node_box[j][1][1] - m_owns_node_box[j][1][0] ;
+
+    coord[0] = m_owns_node_box[j][0][0] + lid % nx ; lid /= nx ;
+    coord[1] = m_owns_node_box[j][1][0] + lid % ny ; lid /= ny ;
+    coord[2] = m_owns_node_box[j][2][0] + lid ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t local_node_id( const size_t c[] ) const
+  {
+    // Find which 'owns' block and accumulate the offset of this block:
+    size_t lid = 0 ;
+    size_t j = 0 ;
+    while ( ! ( m_owns_node_box[j][0][0] <= c[0] && c[0] < m_owns_node_box[j][0][1] &&
+                m_owns_node_box[j][1][0] <= c[1] && c[1] < m_owns_node_box[j][1][1] &&
+                m_owns_node_box[j][2][0] <= c[2] && c[2] < m_owns_node_box[j][2][1] ) ) {
+      
+      lid += m_owns_node[j][1] ;
+      ++j ;
+    }
+
+    // Map offset to the block plus offset within the block:
+    return lid +
+           size_t( c[0] - m_owns_node_box[j][0][0] ) + size_t( m_owns_node_box[j][0][1] - m_owns_node_box[j][0][0] ) * (
+           size_t( c[1] - m_owns_node_box[j][1][0] ) + size_t( m_owns_node_box[j][1][1] - m_owns_node_box[j][1][0] ) * (
+           size_t( c[2] - m_owns_node_box[j][2][0] ) ) );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t global_node_id( const size_t c[] ) const
+  {
+    return size_t( c[0] - m_global_node_box[0][0] ) + size_t( m_global_node_box[0][1] - m_global_node_box[0][0] ) * (
+           size_t( c[1] - m_global_node_box[1][0] ) + size_t( m_global_node_box[1][1] - m_global_node_box[1][0] ) * (
+           size_t( c[2] - m_global_node_box[2][0] ) ) );
+  }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  size_t recv_node_msg_count() const { return m_owns_node_count - 1 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t recv_node_rank(  size_t msg ) const { return m_owns_node[msg+1][0] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t recv_node_count( size_t msg ) const { return m_owns_node[msg+1][1] ; }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  size_t send_node_msg_count() const { return m_send_node_count ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t send_node_rank(  size_t msg ) const { return m_send_node[msg][0] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t send_node_count( size_t msg ) const { return m_send_node[msg][1] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t send_node_id_count() const
+  {
+    size_t count = 0 ;
+    for ( size_t i = 0 ; i < m_send_node_count ; ++i ) {
+      count += m_send_node[i][1] ;
+    }
+    return count ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t send_node_id( size_t item ) const
+  {
+    // Find which send list this send item is in:
+    size_t j = 0 ;
+    while ( m_send_node[j][1] <= item ) { item -= m_send_node[j][1] ; ++j ; }
+
+    // Map to global coordinate:
+    const size_t nx = m_send_node_box[j][0][1] - m_send_node_box[j][0][0] ;
+    const size_t ny = m_send_node_box[j][1][1] - m_send_node_box[j][1][0] ;
+
+    size_t c[3] ;
+
+    c[0] = m_send_node_box[j][0][0] + item % nx ; item /= nx ;
+    c[1] = m_send_node_box[j][1][0] + item % ny ; item /= ny ;
+    c[2] = m_send_node_box[j][2][0] + item ;
+
+    // Map to local id:
+    return size_t( c[0] - m_owns_node_box[0][0][0] ) + size_t( m_owns_node_box[0][0][1] - m_owns_node_box[0][0][0] ) * (
+           size_t( c[1] - m_owns_node_box[0][1][0] ) + size_t( m_owns_node_box[0][1][1] - m_owns_node_box[0][1][0] ) * (
+           size_t( c[2] - m_owns_node_box[0][2][0] ) ) );
+  }
+
+  //----------------------------------------
+
+  void print( std::ostream & s ) const ;
+
+private:
+
+  // Maximum number of processes in a neighborhood, including this process
+  enum { PROC_NEIGH_MAX = 64 };
+
+  void local( const size_t  rank ,
+                    size_t  uses_elem[][2] ,
+                    size_t  owns_node[][2] ,
+                    size_t  uses_node[][2] ) const ;
+
+  size_t  m_global_size ;
+  size_t  m_global_rank ;
+
+  Decompose m_decompose ;
+  ElemOrder m_elem_order ;
+
+  size_t m_global_elem_box[3][2] ;
+  size_t m_global_node_box[3][2] ;
+  size_t m_uses_elem_box[3][2] ;
+  size_t m_uses_node_box[3][2] ;
+
+  // [ processor rank , count ]
+  size_t m_owns_node_box[ PROC_NEIGH_MAX ][3][2] ;
+  size_t m_owns_node[     PROC_NEIGH_MAX ][2] ;
+  size_t m_owns_node_count ;
+
+  size_t m_send_node_box[ PROC_NEIGH_MAX ][3][2] ;
+  size_t m_send_node[     PROC_NEIGH_MAX ][2] ;
+  size_t m_send_node_count ;
+
+  bool   m_ok ;
+};
+
+} // namespace Example
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_BOXELEMPART_HPP */
+
diff --git a/packages/kokkos/example/fixture/CMakeLists.txt b/packages/kokkos/example/fixture/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..298c54c5bb3e00bf5ecaf5ad18e53de2ba405272
--- /dev/null
+++ b/packages/kokkos/example/fixture/CMakeLists.txt
@@ -0,0 +1,13 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../common)
+
+SET(SOURCES_TEST Main.cpp TestFixture.cpp BoxElemPart.cpp )
+
+# Automatically picks up 'kokkosexample_fixture'
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  TestFixture
+  SOURCES ${SOURCES_TEST}
+  )
+
diff --git a/packages/kokkos/example/fixture/HexElement.hpp b/packages/kokkos/example/fixture/HexElement.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..72bc45fd21d81018cdd6af3cdabb49a979bf4d46
--- /dev/null
+++ b/packages/kokkos/example/fixture/HexElement.hpp
@@ -0,0 +1,270 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HEXELEMENT_HPP
+#define KOKKOS_HEXELEMENT_HPP
+
+namespace Kokkos {
+namespace Example {
+
+template< unsigned NodeCount >
+class HexElement_TensorData ;
+
+template< unsigned NodeCount , class Device >
+class HexElement_TensorEval ;
+
+//----------------------------------------------------------------------------
+/** \brief  Evaluate Hex element on interval [-1,1]^3 */
+template<>
+class HexElement_TensorData< 8 > {
+public:
+
+  static const unsigned element_node_count    = 8 ;
+  static const unsigned spatial_dimension     = 3 ;
+  static const unsigned integration_count_1d  = 2 ;
+  static const unsigned function_count_1d     = 2 ;
+
+  float values_1d [ function_count_1d ][ integration_count_1d ];
+  float derivs_1d [ function_count_1d ][ integration_count_1d ];
+  float weights_1d[ integration_count_1d ];
+
+  unsigned char eval_map[ element_node_count ][4] ;
+
+  static float eval_value_1d( const unsigned jf , const float x )
+  {
+    return 0 == jf ? 0.5 * ( 1.0 - x ) : (
+           1 == jf ? 0.5 * ( 1.0 + x ) : 0 );
+  }
+
+  static float eval_deriv_1d( const unsigned jf , const float )
+  {
+    return 0 == jf ? -0.5 : (
+           1 == jf ?  0.5 : 0 );
+  }
+
+  HexElement_TensorData()
+  {
+    const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
+      { { 0 , 0 , 0 },
+        { 1 , 0 , 0 },
+        { 1 , 1 , 0 },
+        { 0 , 1 , 0 },
+        { 0 , 0 , 1 },
+        { 1 , 0 , 1 },
+        { 1 , 1 , 1 },
+        { 0 , 1 , 1 } };
+
+    weights_1d[0] = 1 ;
+    weights_1d[1] = 1 ;
+
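+    // Two-point Gauss-Legendre rule on [-1,1]: points at +-1/sqrt(3),
+    // both with unit weight.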
+    const float points_1d[ integration_count_1d ] =
+      { -0.577350269 , 0.577350269 };
+
+    for ( unsigned i = 0 ; i < element_node_count ; ++i ) {
+      eval_map[i][0] = tmp_map[i][0];
+      eval_map[i][1] = tmp_map[i][1];
+      eval_map[i][2] = tmp_map[i][2];
+    }
+
+    for ( unsigned xp = 0 ; xp < integration_count_1d ; ++xp ) {
+    for ( unsigned xf = 0 ; xf < function_count_1d ; ++xf ) {
+      values_1d[xp][xf] = eval_value_1d( xf , points_1d[xp] );
+      derivs_1d[xp][xf] = eval_deriv_1d( xf , points_1d[xp] );
+    }}
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template<>
+class HexElement_TensorData< 27 > {
+public:
+
+  static const unsigned element_node_count    = 27 ;
+  static const unsigned spatial_dimension     = 3 ;
+  static const unsigned integration_count_1d  = 3 ;
+  static const unsigned function_count_1d     = 3 ;
+
+  float values_1d [ function_count_1d ][ integration_count_1d ];
+  float derivs_1d [ function_count_1d ][ integration_count_1d ];
+  float weights_1d[ integration_count_1d ];
+
+  unsigned char eval_map[ element_node_count ][4] ;
+
+  // sizeof(HexElement_TensorData<27>) = 192 bytes =
+  //   sizeof(float) * 9 +               // values_1d
+  //   sizeof(float) * 9 +               // derivs_1d
+  //   sizeof(float) * 3 +               // weights_1d
+  //   sizeof(unsigned char) * 27 * 4    // eval_map
+
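+  // Quadratic Lagrange shape functions with nodes at p = -1, 0, +1 on [-1,1].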
+  static float eval_value_1d( const unsigned jf , const float p )
+  {
+    return 0 == jf ? 0.5 * p * ( p - 1 ) : (
+           1 == jf ? 1.0 - p * p : (
+           2 == jf ? 0.5 * p * ( p + 1 ) : 0 ));
+  }
+
+  static float eval_deriv_1d( const unsigned jf , const float p )
+  {
+    return 0 == jf ? p - 0.5 : (
+           1 == jf ? -2.0 * p : (
+           2 == jf ? p + 0.5 : 0 ));
+  }
+
+  HexElement_TensorData()
+  {
+    const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
+      { { 0 , 0 , 0 },
+        { 2 , 0 , 0 },
+        { 2 , 2 , 0 },
+        { 0 , 2 , 0 },
+        { 0 , 0 , 2 },
+        { 2 , 0 , 2 },
+        { 2 , 2 , 2 },
+        { 0 , 2 , 2 },
+        { 1 , 0 , 0 },
+        { 2 , 1 , 0 },
+        { 1 , 2 , 0 },
+        { 0 , 1 , 0 },
+        { 0 , 0 , 1 },
+        { 2 , 0 , 1 },
+        { 2 , 2 , 1 },
+        { 0 , 2 , 1 },
+        { 1 , 0 , 2 },
+        { 2 , 1 , 2 },
+        { 1 , 2 , 2 },
+        { 0 , 1 , 2 },
+        { 1 , 1 , 1 },
+        { 1 , 1 , 0 },
+        { 1 , 1 , 2 },
+        { 0 , 1 , 1 },
+        { 2 , 1 , 1 },
+        { 1 , 0 , 1 },
+        { 1 , 2 , 1 } };
+
+    // Interval [-1,1]
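+    // Three-point Gauss-Legendre rule: points at 0 and +/- sqrt(3/5) ~ +/- 0.774596669,
+    // weights 5/9, 8/9, 5/9.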
+
+    weights_1d[0] = 0.555555556 ;
+    weights_1d[1] = 0.888888889 ;
+    weights_1d[2] = 0.555555556 ;
+
+    const float points_1d[3] = { -0.774596669 ,
+                                  0.000000000 ,
+                                  0.774596669 };
+
+    for ( unsigned i = 0 ; i < element_node_count ; ++i ) {
+      eval_map[i][0] = tmp_map[i][0];
+      eval_map[i][1] = tmp_map[i][1];
+      eval_map[i][2] = tmp_map[i][2];
+    }
+
+    for ( unsigned xp = 0 ; xp < integration_count_1d ; ++xp ) {
+    for ( unsigned xf = 0 ; xf < function_count_1d ; ++xf ) {
+      values_1d[xp][xf] = eval_value_1d( xf , points_1d[xp] );
+      derivs_1d[xp][xf] = eval_deriv_1d( xf , points_1d[xp] );
+    }}
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< unsigned NodeCount >
+class HexElement_Data {
+public:
+  static const unsigned spatial_dimension   = 3 ;
+  static const unsigned element_node_count  = NodeCount ;
+  static const unsigned integration_count   = NodeCount ;
+  static const unsigned function_count      = NodeCount ;
+
+  float weights[   integration_count ] ;
+  float values[    integration_count ][ function_count ];
+  float gradients[ integration_count ][ spatial_dimension ][ function_count ];
+
+  HexElement_Data()
+  {
+    HexElement_TensorData< NodeCount > tensor_data ;
+
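+    // Tensor-product construction: each 3D weight, value, and gradient is the
+    // product of the 1D data selected by the (x,y,z) indices stored in eval_map.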
+    for ( unsigned ip = 0 ; ip < integration_count ; ++ip ) {
+
+      const unsigned ipx = tensor_data.eval_map[ip][0] ;
+      const unsigned ipy = tensor_data.eval_map[ip][1] ;
+      const unsigned ipz = tensor_data.eval_map[ip][2] ;
+
+      weights[ip] = tensor_data.weights_1d[ ipx ] *
+                    tensor_data.weights_1d[ ipy ] *
+                    tensor_data.weights_1d[ ipz ] ;
+
+      for ( unsigned jf = 0 ; jf < function_count ; ++jf ) {
+
+        const unsigned jfx = tensor_data.eval_map[jf][0] ;
+        const unsigned jfy = tensor_data.eval_map[jf][1] ;
+        const unsigned jfz = tensor_data.eval_map[jf][2] ;
+
+        values[ip][jf] = tensor_data.values_1d[ ipx ][ jfx ] *
+                         tensor_data.values_1d[ ipy ][ jfy ] *
+                         tensor_data.values_1d[ ipz ][ jfz ] ;
+
+        gradients[ip][0][jf] = tensor_data.derivs_1d[ ipx ][ jfx ] *
+                               tensor_data.values_1d[ ipy ][ jfy ] *
+                               tensor_data.values_1d[ ipz ][ jfz ] ;
+
+        gradients[ip][1][jf] = tensor_data.values_1d[ ipx ][ jfx ] *
+                               tensor_data.derivs_1d[ ipy ][ jfy ] *
+                               tensor_data.values_1d[ ipz ][ jfz ] ;
+
+        gradients[ip][2][jf] = tensor_data.values_1d[ ipx ][ jfx ] *
+                               tensor_data.values_1d[ ipy ][ jfy ] *
+                               tensor_data.derivs_1d[ ipz ][ jfz ] ;
+      }
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_HEXELEMENT_HPP */
+
+
diff --git a/packages/kokkos/example/fixture/Main.cpp b/packages/kokkos/example/fixture/Main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..76e9214bd72a0f9f998b1ae66d35fb7da9703b13
--- /dev/null
+++ b/packages/kokkos/example/fixture/Main.cpp
@@ -0,0 +1,315 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#include <utility>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+#include <BoxElemPart.hpp>
+
+namespace Kokkos {
+namespace Example {
+template< class > void test_fixture();
+}
+}
+
+int test_box( const size_t global_size
+            , const size_t global_box[][2]
+            , const bool print_verbose )
+{
+  size_t global_count = 0 ;
+  size_t global_max = 0 ;
+  size_t global_min = Kokkos::Example::box_count( global_box );
+  size_t global_box_max[3][2] = { { 0 , 0 } , { 0 , 0 } , { 0 , 0 } };
+  size_t global_box_min[3][2] = { { 0 , global_box[0][1] } , { 0 , global_box[1][1] } , { 0 , global_box[2][1] } };
+  size_t intersect_error = 0 ;
+  size_t neighbor_max = 0 ;
+
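+  // For each rank: partition the global box, verify the partitions are pairwise
+  // disjoint (any nonzero intersection is an error), and count the neighbors
+  // that overlap this rank's one-cell ghost layer.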
+  for ( size_t global_rank = 0 ; global_rank < global_size ; ++global_rank ) {
+    size_t box[3][2] = { { 0 , global_box[0][1] } , { 0 , global_box[1][1] } , { 0 , global_box[2][1] } };
+    size_t ghost_box[3][2] ;
+    size_t neighbor_count = 0 ;
+
+    Kokkos::Example::box_partition( global_size , global_rank , global_box , box );
+
+    Kokkos::Example::box_ghost_layer( global_box , box , 1 , ghost_box );
+
+    {
+      const size_t n = Kokkos::Example::box_count( box );
+
+      for ( int i = 0 ; i < 3 ; ++i ) {
+        if ( ( box[i][1] - box[i][0] ) < ( global_box_min[i][1] - global_box_min[i][0] ) ) {
+          global_box_min[i][0] = box[i][0] ;
+          global_box_min[i][1] = box[i][1] ;
+        }
+        if ( ( box[i][1] - box[i][0] ) > ( global_box_max[i][1] - global_box_max[i][0] ) ) {
+          global_box_max[i][0] = box[i][0] ;
+          global_box_max[i][1] = box[i][1] ;
+        }
+      }
+
+      global_max = std::max( global_max , n );
+      global_min = std::min( global_min , n );
+      global_count += n ;
+    }
+
+    for ( size_t other_rank = 0 ; other_rank  < global_size ; ++other_rank ) {
+
+      if ( other_rank == global_rank ) continue ;
+
+      size_t other_box[3][2] = { { 0 , global_box[0][1] } , { 0 , global_box[1][1] } , { 0 , global_box[2][1] } };
+      size_t intersect_box[3][2] ;
+
+      Kokkos::Example::box_partition( global_size , other_rank , global_box , other_box );
+
+      Kokkos::Example::box_intersect( intersect_box , box , other_box );
+
+      const size_t n = Kokkos::Example::box_count( intersect_box );
+
+      intersect_error += n ;
+
+      Kokkos::Example::box_intersect( intersect_box , ghost_box , other_box );
+
+      neighbor_count += Kokkos::Example::box_count( intersect_box ) ? 1 : 0 ;
+
+      if ( n ) {
+        std::cout << "box partition intersection error" << std::endl ;
+        std::cout << "box = {"
+                  << " [ " << box[0][0] << " , " << box[0][1] << " )"
+                  << " [ " << box[1][0] << " , " << box[1][1] << " )"
+                  << " [ " << box[2][0] << " , " << box[2][1] << " )"
+                  << " }" << std::endl ;
+        std::cout << "other_box = {"
+                  << " [ " << other_box[0][0] << " , " << other_box[0][1] << " )"
+                  << " [ " << other_box[1][0] << " , " << other_box[1][1] << " )"
+                  << " [ " << other_box[2][0] << " , " << other_box[2][1] << " )"
+                  << " }" << std::endl ;
+        return 0 ;
+      }
+    }
+
+    neighbor_max = std::max( neighbor_max , neighbor_count );
+  }
+
+  if ( print_verbose ) {
+
+    std::cout << "global_part = " << global_size << std::endl ;
+    std::cout << "global_box  = { "
+              << " [ " << global_box[0][0] << " .. " << global_box[0][1] << " ) X"
+              << " [ " << global_box[1][0] << " .. " << global_box[1][1] << " ) X"
+              << " [ " << global_box[2][0] << " .. " << global_box[2][1] << " )"
+              << " }" << std::endl ;
+    std::cout << "count( global_box ) = " << Kokkos::Example::box_count( global_box ) << std::endl ;
+    std::cout << "sum partition( global_box ) = " << global_count << std::endl ;
+    std::cout << "avg partition( global_box ) = " << size_t( double(global_count) / double(global_size)) << std::endl ;
+    std::cout << "min partition( global_box ) = " << global_min << std::endl ;
+    std::cout << "min part X   ( global_box ) = [ " << global_box_min[0][0] << " .. " << global_box_min[0][1] << " )" << std::endl ;
+    std::cout << "min part Y   ( global_box ) = [ " << global_box_min[1][0] << " .. " << global_box_min[1][1] << " )" << std::endl ;
+    std::cout << "min part Z   ( global_box ) = [ " << global_box_min[2][0] << " .. " << global_box_min[2][1] << " )" << std::endl ;
+    std::cout << "max partition( global_box ) = " << global_max << std::endl ;
+    std::cout << "max part X   ( global_box ) = [ " << global_box_max[0][0] << " .. " << global_box_max[0][1] << " )" << std::endl ;
+    std::cout << "max part Y   ( global_box ) = [ " << global_box_max[1][0] << " .. " << global_box_max[1][1] << " )" << std::endl ;
+    std::cout << "max part Z   ( global_box ) = [ " << global_box_max[2][0] << " .. " << global_box_max[2][1] << " )" << std::endl ;
+    std::cout << "sum intersect( global_box ) = " << intersect_error << std::endl ;
+    std::cout << "max neighbor = " << neighbor_max << std::endl ;
+  }
+
+  return neighbor_max ;
+}
+
+void test_elem()
+{
+  const Kokkos::Example::BoxElemPart::Decompose
+    decompose = Kokkos::Example::BoxElemPart:: DecomposeElem ; // DecomposeElem | DecomposeNode ;
+  const size_t global_size = 256 ;
+  const size_t global_nx = 100 ;
+  const size_t global_ny = 120 ;
+  const size_t global_nz = 140 ;
+
+  double node_count_avg = 0 ;
+  size_t node_count_max = 0 ;
+  size_t node_count_min = ( global_nx + 1 ) * ( global_ny + 1 ) * ( global_nz + 1 );
+  double elem_count_avg = 0 ;
+  size_t elem_count_max = 0 ;
+  size_t elem_count_min = global_nx * global_ny * global_nz ;
+  double recv_count_avg = 0 ;
+  size_t recv_count_max = 0 ;
+  size_t recv_count_min = global_size ;
+  double send_count_avg = 0 ;
+  size_t send_count_max = 0 ;
+  size_t send_count_min = global_size ;
+
+  for ( size_t r = 0 ; r < global_size ; ++r ) {
+    const Kokkos::Example::BoxElemPart
+       fixture( Kokkos::Example::BoxElemPart::ElemLinear ,
+                decompose , global_size , r , global_nx , global_ny , global_nz );
+
+    // Print a sample:
+
+    // if ( r == global_size * 2 / 3 ) fixture.print( std::cout );
+
+    // Verify recv/send alignment:
+
+    {
+      size_t recv_lid = fixture.owns_node_count();
+
+      for ( size_t i = 0 ; i < fixture.recv_node_msg_count() ; ++i ) {
+        const size_t recv_rank  = fixture.recv_node_rank( i );
+        const size_t recv_count = fixture.recv_node_count( i );
+
+        const Kokkos::Example::BoxElemPart other_fixture(
+           Kokkos::Example::BoxElemPart::ElemLinear ,
+           decompose , global_size , recv_rank , global_nx , global_ny , global_nz );
+
+        size_t send_item = 0 ;
+
+        size_t j = 0 ;
+        while ( j < other_fixture.send_node_msg_count() && other_fixture.send_node_rank(j) != r ) {
+          send_item += other_fixture.send_node_count( j );
+          ++j ;
+        }
+
+        if ( recv_count != other_fixture.send_node_count(j) ) {
+          std::cout << "Error P[" << r << "].recv(" << recv_count << ") != "
+                    << "P[" << recv_rank << "].send(" << other_fixture.send_node_count(j) << ")"
+                    << std::endl ;
+        }
+        else {
+
+          for ( size_t k = 0 ; k < recv_count ; ++k , ++send_item , ++recv_lid ) {
+
+            const size_t send_lid = other_fixture.send_node_id( send_item );
+
+            size_t recv_coord[3] , send_coord[3] ;
+
+            fixture.local_node_coord( recv_lid , recv_coord );
+
+            other_fixture.local_node_coord( send_lid , send_coord );
+
+            if ( recv_coord[0] != send_coord[0] ||
+                 recv_coord[1] != send_coord[1] ||
+                 recv_coord[2] != send_coord[2] ) {
+              std::cout << "Error P[" << r << "].recv[" << recv_lid << "]{ "
+                        << recv_coord[0] << " , "
+                        << recv_coord[1] << " , "
+                        << recv_coord[2] << " } != "
+                        << "P[" << recv_rank << "].send[" << send_lid << "]{ "
+                        << send_coord[0] << " , "
+                        << send_coord[1] << " , "
+                        << send_coord[2] << " }"
+                        << std::endl ;
+            }
+          }
+        }
+      }
+    }
+
+    node_count_avg += fixture.owns_node_count();
+    elem_count_avg += fixture.uses_elem_count();
+    recv_count_avg += fixture.recv_node_msg_count();
+    send_count_avg += fixture.send_node_msg_count();
+
+    elem_count_min = std::min( (size_t) fixture.uses_elem_count() , elem_count_min );
+    elem_count_max = std::max( (size_t) fixture.uses_elem_count() , elem_count_max );
+    node_count_min = std::min( (size_t) fixture.owns_node_count() , node_count_min );
+    node_count_max = std::max( (size_t) fixture.owns_node_count() , node_count_max );
+
+    recv_count_max = std::max( (size_t) fixture.recv_node_msg_count() , recv_count_max );
+    recv_count_min = std::min( (size_t) fixture.recv_node_msg_count() , recv_count_min );
+    send_count_max = std::max( (size_t) fixture.send_node_msg_count() , send_count_max );
+    send_count_min = std::min( (size_t) fixture.send_node_msg_count() , send_count_min );
+  }
+
+  node_count_avg /= double(global_size);
+  elem_count_avg /= double(global_size);
+  recv_count_avg /= double(global_size);
+  send_count_avg /= double(global_size);
+
+  std::cout << "Elem min(" << elem_count_min << ") avg(" << elem_count_avg << ") max(" << elem_count_max << ") " << std::endl
+            << "Node min(" << node_count_min << ") avg(" << node_count_avg << ") max(" << node_count_max << ") " << std::endl
+            << "Recv min(" << recv_count_min << ") avg(" << recv_count_avg << ") max(" << recv_count_max << ") " << std::endl
+            << "Send min(" << send_count_min << ") avg(" << send_count_avg << ") max(" << send_count_max << ") " << std::endl
+            ;
+}
+
+int main()
+{
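+  // Sweep partition counts from 16 to 512 ranks; if any rank ends up with more
+  // than 30 neighbors, rerun that case verbosely to print the partition statistics.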
+  for ( int i = 1 ; i <= 32 ; ++i ) {
+    const size_t global_size = 16 * i ;
+    const size_t global_box[3][2] = { { 0 , 65 } , { 0 , 65 } , { 0 , 65 } };
+    if ( 30 < test_box( global_size , global_box , false ) ) {
+      test_box( global_size , global_box , true );
+    }
+  }
+
+//  test_elem();
+
+  {
+    std::cout << "test_fixture< Host >" << std::endl ;
+    Kokkos::HostSpace::execution_space::initialize( 1 );
+    Kokkos::Example::test_fixture< Kokkos::HostSpace::execution_space >();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+
+#if defined( KOKKOS_ENABLE_CUDA )
+  {
+    std::cout << "test_fixture< Cuda >" << std::endl ;
+    Kokkos::HostSpace::execution_space::initialize();
+    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
+    Kokkos::Example::test_fixture< Kokkos::Cuda >();
+    Kokkos::Cuda::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_ROCM )
+  {
+    std::cout << "test_fixture< ROCm >" << std::endl ;
+    Kokkos::HostSpace::execution_space::initialize();
+    Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
+    Kokkos::Example::test_fixture< Kokkos::Experimental::ROCm >();
+    Kokkos::Experimental::ROCm::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+#endif
+}
+
diff --git a/packages/kokkos/example/fixture/Makefile b/packages/kokkos/example/fixture/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..5e684e344056cde31aec46c2a088e39c1c3bc2f9
--- /dev/null
+++ b/packages/kokkos/example/fixture/Makefile
@@ -0,0 +1,46 @@
+KOKKOS_PATH = ../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+vpath %.cpp ${KOKKOS_SRC_PATH}/example/fixture
+
+EXAMPLE_HEADERS = $(wildcard $(KOKKOS_SRC_PATH)/example/common/*.hpp ${KOKKOS_SRC_PATH}/example/fixture/*.hpp )
+
+default: build_all
+	echo "End Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
+else
+  CXX = g++
+endif
+
+CXXFLAGS = -O3
+LINK ?= $(CXX)
+LDFLAGS ?=
+
+include $(KOKKOS_PATH)/Makefile.kokkos        
+
+KOKKOS_CXXFLAGS +=	\
+	-I${KOKKOS_SRC_PATH}/example/common	\
+	-I${KOKKOS_SRC_PATH}/example/fixture
+
+EXE_EXAMPLE_FIXTURE = KokkosExample_Fixture
+OBJ_EXAMPLE_FIXTURE = Main.o TestFixture.o BoxElemPart.o
+
+TARGETS = $(EXE_EXAMPLE_FIXTURE)
+
+#TEST_TARGETS =
+
+$(EXE_EXAMPLE_FIXTURE) : $(OBJ_EXAMPLE_FIXTURE) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_EXAMPLE_FIXTURE) $(KOKKOS_LIBS) $(LIB) -o $(EXE_EXAMPLE_FIXTURE)
+
+build_all : $(TARGETS)
+
+test : build_all
+
+clean: kokkos-clean
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(EXAMPLE_HEADERS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/packages/kokkos/example/fixture/TestFixture.cpp b/packages/kokkos/example/fixture/TestFixture.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b83d40588f0cea5652ad8811f367e3f97b594e23
--- /dev/null
+++ b/packages/kokkos/example/fixture/TestFixture.cpp
@@ -0,0 +1,62 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <TestFixture.hpp>
+
+namespace Kokkos {
+namespace Example {
+
+template void test_fixture< Kokkos::HostSpace::execution_space >();
+
+#if defined( KOKKOS_ENABLE_CUDA )
+template void test_fixture<Kokkos::Cuda>();
+#endif
+
+#if defined( KOKKOS_ENABLE_ROCM )
+template void test_fixture<Kokkos::Experimental::ROCm>();
+#endif
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
diff --git a/packages/kokkos/example/fixture/TestFixture.hpp b/packages/kokkos/example/fixture/TestFixture.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..12297a2aaa959beb18bbec861f14f02a835c46a2
--- /dev/null
+++ b/packages/kokkos/example/fixture/TestFixture.hpp
@@ -0,0 +1,156 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_TESTFIXTURE_HPP
+#define KOKKOS_EXAMPLE_TESTFIXTURE_HPP
+
+#include <utility>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+#include <BoxElemPart.hpp>
+#include <BoxElemFixture.hpp>
+
+namespace Kokkos {
+namespace Example {
+
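+// Reduction functor: for each element, check that every node's grid coordinates
+// equal the element's first node plus the expected local node offset, and
+// accumulate per-element success and error counts.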
+template< class Device >
+struct FixtureVerifyElemNodeCoord
+{
+  typedef Device execution_space ;
+
+  typedef struct { size_t success , error ; } value_type ;
+
+  typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear > FixtureType ;
+
+  FixtureType m_fixture ;
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & update ) const { update.success = update.error = 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile       value_type & update ,
+             volatile const value_type & input ) const
+    {
+      update.success += input.success ;
+      update.error += input.error ;
+    }
+  
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_t ielem , value_type & update ) const
+  {
+    unsigned node_coord[ FixtureType::ElemNode ][3] ;
+
+    for ( unsigned i = 0 ; i < FixtureType::ElemNode ; ++i ) {
+      const unsigned node_id = m_fixture.elem_node(ielem,i);
+      node_coord[i][0] = m_fixture.node_grid(node_id,0);
+      node_coord[i][1] = m_fixture.node_grid(node_id,1);
+      node_coord[i][2] = m_fixture.node_grid(node_id,2);
+    }
+
+    int error = 0 ;
+    for ( unsigned i = 1 ; i < FixtureType::ElemNode ; ++i ) {
+      if ( node_coord[0][0] + m_fixture.elem_node_local(i,0) != node_coord[i][0] ||
+           node_coord[0][1] + m_fixture.elem_node_local(i,1) != node_coord[i][1] ||
+           node_coord[0][2] + m_fixture.elem_node_local(i,2) != node_coord[i][2] ) {
+        error = 1 ;
+      }
+    }
+
+    if ( error ) {
+      ++update.error ;
+    }
+    else {
+      ++update.success ;
+    }
+  }
+
+  FixtureVerifyElemNodeCoord( const FixtureType & f ) : m_fixture(f) {}
+};
+
+
+template< class Device >
+void test_fixture()
+{
+  typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear > FixtureType ;
+
+  const Kokkos::Example::BoxElemPart::Decompose
+    decompose = Kokkos::Example::BoxElemPart:: DecomposeElem ; // DecomposeElem | DecomposeNode ;
+
+  const unsigned global_size = 256 ;
+  const unsigned global_nx = 400 ;
+  const unsigned global_ny = 400 ;
+  const unsigned global_nz = 400 ;
+
+  for ( unsigned my_rank = 0 ; my_rank < global_size ; ++my_rank ) {
+
+    const FixtureType fixture( decompose , global_size , my_rank , global_nx , global_ny , global_nz );
+
+    // Verify grid coordinates of element's nodes
+    
+    typename FixtureVerifyElemNodeCoord<Device>::value_type result = { 0 , 0 };
+
+    Kokkos::parallel_reduce( fixture.elem_node().extent(0) , FixtureVerifyElemNodeCoord<Device>( fixture ) , result );
+
+    if ( result.error ) {
+      std::cout << "P[" << my_rank << ":" << global_size
+                << "] Fixture elem_node_coord"
+                << " success(" << result.success << ")"
+                << " error(" << result.error << ")"
+                << std::endl ;
+    }
+
+    // Check send/recv alignment
+
+
+  }
+}
+
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_EXAMPLE_TESTFIXTURE_HPP */
+
diff --git a/packages/kokkos/example/global_2_local_ids/CMakeLists.txt b/packages/kokkos/example/global_2_local_ids/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9f32fe580246233f0a5358b5d505abfdeebd0d14
--- /dev/null
+++ b/packages/kokkos/example/global_2_local_ids/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+SET(SOURCES "")
+
+SET(SOURCES
+  G2L_Main.cpp 
+  )
+
+TRIBITS_ADD_EXECUTABLE(
+  global_2_local_ids
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  )
+
+
diff --git a/packages/kokkos/example/global_2_local_ids/G2L.hpp b/packages/kokkos/example/global_2_local_ids/G2L.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d40bd76281091d6a2b526b05df10fa024324b40
--- /dev/null
+++ b/packages/kokkos/example/global_2_local_ids/G2L.hpp
@@ -0,0 +1,266 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_GLOBAL_TO_LOCAL_IDS_HPP
+#define KOKKOS_GLOBAL_TO_LOCAL_IDS_HPP
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <vector>
+#include <algorithm>
+#include <iomanip>
+
+#include <impl/Kokkos_Timer.hpp>
+
+// This test simulates mapping unique global ids to local ids with Kokkos::UnorderedMap
+
+namespace G2L {
+
+static const unsigned begin_id_size = 256u;
+static const unsigned end_id_size = 1u << 25;
+static const unsigned id_step = 2u;
+
+// Union used to help generate global ids
+union helper
+{
+  uint32_t word;
+  uint8_t byte[4];
+};
+
+
+//generate a unique global id from the local id
+template <typename Device>
+struct generate_ids
+{
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
+
+  local_id_view local_2_global;
+
+  generate_ids( local_id_view & ids)
+    : local_2_global(ids)
+  {
+    Kokkos::parallel_for(local_2_global.size(), *this);
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i) const
+  {
+
+    helper x = {static_cast<uint32_t>(i)};
+
+    // shuffle the bytes of i to create a unique, semi-random global_id
+    x.word = ~x.word;
+
+    uint8_t tmp = x.byte[3];
+    x.byte[3] = x.byte[1];
+    x.byte[1] = tmp;
+
+    tmp = x.byte[2];
+    x.byte[2] = x.byte[0];
+    x.byte[0] = tmp;
+
+    local_2_global[i] = x.word;
+  }
+
+};
+
+// fill a map of global_id -> local_id
+template <typename Device>
+struct fill_map
+{
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<const uint32_t*,execution_space, Kokkos::MemoryRandomAccess> local_id_view;
+  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;
+
+  global_id_view global_2_local;
+  local_id_view local_2_global;
+
+  fill_map( global_id_view gIds, local_id_view lIds)
+    : global_2_local(gIds) , local_2_global(lIds)
+  {
+    Kokkos::parallel_for(local_2_global.size(), *this);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i) const
+  {
+    global_2_local.insert( local_2_global[i], i);
+  }
+
+};
+
+// check that the global id is found and that it maps to the local id
+template <typename Device>
+struct find_test
+{
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<const uint32_t*,execution_space, Kokkos::MemoryRandomAccess> local_id_view;
+  typedef Kokkos::UnorderedMap<const uint32_t, const size_type,execution_space> global_id_view;
+
+  global_id_view global_2_local;
+  local_id_view local_2_global;
+
+  typedef size_t value_type;
+
+  find_test( global_id_view gIds, local_id_view lIds, value_type & num_errors)
+    : global_2_local(gIds) , local_2_global(lIds)
+  {
+    Kokkos::parallel_reduce(local_2_global.size(), *this, num_errors);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type & v) const
+  { v = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type & dst, volatile value_type const & src) const
+  { dst += src; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i, value_type & num_errors) const
+  {
+    uint32_t index = global_2_local.find( local_2_global[i] );
+
+    if (  !global_2_local.valid_at(index)
+        || global_2_local.key_at(index) != local_2_global[i]
+        || global_2_local.value_at(index) != i)
+      ++num_errors;
+  }
+
+};
+
+// run test
+template <typename Device>
+size_t test_global_to_local_ids(unsigned num_ids, unsigned capacity, unsigned num_find_iterations)
+{
+
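+  // Each phase (allocate, generate ids, fill the map, repeated lookups) is timed
+  // separately and reported below.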
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+
+  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
+  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;
+
+  double elapsed_time = 0;
+  Kokkos::Timer timer;
+
+  local_id_view local_2_global("local_ids", num_ids);
+  global_id_view global_2_local(capacity);
+
+  int shiftw = 15;
+
+  //create
+  elapsed_time = timer.seconds();
+  std::cout << std::setw(shiftw) <<  "allocate: " <<  elapsed_time << std::endl;
+  timer.reset();
+
+  // generate unique ids
+  {
+    generate_ids<Device> gen(local_2_global);
+  }
+
+  // generate
+  elapsed_time = timer.seconds();
+  std::cout << std::setw(shiftw) << "generate: " <<  elapsed_time << std::endl;
+  timer.reset();
+
+  {
+    fill_map<Device> fill(global_2_local, local_2_global);
+  }
+
+  // fill
+  elapsed_time = timer.seconds();
+  std::cout << std::setw(shiftw) << "fill: " <<  elapsed_time << std::endl;
+  timer.reset();
+
+
+  size_t num_errors = global_2_local.failed_insert();
+
+  if (num_errors == 0u) {
+    for (unsigned i=0; i<num_find_iterations; ++i)
+    {
+      find_test<Device> find(global_2_local, local_2_global,num_errors);
+    }
+
+    // find
+    elapsed_time = timer.seconds();
+    std::cout << std::setw(shiftw) << "lookup: " <<  elapsed_time << std::endl;
+  }
+  else {
+    std::cout << "    !!! Fill Failed !!!" << std::endl;
+  }
+
+  return num_errors;
+}
+
+template <typename Device>
+size_t run_test(unsigned num_ids, unsigned num_find_iterations)
+{
+  // expect to fail
+  unsigned capacity = (num_ids*2u)/3u;
+  std::cout << " 66% of needed capacity (should fail)" << std::endl;
+  test_global_to_local_ids<Device>(num_ids, capacity, num_find_iterations);
+
+  //should not fail
+  std::cout << " 100% of needed capacity" << std::endl;
+  capacity = num_ids;
+  size_t num_errors = test_global_to_local_ids<Device>(num_ids, capacity, num_find_iterations);
+
+  //should not fail
+  std::cout << " 150% of needed capacity" << std::endl;
+  capacity = (num_ids*3u)/2u;
+  num_errors += test_global_to_local_ids<Device>(num_ids, capacity, num_find_iterations);
+
+  return num_errors;
+}
+
+
+} // namespace G2L
+
+
+#endif //KOKKOS_GLOBAL_TO_LOCAL_IDS_HPP
+
diff --git a/packages/kokkos/example/global_2_local_ids/G2L_Main.cpp b/packages/kokkos/example/global_2_local_ids/G2L_Main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c8318be9fcdeae7a9dcdcf6c87aaedffa3008944
--- /dev/null
+++ b/packages/kokkos/example/global_2_local_ids/G2L_Main.cpp
@@ -0,0 +1,158 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#include <G2L.hpp>
+
+namespace G2L {
+
+size_t run_serial(unsigned num_ids, unsigned num_find_iterations)
+{
+#ifdef KOKKOS_ENABLE_SERIAL
+  std::cout << "Serial" << std::endl;
+  return run_test<Kokkos::Serial>(num_ids,num_find_iterations);
+#else
+  return 0;
+#endif // KOKKOS_ENABLE_SERIAL
+}
+
+size_t run_threads(unsigned num_ids, unsigned num_find_iterations)
+{
+#ifdef KOKKOS_ENABLE_THREADS
+  std::cout << "Threads" << std::endl;
+  return run_test<Kokkos::Threads>(num_ids,num_find_iterations);
+#else
+  return 0;
+#endif
+}
+
+size_t run_openmp(unsigned num_ids, unsigned num_find_iterations)
+{
+#ifdef KOKKOS_ENABLE_OPENMP
+  std::cout << "OpenMP" << std::endl;
+  return run_test<Kokkos::OpenMP>(num_ids,num_find_iterations);
+#else
+  return 0;
+#endif
+}
+
+size_t run_cuda(unsigned num_ids, unsigned num_find_iterations)
+{
+#ifdef KOKKOS_ENABLE_CUDA
+  std::cout << "Cuda" << std::endl;
+  return run_test<Kokkos::Cuda>(num_ids,num_find_iterations);
+#else
+  return 0;
+#endif
+}
+
+} // namespace G2L
+
+
+int main(int argc, char *argv[])
+{
+  unsigned num_ids = 100000;
+  unsigned num_find_iterations = 1000;
+
+  if (argc == 3) {
+    num_ids = atoi(argv[1]);
+    num_find_iterations = atoi(argv[2]);
+  }
+  else if (argc != 1) {
+    std::cout << argv[0] << " num_ids num_find_iterations" << std::endl;
+    return 0;
+  }
+
+
+  // query the topology of the host
+  unsigned threads_count = 4 ;
+
+  if (Kokkos::hwloc::available()) {
+    threads_count = Kokkos::hwloc::get_available_numa_count() *
+                    Kokkos::hwloc::get_available_cores_per_numa() *
+                    Kokkos::hwloc::get_available_threads_per_core();
+
+  }
+
+  std::cout << "Threads: " << threads_count << std::endl;
+  std::cout << "Number of ids: " << num_ids << std::endl;
+  std::cout << "Number of find iterations: " << num_find_iterations << std::endl;
+
+  size_t num_errors = 0;
+
+  num_errors += G2L::run_serial(num_ids,num_find_iterations);
+
+#ifdef KOKKOS_ENABLE_CUDA
+  Kokkos::HostSpace::execution_space::initialize(threads_count);
+  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
+  num_errors += G2L::run_cuda(num_ids,num_find_iterations);
+  Kokkos::Cuda::finalize();
+  Kokkos::HostSpace::execution_space::finalize();
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+  Kokkos::Threads::initialize( threads_count );
+  num_errors += G2L::run_threads(num_ids,num_find_iterations);
+  Kokkos::Threads::finalize();
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
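+  // Count the OpenMP threads that are actually available; when more than 3 are
+  // available, run with a quarter of them (but no fewer than 4).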
+  int num_threads = 0;
+  #pragma omp parallel
+  {
+    #pragma omp atomic
+    ++num_threads;
+  }
+  if( num_threads > 3 ) {
+    num_threads = std::max(4, num_threads/4);
+  }
+  Kokkos::OpenMP::initialize( num_threads );
+  num_errors += G2L::run_openmp(num_ids,num_find_iterations);
+  Kokkos::OpenMP::finalize();
+#endif
+
+
+  return num_errors;
+}
+
diff --git a/packages/kokkos/example/global_2_local_ids/Makefile b/packages/kokkos/example/global_2_local_ids/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..42b376ec7c5cf73537bf2d49340ce1ca963e3ad1
--- /dev/null
+++ b/packages/kokkos/example/global_2_local_ids/Makefile
@@ -0,0 +1,46 @@
+KOKKOS_PATH ?= ../..
+
+MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+SRC_DIR := $(dir $(MAKEFILE_PATH))
+
+SRC = $(wildcard $(SRC_DIR)/*.cpp)
+OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
+
+#SRC = $(wildcard *.cpp)
+#OBJ = $(SRC:%.cpp=%.o)
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
+  EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
+else
+  CXX = g++
+  EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
+endif
+
+CXXFLAGS = -O3 -I$(SRC_DIR)
+LINK ?= $(CXX)
+LDFLAGS ?=
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+DEPFLAGS = -M
+
+LIB =
+
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: 
+	rm -f *.a *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
diff --git a/packages/kokkos/example/grow_array/CMakeLists.txt b/packages/kokkos/example/grow_array/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d9ff17049290af181d4f693cf9936627b28d087e
--- /dev/null
+++ b/packages/kokkos/example/grow_array/CMakeLists.txt
@@ -0,0 +1,14 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+SET(SOURCES "")
+
+FILE(GLOB SOURCES *.cpp)
+
+TRIBITS_ADD_EXECUTABLE(
+  grow_array
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  )
+
diff --git a/packages/kokkos/example/grow_array/Makefile b/packages/kokkos/example/grow_array/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..42b376ec7c5cf73537bf2d49340ce1ca963e3ad1
--- /dev/null
+++ b/packages/kokkos/example/grow_array/Makefile
@@ -0,0 +1,46 @@
+KOKKOS_PATH ?= ../..
+
+MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+SRC_DIR := $(dir $(MAKEFILE_PATH))
+
+SRC = $(wildcard $(SRC_DIR)/*.cpp)
+OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
+
+#SRC = $(wildcard *.cpp)
+#OBJ = $(SRC:%.cpp=%.o)
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
+  EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
+else
+  CXX = g++
+  EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
+endif
+
+CXXFLAGS = -O3 -I$(SRC_DIR)
+LINK ?= $(CXX)
+LDFLAGS ?=
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+DEPFLAGS = -M
+
+LIB =
+
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: 
+	rm -f *.a *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
diff --git a/packages/kokkos/example/grow_array/grow_array.hpp b/packages/kokkos/example/grow_array/grow_array.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f5d0698d821cb3097c29fa0911faf2deb604e0c6
--- /dev/null
+++ b/packages/kokkos/example/grow_array/grow_array.hpp
@@ -0,0 +1,257 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef EXAMPLE_GROW_ARRAY
+#define EXAMPLE_GROW_ARRAY
+
+#include <cstdlib>
+
+#include <Kokkos_Core.hpp>
+
+#include <algorithm>
+
+#if defined(KOKKOS_ENABLE_CUDA)
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#endif
+
+namespace Example {
+
+//----------------------------------------------------------------------------
+
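+// Sort a 1D view in place over [begin,end): std::sort for host-accessible
+// execution spaces, with a thrust::sort specialization for Cuda below.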
+template< class ExecSpace >
+struct SortView {
+
+  template< typename ValueType >
+  SortView( const Kokkos::View<ValueType*,ExecSpace> v , int begin , int end )
+    {
+      std::sort( v.ptr_on_device() + begin , v.ptr_on_device() + end );
+    }
+};
+
+#if defined(KOKKOS_ENABLE_CUDA)
+template<>
+struct SortView< Kokkos::Cuda > {
+  template< typename ValueType >
+  SortView( const Kokkos::View<ValueType*,Kokkos::Cuda> v , int begin , int end )
+    {
+      thrust::sort( thrust::device_ptr<ValueType>( v.ptr_on_device() + begin )
+                  , thrust::device_ptr<ValueType>( v.ptr_on_device() + end ) );
+    }
+};
+#endif
+
+
+
+//----------------------------------------------------------------------------
+
+template< class ExecSpace >
+struct GrowArrayFunctor {
+
+  typedef ExecSpace  execution_space ;
+
+  enum { SHIFT = sizeof(int) == 8 ? 6 : 5 }; // 8 or 4 byte int
+  enum { MASK  = ( 1 << SHIFT ) - 1 };
+
+  const Kokkos::View<int*,ExecSpace>  m_search_flags ; // bit flags for values to append
+  const Kokkos::View<int*,ExecSpace>  m_search_array ; // array to append values
+  const Kokkos::View<int,ExecSpace>   m_search_count ; // offset
+  const int m_search_total ;
+  const int m_search_team_chunk ;
+
+  GrowArrayFunctor( int array_length , int search_length , int print = 1 )
+    : m_search_flags( "flags" , ( search_length + MASK ) >> SHIFT ) // One bit per search entry
+    , m_search_array( "array" , array_length )
+    , m_search_count( "count" )
+    , m_search_total( search_length )
+    , m_search_team_chunk( 2048 )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  bool flag_is_set( const int index ) const
+    {
+      // 64 or 32 bit integer:
+
+      const int j = index >> SHIFT ; // which integer flag
+      const int k = 1 << ( index & MASK ); // which bit in that integer
+      const int s = ( j < int(m_search_flags.dimension_0()) ) && ( 0 != ( m_search_flags(j) & k ) );
+
+      return s ;
+    }
+
+  typedef typename Kokkos::TeamPolicy<ExecSpace>::member_type team_member ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const team_member & member ) const
+    {
+      enum { LOCAL_BUFFER_LENGTH = 16 };
+
+      int local_buffer[ LOCAL_BUFFER_LENGTH ] ;
+      int local_count = 0 ;
+
+      // Each team searches 'm_search_team_chunk' indices.
+      // The threads of a team must iterate together because all
+      // threads in the team must call 'team_scan' to prevent deadlock in the team.
+
+            int search_team_begin = member.league_rank() * m_search_team_chunk ;
+      const int search_team_end   = search_team_begin + m_search_team_chunk ;
+
+      int k = 0 ;
+
+      while ( search_team_begin < search_team_end ) {
+
+        // This iteration searches [ search_team_begin .. search_team_begin + member.team_size() ]
+        const int thread_search_index = search_team_begin + member.team_rank();
+
+        // If this thread's search index is in the range
+        // and the flag is set, push into this thread's local buffer.
+        if ( thread_search_index < m_search_total && flag_is_set(thread_search_index) ) {
+          local_buffer[ local_count ] = thread_search_index ;
+          ++local_count ;
+        }
+
+        // Move the team's search range forward
+        search_team_begin += member.team_size(); // Striding team by team size
+
+        // Count number of times a thread's buffer might have grown:
+        ++k ;
+
+        // Write buffer if end of search or a thread might have filled its buffer.
+        if ( k == LOCAL_BUFFER_LENGTH /* A thread in my team might have filled its buffer */ ||
+             ! ( search_team_begin < search_team_end ) /* Team is at the end of its search */ ) {
+
+          // Team's exclusive scan of threads' contributions, with global offset.
+          // This thread writes its buffer into [ team_offset .. team_offset + local_count )
+          const int team_offset = member.team_scan( local_count , & *m_search_count );
+
+          // Copy locally buffered entries into global array:
+          for ( int i = 0 ; i < local_count ; ++i ) {
+            m_search_array( team_offset + i ) = local_buffer[i] ;
+          }
+
+          k = 0 ;
+          local_count = 0 ;
+        }
+      }
+    }
+};
+
+
+template< class ExecSpace >
+void grow_array( int array_length , int search_length , int print = 1 )
+{
+  typedef GrowArrayFunctor< ExecSpace > FunctorType ;
+
+  FunctorType functor( array_length , search_length , print );
+
+  typename Kokkos::View<int,ExecSpace>::HostMirror  count = Kokkos::create_mirror_view( functor.m_search_count );
+  typename Kokkos::View<int*,ExecSpace>::HostMirror flags = Kokkos::create_mirror_view( functor.m_search_flags );
+
+  // Set at most 'array_length' random bits over the search length.
+  for ( int i = 0 ; i < array_length ; ++i ) {
+    // 'lrand48()' generates a random number in [0, 2^31)
+    // index = ( lrand48() * search_length ) / ( 2^31 )
+    const long int index = ( lrand48() * search_length ) >> 31 ;
+    // set the bit within the flags:
+    flags( index >> FunctorType::SHIFT ) |= ( 1 << ( index & FunctorType::MASK ) );
+  }
+
+  Kokkos::deep_copy( functor.m_search_flags , flags );
+
+  // Each team works on 'functor.m_search_team_chunk' span of the search_length
+  Kokkos::TeamPolicy< ExecSpace >
+    work( /* #teams */ ( search_length + functor.m_search_team_chunk - 1 ) / functor.m_search_team_chunk
+        , /* threads/team */ Kokkos::TeamPolicy< ExecSpace >::team_size_max( functor ) );
+
+  // Fill array:
+  Kokkos::parallel_for( work , functor );
+
+  // How much was filled:
+  Kokkos::deep_copy( count , functor.m_search_count );
+
+  // Sort array:
+  SortView< ExecSpace >( functor.m_search_array , 0 , *count );
+
+  // Mirror the results:
+  typename Kokkos::View<int*,ExecSpace>::HostMirror results = Kokkos::create_mirror_view( functor.m_search_array );
+  Kokkos::deep_copy( results , functor.m_search_array );
+
+  // Verify results:
+  int result_error_count = 0 ;
+  int flags_error_count = 0 ;
+  for ( int i = 0 ; i < *count ; ++i ) {
+    const int index = results(i);
+    const int entry = index >> FunctorType::SHIFT ;
+    const int bit   = 1 << ( index & FunctorType::MASK );
+    const bool flag = 0 != ( flags( entry ) & bit );
+    if ( ! flag ) {
+      if ( print ) std::cerr << "result( " << i << " : " << index << " )";
+      ++result_error_count ;
+    }
+    flags( entry ) &= ~bit ; // Clear that verified bit
+  }
+
+  for ( int i = 0 ; i < int(flags.dimension_0()) ; ++i ) {
+    // If any uncleared bits then an error
+    if ( flags(i) ) {
+      if ( print ) std::cerr << "flags( " << i << " : " << flags(i) << " )" ;
+      ++flags_error_count ;
+    }
+  }
+
+  if ( result_error_count || flags_error_count ) {
+    std::cerr << std::endl << "Example::GrowArrayFunctor( " << array_length
+              << " , " << search_length
+              << " ) result_error_count( " << result_error_count << " )"
+              << " ) flags_error_count( " << flags_error_count << " )"
+              << std::endl ;
+  }
+}
+
+
+} // namespace Example
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef EXAMPLE_GROW_ARRAY */
+
diff --git a/packages/kokkos/example/grow_array/main.cpp b/packages/kokkos/example/grow_array/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..598729459a4c3e3e2140488d0ce08c9894a61703
--- /dev/null
+++ b/packages/kokkos/example/grow_array/main.cpp
@@ -0,0 +1,110 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <iostream>
+#include <sstream>
+
+#include <Kokkos_Core.hpp>
+
+#include <grow_array.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+int main( int argc , char ** argv )
+{
+  int num_threads = 4 ;
+  int use_numa = 1 ;
+  int use_core = 1 ;
+  int length_array  = 1000000 ;
+  int span_values = 100000000 ;
+
+
+  if ( Kokkos::hwloc::available() ) {
+    use_numa = Kokkos::hwloc::get_available_numa_count();
+    use_core = Kokkos::hwloc::get_available_cores_per_numa() - 1 ;
+    num_threads = use_numa * use_core * Kokkos::hwloc::get_available_threads_per_core();
+  }
+
+#if defined( KOKKOS_ENABLE_SERIAL )
+  {
+    std::cout << "Kokkos::Serial" << std::endl ;
+    // The Serial device accepts these arguments, though it may ignore them.
+    Kokkos::Serial::initialize( num_threads , use_numa , use_core );
+    Example::grow_array< Kokkos::Serial >( length_array , span_values );
+    Kokkos::Serial::finalize ();
+  }
+#endif // defined( KOKKOS_ENABLE_SERIAL )
+
+#if defined( KOKKOS_ENABLE_THREADS )
+  {
+    std::cout << "Kokkos::Threads" << std::endl ;
+    Kokkos::Threads::initialize( num_threads , use_numa , use_core );
+    Example::grow_array< Kokkos::Threads >( length_array , span_values );
+    Kokkos::Threads::finalize();
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_OPENMP )
+  {
+    std::cout << "Kokkos::OpenMP" << std::endl ;
+    Kokkos::OpenMP::initialize();
+    Example::grow_array< Kokkos::OpenMP >( length_array , span_values );
+    Kokkos::OpenMP::finalize();
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_CUDA )
+  {
+    std::cout << "Kokkos::Cuda" << std::endl ;
+    Kokkos::HostSpace::execution_space::initialize(1);
+    Kokkos::Cuda::initialize();
+    Example::grow_array< Kokkos::Cuda >( length_array , span_values );
+    Kokkos::Cuda::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+#endif
+
+  return 0 ;
+}
+
diff --git a/packages/kokkos/example/md_skeleton/CMakeLists.txt b/packages/kokkos/example/md_skeleton/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..28412c37847deb211db5b6256a78a0e904d8dcaf
--- /dev/null
+++ b/packages/kokkos/example/md_skeleton/CMakeLists.txt
@@ -0,0 +1,16 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+SET(SOURCES "")
+SET(LIBRARIES "")
+
+FILE(GLOB SOURCES *.cpp )
+
+TRIBITS_ADD_EXECUTABLE(
+  md_skeleton 
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  DEPLIBS ${LIBRARIES}
+  )
+
diff --git a/packages/kokkos/example/md_skeleton/Makefile b/packages/kokkos/example/md_skeleton/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..42b376ec7c5cf73537bf2d49340ce1ca963e3ad1
--- /dev/null
+++ b/packages/kokkos/example/md_skeleton/Makefile
@@ -0,0 +1,46 @@
+KOKKOS_PATH ?= ../..
+
+MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+SRC_DIR := $(dir $(MAKEFILE_PATH))
+
+SRC = $(wildcard $(SRC_DIR)/*.cpp)
+OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
+
+#SRC = $(wildcard *.cpp)
+#OBJ = $(SRC:%.cpp=%.o)
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
+  EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
+else
+  CXX = g++
+  EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
+endif
+
+CXXFLAGS = -O3 -I$(SRC_DIR)
+LINK ?= $(CXX)
+LDFLAGS ?=
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+DEPFLAGS = -M
+
+LIB =
+
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: 
+	rm -f *.a *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
diff --git a/packages/kokkos/example/md_skeleton/README b/packages/kokkos/example/md_skeleton/README
new file mode 100644
index 0000000000000000000000000000000000000000..1ce682b0a6ec64175587d70c593e39ba8d304d75
--- /dev/null
+++ b/packages/kokkos/example/md_skeleton/README
@@ -0,0 +1,3 @@
+To build this example on a 2012-model MacBook Pro with an NVIDIA Kepler GPU:
+
+./build.cuda_std g++_osx cuda_osx 30 opt
diff --git a/packages/kokkos/example/md_skeleton/force.cpp b/packages/kokkos/example/md_skeleton/force.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..53a48f3fef28871d6c8b3697b0c702e8cc813ea7
--- /dev/null
+++ b/packages/kokkos/example/md_skeleton/force.cpp
@@ -0,0 +1,184 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <system.h>
+#include <cstdio>
+
+
+/* Simple Lennard-Jones force kernel using neighbor lists.
+ * For every pair of atoms (i,j) with distance r_ij smaller than r_cut the
+ * pair potential is
+ *   E_ij = 4*epsilon * ( (sigma/r_ij)^12 - (sigma/r_ij)^6 )
+ * and the pair force f_ij is its negative gradient.
+ * The total force on atom i is the sum over its neighbors:
+ *   f_i = sum_j (f_ij)
+ * Neighbor lists are used to precompute which atoms j are close enough to i
+ * to be able to contribute. By choosing a neighbor cutoff larger than the
+ * force cutoff, the neighbor list can be reused several times
+ * (typically 10 - 100 force evaluations).
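+
+/* For reference, a short sketch of how the force factor used below follows
+ * from that potential (with sr2 = 1/r^2 and sr6 = sigma^6/r^6 = sigma6*sr2^3):
+ *   E(r) = 4*epsilon*( sr6^2 - sr6 )
+ *   F(r) = -dE/dr = 24*epsilon*( 2*sr6^2 - sr6 ) / r
+ * so the x-component of the pair force is F(r)*delx/r, i.e.
+ *   delx * 48*epsilon*sr6*(sr6 - 0.5)*sr2
+ * which is the 'force' expression in the loop below.  The energy accumulates
+ * sr6*(sr6 - 1)*epsilon per pair and is scaled by 4 when returned.
+ */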
+
+struct ForceFunctor {
+
+  typedef t_x_array::execution_space execution_space; //Device Type for running the kernel
+  typedef double2 value_type; // When energy calculation is requested, the reduction returns energy and virial
+
+  t_x_array_randomread x;       //atom positions
+  t_f_array f;                  //atom forces
+  t_int_1d_const numneigh;      //number of neighbors per atom
+  t_neighbors_const neighbors;  //neighborlist
+  double cutforcesq;            //force cutoff
+  double epsilon;               //Potential parameter
+  double sigma6;                //Potential parameter
+
+
+  ForceFunctor(System s) {
+    x = s.d_x;
+    f = s.f;
+    numneigh = s.numneigh;
+    neighbors = s.neighbors;
+    cutforcesq = s.force_cutsq;
+    epsilon = 1.0;
+    sigma6 = 1.0;
+  }
+
+  /* Operator for not calculating energy and virial */
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &i) const {
+    force<0>(i);
+  }
+
+  /* Operator for calculating energy and virial */
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &i, double2 &energy_virial) const {
+    double2 ev = force<1>(i);
+    energy_virial.x += ev.x;
+    energy_virial.y += ev.y;
+  }
+
+  template<int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  double2 force(const int &i) const
+  {
+    const int numneighs = numneigh[i];
+    const double xtmp = x(i, 0);
+    const double ytmp = x(i, 1);
+    const double ztmp = x(i, 2);
+    double fix = 0;
+    double fiy = 0;
+    double fiz = 0;
+    double energy = 0;
+    double virial = 0;
+
+    //pragma simd forces vectorization (ignoring the performance objections of the compiler)
+    //give hint to compiler that fix, fiy and fiz are used for reduction only
+
+  #ifdef USE_SIMD
+    #pragma simd reduction (+: fix,fiy,fiz,energy,virial)
+  #endif
+    for(int k = 0; k < numneighs; k++) {
+      const int j = neighbors(i, k);
+      const double delx = xtmp - x(j, 0);
+      const double dely = ytmp - x(j, 1);
+      const double delz = ztmp - x(j, 2);
+      const double rsq = delx * delx + dely * dely + delz * delz;
+
+      //if(i==0) printf("%i %i %lf %lf\n",i,j,rsq,cutforcesq);
+      if(rsq < cutforcesq) {
+        const double sr2 = 1.0 / rsq;
+        const double sr6 = sr2 * sr2 * sr2  * sigma6;
+        const double force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+        fix += delx * force;
+        fiy += dely * force;
+        fiz += delz * force;
+
+        if(EVFLAG) {
+          energy += sr6 * (sr6 - 1.0) * epsilon;
+          virial += delx * delx * force + dely * dely * force + delz * delz * force;
+        }
+      }
+    }
+
+    f(i, 0) += fix;
+    f(i, 1) += fiy;
+    f(i, 2) += fiz;
+
+    double2 energy_virial ;
+    energy_virial.x = 4.0 * energy ;
+    energy_virial.y = 0.5 * virial ;
+    return energy_virial;
+  }
+
+  /* init and join functions when doing the reduction to obtain energy and virial */
+
+  KOKKOS_FUNCTION
+  static void init(volatile value_type &update) {
+    update.x = update.y = 0;
+  }
+  KOKKOS_FUNCTION
+  static void join(volatile value_type &update ,
+                   const volatile value_type &source) {
+    update.x += source.x ;
+    update.y += source.y ;
+  }
+
+};
+
+
+/* Calling function */
+
+double2 force(System &s,int evflag) {
+
+  ForceFunctor f(s);
+
+  double2 ev ; ev.x = 0 ; ev.y = 0 ;
+  if(!evflag)
+    Kokkos::parallel_for(s.nlocal,f);
+  else
+    Kokkos::parallel_reduce(s.nlocal,f,ev);
+
+  execution_space::fence();
+  return ev;
+}
+
diff --git a/packages/kokkos/example/md_skeleton/main.cpp b/packages/kokkos/example/md_skeleton/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fe783b39cb75e0302cf723d368952b318ae33dda
--- /dev/null
+++ b/packages/kokkos/example/md_skeleton/main.cpp
@@ -0,0 +1,205 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include "system.h"
+
+int create_system(System &system, int nx, int ny, int nz, double rho);
+int neigh_setup(System &system);
+int neigh_build(System &system);
+double2 force(System &system,int evflag);
+
+/* Simple MD skeleton which
+ *   - constructs a simple FCC lattice,
+ *   - computes a neighbor list,
+ *   - computes the LJ force kernel a number of times.
+ */
+
+int main(int argc, char** argv) {
+
+  printf("Running MD Skeleton\n");
+  /* Thread numbers for Host */
+
+  int num_threads = 1;
+  int teams = 1;
+  int device = 0; // Default device for GPU runs
+
+  /* avoid unused variable warnings */
+  (void)num_threads;
+  (void)teams;
+  (void)device;
+
+  /* Default value for number of force calculations */
+
+  int iter = 100;
+
+  /* Default value for system size (4*nx*ny*nz atoms)
+   * nx, ny and nz are set to system_size if not specified on commandline */
+
+  int system_size = 20;
+  int nx = -1;
+  int ny = -1;
+  int nz = -1;
+
+  int neighbor_size = 1; // Default bin size for neighbor list construction
+
+  double rho = 0.8442; // Number density of the system
+  double delta = 0; // Scaling factor for random offsets of atom positions
+
+
+  /* read in command-line arguments */
+
+  for(int i = 0; i < argc; i++) {
+    if((strcmp(argv[i], "-t") == 0) || (strcmp(argv[i], "--num_threads") == 0)) {
+      num_threads = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "--teams") == 0)) {
+      teams = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "-d") == 0) || (strcmp(argv[i], "--device") == 0))  {
+      device = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "--delta") == 0)) {
+      delta = atof(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "-i") == 0) || (strcmp(argv[i], "--iter") == 0))  {
+      iter = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "-rho") == 0)) {
+      rho = atof(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--size") == 0)) {
+      system_size = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "-nx") == 0)) {
+      nx = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "-ny") == 0)) {
+      ny = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "-nz") == 0)) {
+      nz = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "-b") == 0) || (strcmp(argv[i], "--neigh_bins") == 0))  {
+      neighbor_size = atoi(argv[++i]);
+      continue;
+    }
+  }
+
+  if( nx < 0 ) nx = system_size;
+  if( ny < 0 ) ny = system_size;
+  if( nz < 0 ) nz = system_size;
+
+  printf("-> Init Device\n");
+
+#if defined( KOKKOS_ENABLE_CUDA )
+  Kokkos::HostSpace::execution_space::initialize(teams*num_threads);
+  Kokkos::Cuda::SelectDevice select_device(device);
+  Kokkos::Cuda::initialize(select_device);
+#elif defined( KOKKOS_ENABLE_OPENMP )
+  Kokkos::OpenMP::initialize(teams*num_threads);
+#elif defined( KOKKOS_ENABLE_THREADS )
+  Kokkos::Threads::initialize(teams*num_threads);
+#endif
+
+  System system;
+  system.neigh_cut = 2.8;
+  system.force_cut = 2.5;
+  system.force_cutsq = system.force_cut*system.force_cut;
+  system.delta = delta;
+
+  printf("-> Build system\n");
+  create_system(system,nx,ny,nz,rho);
+
+  printf("-> Created %i atoms and %i ghost atoms\n",system.nlocal,system.nghost);
+
+  system.nbinx = system.box.xprd/neighbor_size+1;
+  system.nbiny = system.box.yprd/neighbor_size+1;
+  system.nbinz = system.box.zprd/neighbor_size+1;
+
+
+  printf("-> Building Neighborlist\n");
+
+  neigh_setup(system);
+  neigh_build(system);
+
+  double2 ev = force(system,1);
+
+  printf("-> Calculate Energy: %f Virial: %f\n",ev.x,ev.y);
+
+  printf("-> Running %i force calculations\n",iter);
+
+  Kokkos::Timer timer;
+
+  for(int i=0;i<iter;i++) {
+    force(system,0);
+  }
+
+
+  double time = timer.seconds();
+  printf("Time: %e s for %i iterations with %i atoms\n",time,iter,system.nlocal);
+
+  execution_space::finalize();
+}
diff --git a/packages/kokkos/example/md_skeleton/neighbor.cpp b/packages/kokkos/example/md_skeleton/neighbor.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..25cabb33026342da73eddceee5413c3b34662fb2
--- /dev/null
+++ b/packages/kokkos/example/md_skeleton/neighbor.cpp
@@ -0,0 +1,430 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <system.h>
+#include <cstdio>
+#include <Kokkos_Core.hpp>
+
+#define SMALL 1.0e-6
+#define FACTOR 0.999
+
+/* BinningFunctor puts atoms into bins of the simulation box.
+ * Neighbor lists are then created by checking only distances of atoms
+ * in adjacent bins, which makes neighbor-list construction an O(N) operation.
+ */
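+
+/* The bins form a 3-D grid of mbinx x mbiny x mbinz cells covering the box
+ * extended by the neighbor cutoff (plus one extra bin per side).  coord2bin()
+ * below maps a position to a flat bin id as
+ *   id = iz*mbiny*mbinx + iy*mbinx + ix + 1
+ * so a stencil of adjacent bins can be stored as constant integer offsets from
+ * the central bin (see neigh_setup() in this file).  As an illustration, with
+ * mbinx = 10 and mbiny = 10 the bin one step "up" in z is always id + 100.
+ */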
+
+struct BinningFunctor {
+  typedef t_int_2d::execution_space execution_space;
+
+  System s;
+
+  int atoms_per_bin;
+
+  BinningFunctor(System _s): s(_s) {
+    atoms_per_bin = s.bins.dimension_1();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &i) const
+  {
+    const int ibin = coord2bin(s.d_x(i, 0), s.d_x(i, 1), s.d_x(i, 2));
+
+    const int ac = Kokkos::atomic_fetch_add(&s.bincount[ibin], 1);
+
+    if(ac < atoms_per_bin) {
+      s.bins(ibin, ac) = i;
+    } else if(s.d_resize(0) < ac) {
+      s.d_resize(0) = ac;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int coord2bin(double x, double y, double z) const
+  {
+    int ix, iy, iz;
+
+    if(x >= s.box.xprd)
+      ix = (int)((x - s.box.xprd) * s.bininvx) + s.nbinx - s.mbinxlo;
+    else if(x >= 0.0)
+      ix = (int)(x * s.bininvx) - s.mbinxlo;
+    else
+      ix = (int)(x * s.bininvx) - s.mbinxlo - 1;
+
+    if(y >= s.box.yprd)
+      iy = (int)((y - s.box.yprd) * s.bininvy) + s.nbiny - s.mbinylo;
+    else if(y >= 0.0)
+      iy = (int)(y * s.bininvy) - s.mbinylo;
+    else
+      iy = (int)(y * s.bininvy) - s.mbinylo - 1;
+
+    if(z >= s.box.zprd)
+      iz = (int)((z - s.box.zprd) * s.bininvz) + s.nbinz - s.mbinzlo;
+    else if(z >= 0.0)
+      iz = (int)(z * s.bininvz) - s.mbinzlo;
+    else
+      iz = (int)(z * s.bininvz) - s.mbinzlo - 1;
+
+    return (iz * s.mbiny * s.mbinx + iy * s.mbinx + ix + 1);
+  }
+};
+
+/* Build the actual neighborlist*/
+
+struct BuildFunctor {
+
+  typedef t_int_2d::execution_space execution_space;
+
+  System s;
+
+  int maxneighs;
+  BuildFunctor(System _s): s(_s) {
+    maxneighs = s.neighbors.dimension_1();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &i) const
+  {
+    int n = 0;
+
+    const t_int_1d_const_um bincount_c = s.bincount;
+
+    const double xtmp = s.d_x(i, 0);
+    const double ytmp = s.d_x(i, 1);
+    const double ztmp = s.d_x(i, 2);
+
+    const int ibin = coord2bin(xtmp, ytmp, ztmp);
+
+    // loop over all bins in neighborhood (includes ibin)
+    for(int k = 0; k < s.nstencil; k++) {
+      const int jbin = ibin + s.d_stencil[k];
+
+      // get subview of jbin
+      const t_int_1d_const_um loc_bin =
+          Kokkos::subview(s.bins,jbin,Kokkos::ALL());
+
+      if(ibin == jbin)
+        for(int m = 0; m < bincount_c[jbin]; m++) {
+          const int j = loc_bin[m];
+
+          //for same bin as atom i skip j if i==j
+          if (j == i) continue;
+
+          const double delx = xtmp - s.d_x(j, 0);
+          const double dely = ytmp - s.d_x(j, 1);
+          const double delz = ztmp - s.d_x(j, 2);
+          const double rsq = delx * delx + dely * dely + delz * delz;
+
+          if(rsq <= s.neigh_cutsq && n<maxneighs) s.neighbors(i,n++) = j;
+        }
+      else {
+        for(int m = 0; m < bincount_c[jbin]; m++) {
+          const int j = loc_bin[m];
+
+          const double delx = xtmp - s.d_x(j, 0);
+          const double dely = ytmp - s.d_x(j, 1);
+          const double delz = ztmp - s.d_x(j, 2);
+          const double rsq = delx * delx + dely * dely + delz * delz;
+
+          if(rsq <= s.neigh_cutsq && n<maxneighs) s.neighbors(i,n++) = j;
+        }
+      }
+    }
+
+    s.numneigh[i] = n;
+
+    if(n >= maxneighs) {
+      if(n >= s.d_resize(0)) s.d_resize(0) = n;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int coord2bin(double x, double y, double z) const
+  {
+    int ix, iy, iz;
+
+    if(x >= s.box.xprd)
+      ix = (int)((x - s.box.xprd) * s.bininvx) + s.nbinx - s.mbinxlo;
+    else if(x >= 0.0)
+      ix = (int)(x * s.bininvx) - s.mbinxlo;
+    else
+      ix = (int)(x * s.bininvx) - s.mbinxlo - 1;
+
+    if(y >= s.box.yprd)
+      iy = (int)((y - s.box.yprd) * s.bininvy) + s.nbiny - s.mbinylo;
+    else if(y >= 0.0)
+      iy = (int)(y * s.bininvy) - s.mbinylo;
+    else
+      iy = (int)(y * s.bininvy) - s.mbinylo - 1;
+
+    if(z >= s.box.zprd)
+      iz = (int)((z - s.box.zprd) * s.bininvz) + s.nbinz - s.mbinzlo;
+    else if(z >= 0.0)
+      iz = (int)(z * s.bininvz) - s.mbinzlo;
+    else
+      iz = (int)(z * s.bininvz) - s.mbinzlo - 1;
+
+    return (iz * s.mbiny * s.mbinx + iy * s.mbinx + ix + 1);
+  }
+};
+
+/* Reset an array to zero */
+
+struct MemsetZeroFunctor {
+  typedef t_x_array::execution_space  execution_space ;
+  void* ptr;
+  KOKKOS_INLINE_FUNCTION void operator()(const int i) const {
+    ((int*)ptr)[i] = 0;
+  }
+};
+
+/* Calculate distance of two bins */
+
+double bindist(System &s, int i, int j, int k)
+{
+  double delx, dely, delz;
+
+  if(i > 0)
+    delx = (i - 1) * s.binsizex;
+  else if(i == 0)
+    delx = 0.0;
+  else
+    delx = (i + 1) * s.binsizex;
+
+  if(j > 0)
+    dely = (j - 1) * s.binsizey;
+  else if(j == 0)
+    dely = 0.0;
+  else
+    dely = (j + 1) * s.binsizey;
+
+  if(k > 0)
+    delz = (k - 1) * s.binsizez;
+  else if(k == 0)
+    delz = 0.0;
+  else
+    delz = (k + 1) * s.binsizez;
+
+  return (delx * delx + dely * dely + delz * delz);
+}
+
+/* Set up the neighbor-list construction:
+ * determine bin sizes, a stencil defining adjacency, etc.
+ */
+
+void neigh_setup(System &s) {
+
+  s.neigh_cutsq = s.neigh_cut * s.neigh_cut;
+
+  /*
+  c bins must evenly divide into box size,
+  c   becoming larger than cutneigh if necessary
+  c binsize = 1/2 of cutoff is near optimal
+
+  if (flag == 0) {
+    nbinx = 2.0 * xprd / cutneigh;
+    nbiny = 2.0 * yprd / cutneigh;
+    nbinz = 2.0 * zprd / cutneigh;
+    if (nbinx == 0) nbinx = 1;
+    if (nbiny == 0) nbiny = 1;
+    if (nbinz == 0) nbinz = 1;
+  }
+  */
+
+  s.binsizex = s.box.xprd / s.nbinx;
+  s.binsizey = s.box.yprd / s.nbiny;
+  s.binsizez = s.box.zprd / s.nbinz;
+  s.bininvx = 1.0 / s.binsizex;
+  s.bininvy = 1.0 / s.binsizey;
+  s.bininvz = 1.0 / s.binsizez;
+
+  double coord = s.box.xlo - s.neigh_cut - SMALL * s.box.xprd;
+  s.mbinxlo = static_cast<int>(coord * s.bininvx);
+
+  if(coord < 0.0) s.mbinxlo = s.mbinxlo - 1;
+
+  coord = s.box.xhi + s.neigh_cut + SMALL * s.box.xprd;
+  int mbinxhi = static_cast<int>(coord * s.bininvx);
+
+  coord = s.box.ylo - s.neigh_cut - SMALL * s.box.yprd;
+  s.mbinylo = static_cast<int>(coord * s.bininvy);
+
+  if(coord < 0.0) s.mbinylo = s.mbinylo - 1;
+
+  coord = s.box.yhi + s.neigh_cut + SMALL * s.box.yprd;
+  int mbinyhi = static_cast<int>(coord * s.bininvy);
+
+  coord = s.box.zlo - s.neigh_cut - SMALL * s.box.zprd;
+  s.mbinzlo = static_cast<int>(coord * s.bininvz);
+
+  if(coord < 0.0) s.mbinzlo = s.mbinzlo - 1;
+
+  coord = s.box.zhi + s.neigh_cut + SMALL * s.box.zprd;
+  int mbinzhi = static_cast<int>(coord * s.bininvz);
+
+  /* extend bins by 1 in each direction to ensure stencil coverage */
+
+  s.mbinxlo = s.mbinxlo - 1;
+  mbinxhi = mbinxhi + 1;
+  s.mbinx = mbinxhi - s.mbinxlo + 1;
+
+  s.mbinylo = s.mbinylo - 1;
+  mbinyhi = mbinyhi + 1;
+  s.mbiny = mbinyhi - s.mbinylo + 1;
+
+  s.mbinzlo = s.mbinzlo - 1;
+  mbinzhi = mbinzhi + 1;
+  s.mbinz = mbinzhi - s.mbinzlo + 1;
+
+  /*
+  compute bin stencil of all bins whose closest corner to central bin
+  is within neighbor cutoff
+  for partial Newton (newton = 0),
+  stencil is all surrounding bins including self
+  for full Newton (newton = 1),
+  stencil is bins to the "upper right" of central bin, does NOT include self
+  next(xyz) = how far the stencil could possibly extend
+  factor < 1.0 for special case of LJ benchmark so code will create
+  correct-size stencil when there are 3 bins for every 5 lattice spacings
+  */
+
+  int nextx = static_cast<int>(s.neigh_cut * s.bininvx);
+
+  if(nextx * s.binsizex < FACTOR * s.neigh_cut) nextx++;
+
+  int nexty = static_cast<int>(s.neigh_cut * s.bininvy);
+
+  if(nexty * s.binsizey < FACTOR * s.neigh_cut) nexty++;
+
+  int nextz = static_cast<int>(s.neigh_cut * s.bininvz);
+
+  if(nextz * s.binsizez < FACTOR * s.neigh_cut) nextz++;
+
+  int nmax = (2 * nextz + 1) * (2 * nexty + 1) * (2 * nextx + 1);
+  s.d_stencil = t_int_1d("stencil", nmax);
+  s.h_stencil = Kokkos::create_mirror_view(s.d_stencil);
+  s.nstencil = 0;
+  int kstart = -nextz;
+
+  for(int k = kstart; k <= nextz; k++) {
+    for(int j = -nexty; j <= nexty; j++) {
+      for(int i = -nextx; i <= nextx; i++) {
+        if(bindist(s,i, j, k) < s.neigh_cutsq) {
+          s.h_stencil(s.nstencil++) = k * s.mbiny * s.mbinx + j * s.mbinx + i;
+        }
+      }
+    }
+  }
+
+  /* Allocate neighbor arrays */
+
+  Kokkos::deep_copy(s.d_stencil, s.h_stencil);
+  s.mbins = s.mbinx * s.mbiny * s.mbinz;
+  s.bincount = t_int_1d("bincount", s.mbins);
+  s.bins = t_int_2d("bins", s.mbins, 8);
+
+  s.neighbors = t_neighbors("neighbors",s.natoms,80);
+  s.numneigh = t_int_1d("numneigh",s.natoms);
+  s.d_resize = t_int_scalar("resize");
+  s.h_resize = Kokkos::create_mirror_view(s.d_resize);
+}
+
+
+/* Build the neighbor list.
+ * This is a try-and-rerun algorithm for handling the case where the bins array
+ * or the neighbors array is not big enough: if one is too small, it is
+ * reallocated and the binning or the neighbor-list construction is rerun.
+ */
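+
+// A sketch of the resize-and-retry pattern used twice below (names from
+// system.h):
+//
+//   h_resize(0) = 1;
+//   while( h_resize(0) ) {
+//     h_resize(0) = 0;  deep_copy( d_resize , h_resize );   // clear the flag
+//     ... run the kernel; on overflow it records the required size in d_resize(0)
+//     deep_copy( h_resize , d_resize );                     // read the flag back
+//     if( h_resize(0) ) ... grow the offending array and iterate again
+//   }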
+
+void neigh_build(System &s) {
+
+  /* Binning of atoms */
+
+  s.h_resize(0) = 1;
+
+  while(s.h_resize(0) > 0) {
+    s.h_resize(0) = 0;
+    Kokkos::deep_copy(s.d_resize, s.h_resize);
+
+    MemsetZeroFunctor f_zero;
+    f_zero.ptr = (void*) s.bincount.ptr_on_device();
+    Kokkos::parallel_for(s.mbins, f_zero);
+    execution_space::fence();
+
+    BinningFunctor f(s);
+    Kokkos::parallel_for(s.natoms, f);
+    execution_space::fence();
+
+    /* Check if the bins array was large enough; if not, reallocate and rerun */
+
+    deep_copy(s.h_resize, s.d_resize);
+
+    if(s.h_resize(0)) {
+      int atoms_per_bin = s.h_resize(0)+2;
+      s.bins = t_int_2d("bins", s.mbins, atoms_per_bin);
+    }
+  }
+
+  /* Neighborlist construction */
+
+  s.h_resize(0) = 1;
+
+  while(s.h_resize(0)) {
+    s.h_resize(0) = 0;
+
+    Kokkos::deep_copy(s.d_resize, s.h_resize);
+
+    BuildFunctor f(s);
+    Kokkos::parallel_for(s.nlocal, f);
+
+    execution_space::fence();
+
+    /* Check if the neighbors array was large enough; if not, reallocate and rerun */
+
+    deep_copy(s.h_resize, s.d_resize);
+
+    if(s.h_resize(0)) {
+      int maxneighs = s.h_resize(0) * 1.2;
+      s.neighbors = t_neighbors("neighbors", s.natoms, maxneighs);
+    }
+  }
+}
diff --git a/packages/kokkos/example/md_skeleton/setup.cpp b/packages/kokkos/example/md_skeleton/setup.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..52defbc4694c486a8fe2fce1aff47bdb12a0e649
--- /dev/null
+++ b/packages/kokkos/example/md_skeleton/setup.cpp
@@ -0,0 +1,271 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <system.h>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+/* initialize atoms on fcc lattice in parallel fashion */
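+/* An fcc unit cell of edge 'lattice' contains 4 atoms, so the number density
+ * is rho = 4 / lattice^3 and lattice = (4/rho)^(1/3) as computed below.  The
+ * generation loops index the half-lattice (spacing lattice/2, indices
+ * 0 .. 2*nx-1 etc.) and keep only points with even i+j+k, which yields the
+ * 4 fcc sites per unit cell and hence 4*nx*ny*nz local atoms. */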
+
+#define MAX(a,b) (a>b?a:b)
+#define MIN(a,b) (a<b?a:b)
+
+
+int create_system(System &system, int nx, int ny, int nz, double rho)
+{
+  /* Box Setup */
+
+  double lattice = pow((4.0 / rho), (1.0 / 3.0));
+  system.box.xprd = nx * lattice;
+  system.box.yprd = ny * lattice;
+  system.box.zprd = nz * lattice;
+  system.box.xlo = 0;
+  system.box.ylo = 0;
+  system.box.zlo = 0;
+  system.box.xhi = system.box.xprd;
+  system.box.yhi = system.box.yprd;
+  system.box.zhi = system.box.zprd;
+
+
+  int ghost_dist = int(system.neigh_cut/lattice) + 1;
+
+  /* total # of atoms */
+
+  system.nlocal = 4 * nx * ny * nz;
+  system.nghost = 4 * (nx + 2 * ghost_dist) *
+                      (ny + 2 * ghost_dist) *
+                      (nz + 2 * ghost_dist) -
+                      system.nlocal;
+  system.natoms = system.nlocal + system.nghost;
+
+  system.d_x = t_x_array("X",system.natoms);
+  system.h_x = Kokkos::create_mirror_view(system.d_x);
+  system.f = t_f_array("F",system.natoms);
+
+  /* determine loop bounds of the lattice subsection that overlaps my sub-box;
+     ensure loop bounds do not exceed nx,ny,nz */
+
+  double alat = pow((4.0 / rho), (1.0 / 3.0));
+  int ilo = static_cast<int>(system.box.xlo / (0.5 * alat) - 1);
+  int ihi = static_cast<int>(system.box.xhi / (0.5 * alat) + 1);
+  int jlo = static_cast<int>(system.box.ylo / (0.5 * alat) - 1);
+  int jhi = static_cast<int>(system.box.yhi / (0.5 * alat) + 1);
+  int klo = static_cast<int>(system.box.zlo / (0.5 * alat) - 1);
+  int khi = static_cast<int>(system.box.zhi / (0.5 * alat) + 1);
+
+  ilo = MAX(ilo, 0);
+  ihi = MIN(ihi, 2 * nx - 1);
+  jlo = MAX(jlo, 0);
+  jhi = MIN(jhi, 2 * ny - 1);
+  klo = MAX(klo, 0);
+  khi = MIN(khi, 2 * nz - 1);
+
+
+
+  /* generates positions of atoms on fcc sublattice*/
+
+  srand(3718273);
+  /* create non-ghost atoms */
+  {
+    double xtmp, ytmp, ztmp;
+    int sx = 0;
+    int sy = 0;
+    int sz = 0;
+    int ox = 0;
+    int oy = 0;
+    int oz = 0;
+    int subboxdim = 8;
+
+    int n = 0;
+    int iflag = 0;
+
+    while(oz * subboxdim <= khi) {
+      const int k = oz * subboxdim + sz;
+      const int j = oy * subboxdim + sy;
+      const int i = ox * subboxdim + sx;
+
+      if(iflag) continue;
+
+      if(((i + j + k) % 2 == 0) &&
+          (i >= ilo) && (i <= ihi) &&
+          (j >= jlo) && (j <= jhi) &&
+          (k >= klo) && (k <= khi)) {
+
+        const int nold = n;
+        while(nold == n) {
+          xtmp = 0.5 * alat * i + system.delta/1000*(rand()%1000-500);
+          ytmp = 0.5 * alat * j + system.delta/1000*(rand()%1000-500);
+          ztmp = 0.5 * alat * k + system.delta/1000*(rand()%1000-500);
+
+          if(xtmp >= system.box.xlo && xtmp < system.box.xhi &&
+              ytmp >= system.box.ylo && ytmp < system.box.yhi &&
+              ztmp >= system.box.zlo && ztmp < system.box.zhi) {
+            system.h_x(n,0) = xtmp;
+            system.h_x(n,1) = ytmp;
+            system.h_x(n,2) = ztmp;
+            n++;
+          }
+        }
+      }
+
+      sx++;
+
+      if(sx == subboxdim) {
+        sx = 0;
+        sy++;
+      }
+
+      if(sy == subboxdim) {
+        sy = 0;
+        sz++;
+      }
+
+      if(sz == subboxdim) {
+        sz = 0;
+        ox++;
+      }
+
+      if(ox * subboxdim > ihi) {
+        ox = 0;
+        oy++;
+      }
+
+      if(oy * subboxdim > jhi) {
+        oy = 0;
+        oz++;
+      }
+    }
+
+    /* check that correct # of atoms were created */
+
+    if(system.nlocal != n) {
+      printf("Created incorrect # of atoms\n");
+
+      return 1;
+    }
+  }
+
+  /* create ghost atoms */
+
+  {
+    double xtmp, ytmp, ztmp;
+
+    int ilo_g = ilo - 2 * ghost_dist;
+    int jlo_g = jlo - 2 * ghost_dist;
+    int klo_g = klo - 2 * ghost_dist;
+    int ihi_g = ihi + 2 * ghost_dist;
+    int jhi_g = jhi + 2 * ghost_dist;
+    int khi_g = khi + 2 * ghost_dist;
+
+    int subboxdim = 8;
+    int sx = 0;
+    int sy = 0;
+    int sz = 0;
+    int ox = subboxdim * ilo_g;
+    int oy = subboxdim * jlo_g;
+    int oz = subboxdim * klo_g;
+
+    int n = system.nlocal;
+    int iflag = 0;
+
+
+    while(oz * subboxdim <= khi_g) {
+      const int k = oz * subboxdim + sz;
+      const int j = oy * subboxdim + sy;
+      const int i = ox * subboxdim + sx;
+
+      if(iflag) continue;
+
+      if(((i + j + k) % 2 == 0) &&
+          (i >= ilo_g) && (i <= ihi_g) &&
+          (j >= jlo_g) && (j <= jhi_g) &&
+          (k >= klo_g) && (k <= khi_g) &&
+          ((i < ilo) || (i > ihi) ||
+           (j < jlo) || (j > jhi) ||
+           (k < klo) || (k > khi))
+          ) {
+
+        xtmp = 0.5 * alat * i;
+        ytmp = 0.5 * alat * j;
+        ztmp = 0.5 * alat * k;
+
+        system.h_x(n,0) = xtmp + system.delta/1000*(rand()%1000-500);
+        system.h_x(n,1) = ytmp + system.delta/1000*(rand()%1000-500);
+        system.h_x(n,2) = ztmp + system.delta/1000*(rand()%1000-500);
+        n++;
+      }
+
+      sx++;
+
+      if(sx == subboxdim) {
+        sx = 0;
+        sy++;
+      }
+
+      if(sy == subboxdim) {
+        sy = 0;
+        sz++;
+      }
+
+      if(sz == subboxdim) {
+        sz = 0;
+        ox++;
+        //printf("%i %i %i // %i %i %i\n",ox,oy,oz,i,j,k);
+      }
+
+      if(ox * subboxdim > ihi_g) {
+        ox = subboxdim * ilo_g;
+        oy++;
+      }
+
+      if(oy * subboxdim > jhi_g) {
+        oy = subboxdim * jlo_g;
+        oz++;
+      }
+    }
+  }
+
+  Kokkos::deep_copy(system.d_x,system.h_x);
+  return 0;
+}
+
diff --git a/packages/kokkos/example/md_skeleton/system.h b/packages/kokkos/example/md_skeleton/system.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a0708ed340fbdc4c372e64bee613e2db76567a8
--- /dev/null
+++ b/packages/kokkos/example/md_skeleton/system.h
@@ -0,0 +1,92 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef SYSTEM_H_
+#define SYSTEM_H_
+
+#include <types.h>
+
+struct Box {
+  double xprd, yprd, zprd;
+  double xlo, xhi;
+  double ylo, yhi;
+  double zlo, zhi;
+};
+
+struct System {
+  Box box;
+
+  int natoms;
+  int nlocal;
+  int nghost;
+
+  t_x_array d_x;
+  t_x_array_host h_x;
+
+  t_f_array f;
+
+  t_neighbors neighbors;
+  t_int_1d numneigh;
+
+  double delta;
+
+  double neigh_cut,neigh_cutsq;
+
+  int mbins;
+  int nbinx,nbiny,nbinz;
+  int mbinx,mbiny,mbinz;
+  int mbinxlo,mbinylo,mbinzlo;
+  double binsizex,binsizey,binsizez;
+  double bininvx,bininvy,bininvz;
+
+  t_int_1d bincount;
+  t_int_2d bins;
+  t_int_scalar d_resize;
+  t_int_scalar_host h_resize;
+  t_int_1d d_stencil;
+  t_int_1d_host h_stencil;
+  int nstencil;
+
+  double force_cut,force_cutsq;
+};
+#endif
diff --git a/packages/kokkos/example/md_skeleton/types.h b/packages/kokkos/example/md_skeleton/types.h
new file mode 100644
index 0000000000000000000000000000000000000000..71d8c9bca03c8f9ae4ab3672bb6cf2241f14f496
--- /dev/null
+++ b/packages/kokkos/example/md_skeleton/types.h
@@ -0,0 +1,118 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef TYPES_H_
+#define TYPES_H_
+
+/* Determine default device type and necessary includes */
+
+#include <Kokkos_Core.hpp>
+
+typedef Kokkos::DefaultExecutionSpace execution_space ;
+
+#if ! defined( KOKKOS_ENABLE_CUDA )
+  struct double2 {
+    double x, y;
+    KOKKOS_INLINE_FUNCTION
+    double2(double xinit, double yinit) {
+      x = xinit;
+      y = yinit;
+    }
+    KOKKOS_INLINE_FUNCTION
+    double2() {
+      x = 0.0;
+      y = 0.0;
+    }
+    KOKKOS_INLINE_FUNCTION
+    double2& operator += (const double2& src) {
+      x+=src.x;
+      y+=src.y;
+      return *this;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    volatile double2& operator += (const volatile double2& src) volatile {
+      x+=src.x;
+      y+=src.y;
+      return *this;
+    }
+
+  };
+#endif
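+
+/* Note: when CUDA is enabled, double2 presumably comes from CUDA's own vector
+ * types, which is why the fallback struct above is only defined otherwise.
+ * The volatile operator+= mirrors the volatile join used by the energy/virial
+ * reduction in force.cpp. */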
+
+#include <impl/Kokkos_Timer.hpp>
+
+/* Define types used throughout the code */
+
+//Position arrays
+typedef Kokkos::View<double*[3], Kokkos::LayoutRight, execution_space>                                   t_x_array ;
+typedef t_x_array::HostMirror                                                                        t_x_array_host ;
+typedef Kokkos::View<const double*[3], Kokkos::LayoutRight, execution_space>                             t_x_array_const ;
+typedef Kokkos::View<const double*[3], Kokkos::LayoutRight, execution_space, Kokkos::MemoryRandomAccess >  t_x_array_randomread ;
+
+//Force array
+typedef Kokkos::View<double*[3],  execution_space>                                                       t_f_array ;
+
+
+//Neighborlist
+typedef Kokkos::View<int**, execution_space >                                                            t_neighbors ;
+typedef Kokkos::View<const int**, execution_space >                                                      t_neighbors_const ;
+typedef Kokkos::View<int*, execution_space, Kokkos::MemoryUnmanaged >                                    t_neighbors_sub ;
+typedef Kokkos::View<const int*, execution_space, Kokkos::MemoryUnmanaged >                              t_neighbors_const_sub ;
+
+//1d int array
+typedef Kokkos::View<int*, execution_space >                                                             t_int_1d ;
+typedef t_int_1d::HostMirror                                                                         t_int_1d_host ;
+typedef Kokkos::View<const int*, execution_space >                                                       t_int_1d_const ;
+typedef Kokkos::View<int*, execution_space , Kokkos::MemoryUnmanaged>                                    t_int_1d_um ;
+typedef Kokkos::View<const int* , execution_space , Kokkos::MemoryUnmanaged>                             t_int_1d_const_um ;
+
+//2d int array
+typedef Kokkos::View<int**, Kokkos::LayoutRight, execution_space >                                       t_int_2d ;
+typedef t_int_2d::HostMirror                                                                         t_int_2d_host ;
+
+//Scalar ints
+typedef Kokkos::View<int[1], Kokkos::LayoutLeft, execution_space>                                        t_int_scalar ;
+typedef t_int_scalar::HostMirror                                                                     t_int_scalar_host ;
+
+#endif /* TYPES_H_ */
diff --git a/packages/kokkos/example/multi_fem/BoxMeshFixture.hpp b/packages/kokkos/example/multi_fem/BoxMeshFixture.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6fbf1d5a0a05a7d76f45876ee943825383fae861
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/BoxMeshFixture.hpp
@@ -0,0 +1,610 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BOXMESHFIXTURE_HPP
+#define KOKKOS_BOXMESHFIXTURE_HPP
+
+#include <cmath>
+#include <stdexcept>
+#include <sstream>
+
+#include <Kokkos_Core.hpp>
+#include <BoxMeshPartition.hpp>
+#include <FEMesh.hpp>
+#include <HexElement.hpp>
+
+//----------------------------------------------------------------------------
+
+struct FixtureElementHex8 {
+
+  static const unsigned element_node_count = 8 ;
+
+  HybridFEM::HexElement_TensorData< element_node_count > elem_data ;
+  BoxBoundsLinear box_bounds ;
+
+  FixtureElementHex8() : elem_data(), box_bounds() {}
+
+  static void create_node_boxes_from_vertex_boxes(
+    const BoxType                & vertex_box_global ,
+    const std::vector< BoxType > & vertex_box_parts ,
+          BoxType                & node_box_global ,
+          std::vector< BoxType > & node_box_parts )
+  {
+    node_box_global = vertex_box_global ;
+    node_box_parts  = vertex_box_parts  ;
+  }
+
+  void elem_to_node( const unsigned node_local , unsigned coord[] ) const
+  {
+    coord[0] += elem_data.eval_map[ node_local ][0] ;
+    coord[1] += elem_data.eval_map[ node_local ][1] ;
+    coord[2] += elem_data.eval_map[ node_local ][2] ;
+  }
+};
+
+struct FixtureElementHex27 {
+  static const unsigned element_node_count = 27 ;
+
+  HybridFEM::HexElement_TensorData< element_node_count > elem_data ;
+  BoxBoundsQuadratic box_bounds ;
+
+  FixtureElementHex27() : elem_data(), box_bounds() {}
+
+  static void create_node_boxes_from_vertex_boxes(
+    const BoxType                & vertex_box_global ,
+    const std::vector< BoxType > & vertex_box_parts ,
+          BoxType                & node_box_global ,
+          std::vector< BoxType > & node_box_parts )
+  {
+    node_box_global = vertex_box_global ;
+    node_box_parts  = vertex_box_parts  ;
+
+    node_box_global[0][1] = 2 * node_box_global[0][1] - 1 ;
+    node_box_global[1][1] = 2 * node_box_global[1][1] - 1 ;
+    node_box_global[2][1] = 2 * node_box_global[2][1] - 1 ;
+
+    for ( unsigned i = 0 ; i < vertex_box_parts.size() ; ++i ) {
+      node_box_parts[i][0][0] = 2 * node_box_parts[i][0][0] ;
+      node_box_parts[i][1][0] = 2 * node_box_parts[i][1][0] ;
+      node_box_parts[i][2][0] = 2 * node_box_parts[i][2][0] ;
+
+      node_box_parts[i][0][1] =
+        std::min( node_box_global[0][1] , 2 * node_box_parts[i][0][1] );
+      node_box_parts[i][1][1] =
+        std::min( node_box_global[1][1] , 2 * node_box_parts[i][1][1] );
+      node_box_parts[i][2][1] =
+        std::min( node_box_global[2][1] , 2 * node_box_parts[i][2][1] );
+    }
+  }
+
+  void elem_to_node( const unsigned node_local , unsigned coord[] ) const
+  {
+    coord[0] = 2 * coord[0] + elem_data.eval_map[ node_local ][0] ;
+    coord[1] = 2 * coord[1] + elem_data.eval_map[ node_local ][1] ;
+    coord[2] = 2 * coord[2] + elem_data.eval_map[ node_local ][2] ;
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< typename Scalar , class Device , class ElementSpec >
+struct BoxMeshFixture {
+
+  typedef Scalar  coordinate_scalar_type ;
+  typedef Device  execution_space ;
+
+  static const unsigned element_node_count = ElementSpec::element_node_count ;
+
+  typedef HybridFEM::FEMesh< coordinate_scalar_type ,
+                             element_node_count ,
+                             execution_space > FEMeshType ;
+
+  typedef typename FEMeshType::node_coords_type    node_coords_type ;
+  typedef typename FEMeshType::elem_node_ids_type  elem_node_ids_type ;
+  typedef typename FEMeshType::node_elem_ids_type  node_elem_ids_type ;
+
+
+  static void verify(
+    const typename FEMeshType::node_coords_type::HostMirror   & node_coords ,
+    const typename FEMeshType::elem_node_ids_type::HostMirror & elem_node_ids ,
+    const typename FEMeshType::node_elem_ids_type::HostMirror & node_elem_ids )
+  {
+    typedef typename FEMeshType::size_type         size_type ;
+    //typedef typename node_coords_type::value_type  coords_type ; // unused
+
+    const size_type node_count_total = node_coords.dimension_0();
+    const size_type elem_count_total = elem_node_ids.dimension_0();
+
+    const ElementSpec element ;
+
+    for ( size_type node_index = 0 ;
+                    node_index < node_count_total ; ++node_index ) {
+
+      for ( size_type
+              j = node_elem_ids.row_map[ node_index ] ;
+              j < node_elem_ids.row_map[ node_index + 1 ] ; ++j ) {
+
+        const size_type elem_index = node_elem_ids.entries(j,0);
+        const size_type node_local = node_elem_ids.entries(j,1);
+        const size_type en_id      = elem_node_ids(elem_index,node_local);
+
+        if ( node_index != en_id ) {
+          std::ostringstream msg ;
+          msg << "BoxMeshFixture node_elem_ids error"
+              << " : node_index(" << node_index
+              << ") entry(" << j
+              << ") elem_index(" << elem_index
+              << ") node_local(" << node_local
+              << ") elem_node_id(" << en_id
+              << ")" ;
+          throw std::runtime_error( msg.str() );
+        }
+      }
+    }
+
+    for ( size_type elem_index = 0 ;
+                    elem_index < elem_count_total; ++elem_index ) {
+
+      coordinate_scalar_type elem_node_coord[ element_node_count ][3] ;
+
+      for ( size_type nn = 0 ; nn < element_node_count ; ++nn ) {
+        const size_type node_index = elem_node_ids( elem_index , nn );
+
+        for ( size_type nc = 0 ; nc < 3 ; ++nc ) {
+          elem_node_coord[nn][nc] = node_coords( node_index , nc );
+        }
+      }
+
+
+      for ( size_type nn = 0 ; nn < element_node_count ; ++nn ) {
+
+        const unsigned ix = element.elem_data.eval_map[nn][0] ;
+        const unsigned iy = element.elem_data.eval_map[nn][1] ;
+        const unsigned iz = element.elem_data.eval_map[nn][2] ;
+
+        if ( elem_node_coord[nn][0] != elem_node_coord[0][0] + ix ||
+             elem_node_coord[nn][1] != elem_node_coord[0][1] + iy ||
+             elem_node_coord[nn][2] != elem_node_coord[0][2] + iz ) {
+
+          std::ostringstream msg ;
+          msg << "BoxMeshFixture elem_node_coord mapping failure { "
+              << elem_node_coord[nn][0] << " "
+              << elem_node_coord[nn][1] << " "
+              << elem_node_coord[nn][2] << " } != { "
+              << elem_node_coord[ 0][0] + ix << " "
+              << elem_node_coord[ 0][1] + iy << " "
+              << elem_node_coord[ 0][2] + iz
+              << " }" ;
+          throw std::runtime_error( msg.str() );
+        }
+      }
+    }
+  }
+
+  //------------------------------------
+  // Initialize element-node connectivity:
+  // Order elements that only depend on owned nodes first.
+  // These elements could be computed while waiting for
+  // received node data.
+
+  static void layout_elements_interior_exterior(
+    const BoxType                vertex_box_local_used ,
+    const BoxType                vertex_box_local_owned ,
+    const BoxType                node_box_local_used ,
+    const std::vector<size_t> &  node_used_id_map ,
+    const ElementSpec            element_fixture ,
+    const size_t                 elem_count_interior ,
+    const typename elem_node_ids_type::HostMirror elem_node_ids )
+  {
+    size_t elem_index_interior = 0 ;
+    size_t elem_index_boundary = elem_count_interior ;
+
+    for ( size_t iz = vertex_box_local_used[2][0] ;
+                 iz < vertex_box_local_used[2][1] - 1 ; ++iz ) {
+    for ( size_t iy = vertex_box_local_used[1][0] ;
+                 iy < vertex_box_local_used[1][1] - 1 ; ++iy ) {
+    for ( size_t ix = vertex_box_local_used[0][0] ;
+                 ix < vertex_box_local_used[0][1] - 1 ; ++ix ) {
+
+      size_t elem_index ;
+
+      // If lower and upper vertices are owned then element is interior
+      if ( contain( vertex_box_local_owned, ix,   iy,   iz ) &&
+           contain( vertex_box_local_owned, ix+1, iy+1, iz+1 ) ) {
+        elem_index = elem_index_interior++ ;
+      }
+      else {
+        elem_index = elem_index_boundary++ ;
+      }
+
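+      // Map each element-local node ordinal to its grid coordinate and then
+      // to the local node id within the used node box.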
+      for ( size_t nn = 0 ; nn < element_node_count ; ++nn ) {
+        unsigned coord[3] = { static_cast<unsigned>(ix) , static_cast<unsigned>(iy) , static_cast<unsigned>(iz) };
+
+        element_fixture.elem_to_node( nn , coord );
+
+        const size_t node_local_id =
+          box_map_id( node_box_local_used ,
+                      node_used_id_map ,
+                      coord[0] , coord[1] , coord[2] );
+
+        elem_node_ids( elem_index , nn ) = node_local_id ;
+      }
+    }}}
+  }
+
+  //------------------------------------
+  // Nested partitioning of elements by number of thread 'gangs'
+
+  static void layout_elements_partitioned(
+    const BoxType                vertex_box_local_used ,
+    const BoxType                /*vertex_box_local_owned*/ ,
+    const BoxType                node_box_local_used ,
+    const std::vector<size_t> &  node_used_id_map ,
+    const ElementSpec            element_fixture ,
+    const size_t                 thread_gang_count ,
+    const typename elem_node_ids_type::HostMirror elem_node_ids )
+  {
+    std::vector< BoxType > element_box_gangs( thread_gang_count );
+
+    BoxType element_box_local_used = vertex_box_local_used ;
+
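+    // Elements fill the cells between vertices, so the element box is one
+    // smaller than the vertex box along each axis.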
+    element_box_local_used[0][1] -= 1 ;
+    element_box_local_used[1][1] -= 1 ;
+    element_box_local_used[2][1] -= 1 ;
+
+    box_partition_rcb( element_box_local_used , element_box_gangs );
+
+    size_t elem_index = 0 ;
+
+    for ( size_t ig = 0 ; ig < thread_gang_count ; ++ig ) {
+
+      const BoxType box = element_box_gangs[ig] ;
+
+      for ( size_t iz = box[2][0] ; iz < box[2][1] ; ++iz ) {
+      for ( size_t iy = box[1][0] ; iy < box[1][1] ; ++iy ) {
+      for ( size_t ix = box[0][0] ; ix < box[0][1] ; ++ix , ++elem_index ) {
+
+        for ( size_t nn = 0 ; nn < element_node_count ; ++nn ) {
+          unsigned coord[3] = { static_cast<unsigned>(ix) , static_cast<unsigned>(iy) , static_cast<unsigned>(iz) };
+
+          element_fixture.elem_to_node( nn , coord );
+
+          const size_t node_local_id =
+            box_map_id( node_box_local_used ,
+                        node_used_id_map ,
+                        coord[0] , coord[1] , coord[2] );
+
+          elem_node_ids( elem_index , nn ) = node_local_id ;
+        }
+      }}}
+    }
+  }
+
+  //------------------------------------
+
+  static FEMeshType create( const size_t proc_count ,
+                            const size_t proc_local ,
+                            const size_t gang_count ,
+                            const size_t elems_x ,
+                            const size_t elems_y ,
+                            const size_t elems_z ,
+                            const double x_coord_curve = 1 ,
+                            const double y_coord_curve = 1 ,
+                            const double z_coord_curve = 1 )
+  {
+    const size_t vertices_x = elems_x + 1 ;
+    const size_t vertices_y = elems_y + 1 ;
+    const size_t vertices_z = elems_z + 1 ;
+
+    const BoxBoundsLinear vertex_box_bounds ;
+    const ElementSpec element ;
+
+    // Partition based upon vertices:
+
+    BoxType vertex_box_global ;
+    std::vector< BoxType > vertex_box_parts( proc_count );
+
+    vertex_box_global[0][0] = 0 ; vertex_box_global[0][1] = vertices_x ;
+    vertex_box_global[1][0] = 0 ; vertex_box_global[1][1] = vertices_y ;
+    vertex_box_global[2][0] = 0 ; vertex_box_global[2][1] = vertices_z ;
+
+    box_partition_rcb( vertex_box_global , vertex_box_parts );
+
+    const BoxType vertex_box_local_owned = vertex_box_parts[ proc_local ];
+
+    // Determine interior and used vertices:
+
+    BoxType vertex_box_local_interior ;
+    BoxType vertex_box_local_used ;
+
+    vertex_box_bounds.apply( vertex_box_global ,
+                             vertex_box_local_owned ,
+                             vertex_box_local_interior ,
+                             vertex_box_local_used );
+
+    // Element counts:
+
+    const long local_elems_x =
+      ( vertex_box_local_used[0][1] - vertex_box_local_used[0][0] ) - 1 ;
+    const long local_elems_y =
+      ( vertex_box_local_used[1][1] - vertex_box_local_used[1][0] ) - 1 ;
+    const long local_elems_z =
+      ( vertex_box_local_used[2][1] - vertex_box_local_used[2][0] ) - 1 ;
+
+    const size_t elem_count_total = std::max( long(0) , local_elems_x ) *
+                                    std::max( long(0) , local_elems_y ) *
+                                    std::max( long(0) , local_elems_z );
+
+    const long interior_elems_x =
+      ( vertex_box_local_owned[0][1] - vertex_box_local_owned[0][0] ) - 1 ;
+    const long interior_elems_y =
+      ( vertex_box_local_owned[1][1] - vertex_box_local_owned[1][0] ) - 1 ;
+    const long interior_elems_z =
+      ( vertex_box_local_owned[2][1] - vertex_box_local_owned[2][0] ) - 1 ;
+
+    const size_t elem_count_interior = std::max( long(0) , interior_elems_x ) *
+                                       std::max( long(0) , interior_elems_y ) *
+                                       std::max( long(0) , interior_elems_z );
+
+    // Expand vertex boxes to node boxes:
+
+    BoxType node_box_global ;
+    BoxType node_box_local_used ;
+    std::vector< BoxType > node_box_parts ;
+
+    element.create_node_boxes_from_vertex_boxes(
+      vertex_box_global , vertex_box_parts ,
+      node_box_global , node_box_parts );
+
+    // Node communication maps:
+
+    size_t node_count_interior = 0 ;
+    size_t node_count_owned    = 0 ;
+    size_t node_count_total    = 0 ;
+    std::vector<size_t>                 node_used_id_map ;
+    std::vector<size_t>                 node_part_counts ;
+    std::vector< std::vector<size_t> >  node_send_map ;
+
+    box_partition_maps( node_box_global ,
+                        node_box_parts ,
+                        element.box_bounds ,
+                        proc_local ,
+                        node_box_local_used ,
+                        node_used_id_map ,
+                        node_count_interior ,
+                        node_count_owned ,
+                        node_count_total ,
+                        node_part_counts ,
+                        node_send_map );
+
+    size_t node_count_send = 0 ;
+    for ( size_t i = 0 ; i < node_send_map.size() ; ++i ) {
+      node_count_send += node_send_map[i].size();
+    }
+
+    size_t recv_msg_count = 0 ;
+    size_t send_msg_count = 0 ;
+    size_t send_count = 0 ;
+
+    for ( size_t i = 1 ; i < proc_count ; ++i ) {
+      if ( node_part_counts[i] ) ++recv_msg_count ;
+      if ( node_send_map[i].size() ) {
+        ++send_msg_count ;
+        send_count += node_send_map[i].size();
+      }
+    }
+
+    // Finite element mesh:
+
+    FEMeshType mesh ;
+
+    if ( node_count_total ) {
+      mesh.node_coords = node_coords_type( "node_coords", node_count_total );
+    }
+
+    if ( elem_count_total ) {
+      mesh.elem_node_ids =
+        elem_node_ids_type( "elem_node_ids", elem_count_total );
+    }
+
+    mesh.parallel_data_map.assign( node_count_interior ,
+                                   node_count_owned ,
+                                   node_count_total ,
+                                   recv_msg_count ,
+                                   send_msg_count ,
+                                   send_count );
+
+    typename node_coords_type::HostMirror node_coords =
+      Kokkos::create_mirror( mesh.node_coords );
+
+    typename elem_node_ids_type::HostMirror elem_node_ids =
+      Kokkos::create_mirror( mesh.elem_node_ids );
+
+    //------------------------------------
+    // set node coordinates to grid location for subsequent verification
+
+    for ( size_t iz = node_box_local_used[2][0] ;
+                 iz < node_box_local_used[2][1] ; ++iz ) {
+
+    for ( size_t iy = node_box_local_used[1][0] ;
+                 iy < node_box_local_used[1][1] ; ++iy ) {
+
+    for ( size_t ix = node_box_local_used[0][0] ;
+                 ix < node_box_local_used[0][1] ; ++ix ) {
+
+      const size_t node_local_id =
+        box_map_id( node_box_local_used , node_used_id_map , ix , iy , iz );
+
+      node_coords( node_local_id , 0 ) = ix ;
+      node_coords( node_local_id , 1 ) = iy ;
+      node_coords( node_local_id , 2 ) = iz ;
+    }}}
+
+    //------------------------------------
+    // Initialize element-node connectivity:
+
+    if ( 1 < gang_count ) {
+      layout_elements_partitioned( vertex_box_local_used ,
+                                   vertex_box_local_owned ,
+                                   node_box_local_used ,
+                                   node_used_id_map ,
+                                   element ,
+                                   gang_count ,
+                                   elem_node_ids );
+    }
+    else {
+      layout_elements_interior_exterior( vertex_box_local_used ,
+                                         vertex_box_local_owned ,
+                                         node_box_local_used ,
+                                         node_used_id_map ,
+                                         element ,
+                                         elem_count_interior ,
+                                         elem_node_ids );
+    }
+
+    //------------------------------------
+    // Populate node->element connectivity:
+
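+    // node_elem_work first counts the elements attached to each node to size
+    // the static CRS graph, then is reused below as a fill cursor per row.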
+    std::vector<size_t> node_elem_work( node_count_total , (size_t) 0 );
+
+    for ( size_t i = 0 ; i < elem_count_total ; ++i ) {
+      for ( size_t n = 0 ; n < element_node_count  ; ++n ) {
+        ++node_elem_work[ elem_node_ids(i,n) ];
+      }
+    }
+
+    mesh.node_elem_ids =
+      Kokkos::create_staticcrsgraph< node_elem_ids_type >( "node_elem_ids" , node_elem_work );
+
+    typename node_elem_ids_type::HostMirror
+      node_elem_ids = Kokkos::create_mirror( mesh.node_elem_ids );
+
+    for ( size_t i = 0 ; i < node_count_total ; ++i ) {
+      node_elem_work[i] = node_elem_ids.row_map[i];
+    }
+
+    // Looping in element order ensures each node's list of elements
+    // is sorted by element index.
+
+    for ( size_t i = 0 ; i < elem_count_total ; ++i ) {
+      for ( size_t n = 0 ; n < element_node_count ; ++n ) {
+        const unsigned nid = elem_node_ids(i, n);
+        const unsigned j = node_elem_work[nid] ; ++node_elem_work[nid] ;
+
+        node_elem_ids.entries( j , 0 ) = i ;
+        node_elem_ids.entries( j , 1 ) = n ;
+      }
+    }
+    //------------------------------------
+    // Verify setup with node coordinates matching grid indices.
+    verify( node_coords , elem_node_ids , node_elem_ids );
+
+    //------------------------------------
+    // Scale node coordinates to problem extent with
+    // nonlinear mapping.
+    {
+      const double problem_extent[3] =
+        { static_cast<double>( vertex_box_global[0][1] - 1 ) ,
+          static_cast<double>( vertex_box_global[1][1] - 1 ) ,
+          static_cast<double>( vertex_box_global[2][1] - 1 ) };
+
+      const double grid_extent[3] =
+        { static_cast<double>( node_box_global[0][1] - 1 ) ,
+          static_cast<double>( node_box_global[1][1] - 1 ) ,
+          static_cast<double>( node_box_global[2][1] - 1 ) };
+
+      for ( size_t i = 0 ; i < node_count_total ; ++i ) {
+        const double x_unit = node_coords(i,0) / grid_extent[0] ;
+        const double y_unit = node_coords(i,1) / grid_extent[1] ;
+        const double z_unit = node_coords(i,2) / grid_extent[2] ;
+
+        node_coords(i,0) = coordinate_scalar_type( problem_extent[0] * std::pow( x_unit , x_coord_curve ) );
+        node_coords(i,1) = coordinate_scalar_type( problem_extent[1] * std::pow( y_unit , y_coord_curve ) );
+        node_coords(i,2) = coordinate_scalar_type( problem_extent[2] * std::pow( z_unit , z_coord_curve ) );
+      }
+    }
+
+    Kokkos::deep_copy( mesh.node_coords ,   node_coords );
+    Kokkos::deep_copy( mesh.elem_node_ids , elem_node_ids );
+    Kokkos::deep_copy( mesh.node_elem_ids.entries , node_elem_ids.entries );
+
+    //------------------------------------
+    // Communication lists:
+    {
+      recv_msg_count = 0 ;
+      send_msg_count = 0 ;
+      send_count = 0 ;
+
+      for ( size_t i = 1 ; i < proc_count ; ++i ) {
+
+        // Order sends starting from the local processor rank
+        // to smooth out the number of messages simultaneously
+        // sent to a particular processor.
+
+        const int proc = ( proc_local + i ) % proc_count ;
+        if ( node_part_counts[i] ) {
+          mesh.parallel_data_map.host_recv(recv_msg_count,0) = proc ;
+          mesh.parallel_data_map.host_recv(recv_msg_count,1) = node_part_counts[i] ;
+          ++recv_msg_count ;
+        }
+        if ( node_send_map[i].size() ) {
+          mesh.parallel_data_map.host_send(send_msg_count,0) = proc ;
+          mesh.parallel_data_map.host_send(send_msg_count,1) = node_send_map[i].size() ;
+          for ( size_t j = 0 ; j < node_send_map[i].size() ; ++j , ++send_count ) {
+            mesh.parallel_data_map.host_send_item(send_count) = node_send_map[i][j] - node_count_interior ;
+          }
+          ++send_msg_count ;
+        }
+      }
+    }
+
+    return mesh ;
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_BOXMESHFIXTURE_HPP */
+
+
diff --git a/packages/kokkos/example/multi_fem/BoxMeshPartition.cpp b/packages/kokkos/example/multi_fem/BoxMeshPartition.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..99c93e5eb1a74929354a27299711a12725485a6c
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/BoxMeshPartition.cpp
@@ -0,0 +1,381 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <limits>
+#include <BoxMeshPartition.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace {
+
+void box_partition( size_t ip , size_t up ,
+                    const BoxType & box ,
+                    BoxType * const p_box )
+{
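+  // Recursive coordinate bisection: split the box along its longest axis and
+  // recurse, dividing the processor range [ip,up) into thirds when its size
+  // is divisible by three and into halves otherwise.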
+  const size_t np = up - ip ;
+
+  if ( 1 == np ) {
+    p_box[ip] = box ;
+  }
+  else {
+    // Choose axis with largest count:
+
+    const size_t n0 = box[0][1] - box[0][0] ;
+    const size_t n1 = box[1][1] - box[1][0] ;
+    const size_t n2 = box[2][1] - box[2][0] ;
+
+    const size_t axis = n2 > n1 ? ( n2 > n0 ? 2 : ( n1 > n0 ? 1 : 0 ) ) :
+                                  ( n1 > n0 ? 1 : 0 );
+
+    const size_t n = box[ axis ][1] - box[ axis ][0] ;
+
+    if ( 0 == np % 3 ) {
+      const size_t np_part = np / 3 ; // exact
+
+      const size_t nbox_low = (size_t)(( (double) n ) * ( 1.0 / 3.0 ));
+      const size_t nbox_mid = (size_t)(( (double) n ) * ( 2.0 / 3.0 ));
+
+      BoxType dbox_low = box ; // P = [ip,ip+np/3) 
+      BoxType dbox_mid = box ; // P = [ip+np/3,ip+2*np/3) 
+      BoxType dbox_upp = box ; // P = [ip+2*np/3,ip+np) 
+
+      dbox_low[ axis ][1] = box[ axis ][0] + nbox_low ;
+      dbox_mid[ axis ][1] = box[ axis ][0] + nbox_mid ;
+
+      dbox_mid[ axis ][0] = dbox_low[ axis ][1];
+      dbox_upp[ axis ][0] = dbox_mid[ axis ][1];
+
+      box_partition( ip,           ip +   np_part, dbox_low , p_box );
+      box_partition( ip+  np_part, ip + 2*np_part, dbox_mid , p_box );
+      box_partition( ip+2*np_part, up,             dbox_upp , p_box );
+    }
+    else {
+      const size_t np_low = np / 2 ; /* Rounded down */
+      const size_t nbox_low = (size_t)
+        (((double)n) * ( ((double) np_low ) / ((double) np ) ));
+
+      BoxType dbox_low = box ;
+      BoxType dbox_upp = box ;
+
+      dbox_low[ axis ][1] = dbox_low[ axis ][0] + nbox_low ; 
+      dbox_upp[ axis ][0] = dbox_low[ axis ][1];
+
+      box_partition( ip, ip + np_low, dbox_low , p_box );
+      box_partition( ip + np_low, up, dbox_upp , p_box );
+    }
+  }
+}
+
+size_t box_map_offset( const BoxType & local_use ,
+                       const size_t global_i ,
+                       const size_t global_j ,
+                       const size_t global_k )
+
+{
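+  // Linear offset (x fastest varying) of a global grid coordinate within the
+  // 'local_use' box; throws if the coordinate lies outside that box.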
+  const size_t max = std::numeric_limits<size_t>::max();
+
+  const size_t n[3] =
+    { local_use[0][1] - local_use[0][0] ,
+      local_use[1][1] - local_use[1][0] ,
+      local_use[2][1] - local_use[2][0] };
+
+  const size_t use[3] = {
+    ( global_i >= local_use[0][0] ? global_i - local_use[0][0] : max ) ,
+    ( global_j >= local_use[1][0] ? global_j - local_use[1][0] : max ) ,
+    ( global_k >= local_use[2][0] ? global_k - local_use[2][0] : max ) };
+
+  const size_t offset =
+    ( use[0] < n[0] && use[1] < n[1] && use[2] < n[2] ) ?
+    ( use[0] + n[0] * ( use[1] + n[1] * use[2] ) ) : max ;
+
+  if ( offset == max ) {
+    std::ostringstream msg ;
+    msg << "box_map_offset ERROR: "
+        << " use " << local_use
+        << " ( " << global_i
+        << " , " << global_j
+        << " , " << global_k
+        << " )" ;
+    throw std::runtime_error( msg.str() );
+  }
+
+  return offset ;
+}
+
+} // namespace
+
+//----------------------------------------------------------------------------
+
+void BoxBoundsLinear::apply(  const BoxType & box_global ,
+                              const BoxType & box_part ,
+                                    BoxType & box_interior ,
+                                    BoxType & box_use ) const
+{
+  const unsigned ghost = 1 ;
+
+  if ( 0 == count( box_part ) ) {
+    box_interior = box_part ;
+    box_use      = box_part ;
+  }
+  else {
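+    // box_use grows by one ghost layer clamped to the global box; box_interior
+    // shrinks by one layer except along faces on the global boundary.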
+    for ( size_t i = 0 ; i < 3 ; ++i ) {
+
+      box_interior[i][0] =
+        ( box_part[i][0] == box_global[i][0] )      ? box_part[i][0] : (
+        ( box_part[i][0] + ghost < box_part[i][1] ) ? box_part[i][0] + ghost : 
+                                                      box_part[i][1] );
+
+      box_interior[i][1] =
+        ( box_part[i][1] == box_global[i][1] )      ? box_part[i][1] : (
+        ( box_part[i][0] + ghost < box_part[i][1] ) ? box_part[i][1] - ghost :
+                                                      box_part[i][0] );
+
+      box_use[i][0] = 
+        ( box_part[i][0] > ghost + box_global[i][0] ) ? box_part[i][0] - ghost :
+                                                        box_global[i][0] ;
+      box_use[i][1] = 
+        ( box_part[i][1] + ghost < box_global[i][1] ) ? box_part[i][1] + ghost :
+                                                        box_global[i][1] ;
+    }
+  }
+}
+
+void BoxBoundsQuadratic::apply( const BoxType & box_global ,
+                                const BoxType & box_part ,
+                                      BoxType & box_interior ,
+                                      BoxType & box_use ) const
+{
+  if ( 0 == count( box_part ) ) {
+    box_interior = box_part ;
+    box_use      = box_part ;
+  }
+  else {
+    for ( size_t i = 0 ; i < 3 ; ++i ) {
+      const bool odd = ( box_part[i][0] - box_global[i][0] ) & 01 ;
+
+      const unsigned ghost = odd ? 1 : 2 ;
+
+      box_interior[i][0] =
+        ( box_part[i][0] == box_global[i][0] )      ? box_part[i][0] : (
+        ( box_part[i][0] + ghost < box_part[i][1] ) ? box_part[i][0] + ghost : 
+                                                      box_part[i][1] );
+
+      box_interior[i][1] =
+        ( box_part[i][1] == box_global[i][1] )      ? box_part[i][1] : (
+        ( box_part[i][0] + ghost < box_part[i][1] ) ? box_part[i][1] - ghost :
+                                                      box_part[i][0] );
+
+      box_use[i][0] = 
+        ( box_part[i][0] > ghost + box_global[i][0] ) ? box_part[i][0] - ghost :
+                                                        box_global[i][0] ;
+      box_use[i][1] = 
+        ( box_part[i][1] + ghost < box_global[i][1] ) ? box_part[i][1] + ghost :
+                                                        box_global[i][1] ;
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+void box_partition_rcb( const BoxType        & root_box ,
+                        std::vector<BoxType> & part_boxes )
+{
+  const BoxBoundsLinear use_boxes ;
+
+  const size_t part_count = part_boxes.size();
+
+  box_partition( 0 , part_count , root_box , & part_boxes[0] );
+
+  // Verify partitioning
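+  // Each part must satisfy box_interior <= part <= box_use (by containment
+  // and count), parts must be pairwise disjoint, and the per-part counts
+  // must sum to the root box count.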
+
+  size_t total_cell = 0 ;
+
+  for ( size_t i = 0 ; i < part_count ; ++i ) {
+
+    total_cell += count( part_boxes[i] );
+
+    BoxType box_interior , box_use ;
+
+    use_boxes.apply( root_box , part_boxes[i] , box_interior , box_use );
+
+    if ( count( box_use ) < count( part_boxes[i] ) ||
+         count( part_boxes[i] ) < count( box_interior ) ||
+         part_boxes[i] != intersect( part_boxes[i] , box_use ) ||
+         box_interior  != intersect( part_boxes[i] , box_interior )) {
+
+      std::ostringstream msg ;
+
+      msg << "box_partition_rcb ERROR : "
+          << "part_boxes[" << i << "] = "
+          << part_boxes[i]
+          << " use " << box_use
+          << " interior " << box_interior
+          << std::endl 
+          << "  part ^ use " << intersect( part_boxes[i] , box_use )
+          << "  part ^ interior " << intersect( part_boxes[i] , box_interior );
+
+      throw std::runtime_error( msg.str() );
+    }
+
+    for ( size_t j = i + 1 ; j < part_count ; ++j ) {
+      const BoxType tmp = intersect( part_boxes[i] , part_boxes[j] );
+
+      if ( count( tmp ) ) {
+        throw std::runtime_error( std::string("box partition intersection") );
+      }
+    }
+  }
+
+  if ( total_cell != count( root_box ) ) {
+    throw std::runtime_error( std::string("box partition count") );
+  }
+}
+
+//----------------------------------------------------------------------------
+         
+size_t box_map_id( const BoxType & local_use ,
+                   const std::vector<size_t> & local_use_id_map ,
+                   const size_t global_i ,
+                   const size_t global_j ,
+                   const size_t global_k )
+
+{
+  const size_t offset =
+    box_map_offset( local_use , global_i , global_j , global_k );
+  return local_use_id_map[ offset ];
+}
+         
+//----------------------------------------------------------------------------
+
+void box_partition_maps( const BoxType              & root_box ,
+                         const std::vector<BoxType> & part_boxes ,
+                         const BoxBounds            & use_boxes ,
+                         const size_t          my_part ,
+                         BoxType             & my_use_box ,
+                         std::vector<size_t> & my_use_id_map ,
+                         size_t              & my_count_interior ,
+                         size_t              & my_count_owned ,
+                         size_t              & my_count_uses ,
+                         std::vector<size_t> & my_part_counts ,
+                         std::vector<std::vector<size_t> > & my_send_map )
+{
+  const size_t np = part_boxes.size();
+
+  if ( np <= my_part ) {
+    std::ostringstream msg ;
+    msg << "box_partition_maps ERROR : "
+        << " np(" << np << ") <= my_part(" << my_part << ")" ;
+    throw std::runtime_error( msg.str() );
+  }
+
+  const BoxType my_owned_box = part_boxes[my_part];
+  BoxType my_interior_box ;
+
+
+  use_boxes.apply( root_box, my_owned_box, my_interior_box, my_use_box );
+
+  my_count_interior = count( my_interior_box );
+  my_count_owned    = count( my_owned_box );
+  my_count_uses     = count( my_use_box );
+
+  my_use_id_map.assign( my_count_uses , std::numeric_limits<size_t>::max() );
+
+  // Order ids as { owned-interior , owned-parallel , received_{(p+i)%np} }
+
+  size_t offset_interior = 0 ;
+  size_t offset_parallel = my_count_interior ;
+
+  for ( size_t iz = my_owned_box[2][0] ; iz < my_owned_box[2][1] ; ++iz ) {
+  for ( size_t iy = my_owned_box[1][0] ; iy < my_owned_box[1][1] ; ++iy ) {
+  for ( size_t ix = my_owned_box[0][0] ; ix < my_owned_box[0][1] ; ++ix ) {
+    const size_t offset = box_map_offset( my_use_box , ix , iy , iz );
+    if ( contain( my_interior_box , ix , iy , iz ) ) {
+      my_use_id_map[ offset ] = offset_interior++ ;
+    }
+    else {
+      my_use_id_map[ offset ] = offset_parallel++ ;
+    }
+  }}}
+
+
+  my_part_counts.assign( np , (size_t) 0 );
+  my_send_map.assign( np , std::vector<size_t>() );
+
+  my_part_counts[0] = my_count_owned ;
+
+  for ( size_t i = 1 ; i < np ; ++i ) {
+
+    const size_t ip = ( my_part + i ) % np ;
+
+    const BoxType p_owned_box = part_boxes[ip];
+    BoxType p_use_box , p_interior_box ;
+    use_boxes.apply( root_box, p_owned_box, p_interior_box, p_use_box );
+
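+    // recv_box : cells that part 'ip' owns and that lie in my used (ghosted) box.
+    // send_box : cells that I own and that lie in part 'ip's used (ghosted) box.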
+    const BoxType recv_box = intersect( my_use_box , p_owned_box );
+    const BoxType send_box = intersect( my_owned_box , p_use_box );
+
+    if ( 0 != ( my_part_counts[i] = count( recv_box ) ) ) {
+      for ( size_t iz = recv_box[2][0] ; iz < recv_box[2][1] ; ++iz ) {
+      for ( size_t iy = recv_box[1][0] ; iy < recv_box[1][1] ; ++iy ) {
+      for ( size_t ix = recv_box[0][0] ; ix < recv_box[0][1] ; ++ix ) {
+        const size_t offset = box_map_offset( my_use_box , ix , iy , iz );
+        my_use_id_map[ offset ] = offset_parallel++ ;
+      }}}
+    }
+
+    if ( 0 != count( send_box ) ) {
+      for ( size_t iz = send_box[2][0] ; iz < send_box[2][1] ; ++iz ) {
+      for ( size_t iy = send_box[1][0] ; iy < send_box[1][1] ; ++iy ) {
+      for ( size_t ix = send_box[0][0] ; ix < send_box[0][1] ; ++ix ) {
+        const size_t offset = box_map_offset( my_use_box , ix , iy , iz );
+
+        my_send_map[ i ].push_back( my_use_id_map[ offset ] );
+      }}}
+    }
+  }
+}
+
+
diff --git a/packages/kokkos/example/multi_fem/BoxMeshPartition.hpp b/packages/kokkos/example/multi_fem/BoxMeshPartition.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b8f43d2a22d5b47bcd137345a2d7a5e9e06e19ab
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/BoxMeshPartition.hpp
@@ -0,0 +1,210 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef BOXMESHPARTITION_HPP
+#define BOXMESHPARTITION_HPP
+
+#include <cstddef>
+#include <utility>
+#include <vector>
+#include <iostream>
+
+//----------------------------------------------------------------------------
+
+struct BoxType {
+  size_t data[3][2] ;
+
+  typedef size_t range_type[2] ;
+
+  inline
+  const range_type & operator[]( size_t i ) const { return data[i]; }
+
+  inline
+  range_type & operator[]( size_t i ) { return data[i]; }
+
+  inline
+  bool operator == ( const BoxType & rhs ) const
+  {
+    return data[0][0] == rhs.data[0][0] && data[0][1] == rhs.data[0][1] &&
+           data[1][0] == rhs.data[1][0] && data[1][1] == rhs.data[1][1] &&
+           data[2][0] == rhs.data[2][0] && data[2][1] == rhs.data[2][1] ;
+  }
+
+  inline
+  bool operator != ( const BoxType & rhs ) const
+  {
+    return data[0][0] != rhs.data[0][0] || data[0][1] != rhs.data[0][1] ||
+           data[1][0] != rhs.data[1][0] || data[1][1] != rhs.data[1][1] ||
+           data[2][0] != rhs.data[2][0] || data[2][1] != rhs.data[2][1] ;
+  }
+};
+
+inline
+size_t count( const BoxType & b )
+{
+  size_t n = 1 ;
+  for ( size_t i = 0 ; i < 3 ; ++i ) {
+    n *= b[i][1] > b[i][0] ? b[i][1] - b[i][0] : 0 ;
+  }
+  return n ;
+}
+
+inline
+bool contain( const BoxType & b , size_t i , size_t j , size_t k )
+{
+  return b[0][0] <= i && i < b[0][1] &&
+         b[1][0] <= j && j < b[1][1] &&
+         b[2][0] <= k && k < b[2][1] ;
+}
+
+inline
+BoxType intersect( const BoxType & x , const BoxType & y )
+{
+  BoxType z ;
+  for ( size_t i = 0 ; i < 3 ; ++i ) {
+    z[i][0] = std::max( x[i][0] , y[i][0] );    
+    z[i][1] = std::min( x[i][1] , y[i][1] );    
+  }
+
+  return z ;
+}
+
+inline
+std::ostream & operator << ( std::ostream & s , const BoxType & box )
+{
+  s << "{ "
+    << box[0][0] << " " << box[0][1] << " , "
+    << box[1][0] << " " << box[1][1] << " , "
+    << box[2][0] << " " << box[2][1] << " }" ;
+  return s ;
+}
+
+//----------------------------------------------------------------------------
+
+class BoxBounds {
+public:
+  /** \brief  Compute the interior and used (ghosted) boxes for a partitioned box */
+  virtual
+  void apply( const BoxType & box_global ,
+              const BoxType & box_part ,
+                    BoxType & box_interior ,
+                    BoxType & box_use ) const = 0 ;
+
+  virtual ~BoxBounds() {}
+  BoxBounds() {}
+};
+
+class BoxBoundsLinear : public BoxBounds
+{
+public:
+  /** \brief  Default bounds to one layer of ghosting */
+  virtual
+  void apply( const BoxType & box_global ,
+              const BoxType & box_part ,
+                    BoxType & box_interior ,
+                    BoxType & box_use ) const ;
+
+  virtual ~BoxBoundsLinear() {}
+  BoxBoundsLinear() {}
+};
+
+class BoxBoundsQuadratic : public BoxBounds {
+public:
+  /** \brief  Quadratic mesh: even ordinates have two layers,
+   *          odd ordinates have one layer.
+   */
+  virtual
+  void apply( const BoxType & box_global ,
+              const BoxType & box_part ,
+                    BoxType & box_interior ,
+                    BoxType & box_use ) const ;
+
+  virtual ~BoxBoundsQuadratic() {}
+  BoxBoundsQuadratic() {}
+};
+
+//----------------------------------------------------------------------------
+/* Partition box into part_boxes.size() sub-boxes */
+
+void box_partition_rcb( const BoxType        & root_box ,
+                        std::vector<BoxType> & part_boxes );
+
+//----------------------------------------------------------------------------
+/* Determine local id layout and communication maps for partitioned boxes.
+ *
+ *  Local ids are laid out as follows:
+ *    { [ owned-interior ids not sent ] ,
+ *      [ owned-boundary ids to be sent to other processes ] ,
+ *      [ received ids from processor ( my_part + 1 ) % part_count ]
+ *      [ received ids from processor ( my_part + 2 ) % part_count ]
+ *      [ received ids from processor ( my_part + 3 ) % part_count ]
+ *      ... };
+ *
+ *  This layout allows
+ *  (1) received data to be copied into a contiguous block of memory
+ *  (2) send data to be extracted from a contiguous block of memory.
+ */
+void box_partition_maps(
+  const BoxType              & root_box ,   // [in] Global box
+  const std::vector<BoxType> & part_boxes , // [in] Partitioned boxes
+  const BoxBounds            & use_boxes ,  // [in] Ghost boundaries
+  const size_t          my_part ,           // [in] My local part
+  BoxType             & my_use_box ,        // [out] My used box with ghost
+  std::vector<size_t> & my_use_id_map ,     // [out] Local ordering map
+  size_t              & my_count_interior , // [out] How many interior
+  size_t              & my_count_owned ,    // [out] How many owned
+  size_t              & my_count_uses ,     // [out] How many used
+  std::vector<size_t> & my_part_counts ,    // [out] Partitioning of my_use_id_map
+  std::vector<std::vector<size_t> > & my_send_map ); // [out] Send id map
+
+/*  Mapping of cartesian coordinate to local id */
+size_t box_map_id( const BoxType             & my_use_box ,
+                   const std::vector<size_t> & my_use_id_map ,
+                   const size_t global_i ,
+                   const size_t global_j ,
+                   const size_t global_k );
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef BOXMESHPARTITION_HPP */
+
diff --git a/packages/kokkos/example/multi_fem/CMakeLists.txt b/packages/kokkos/example/multi_fem/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e3a40bc26f0fb45a12d59ddcfa0f767c3988a6f9
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/CMakeLists.txt
@@ -0,0 +1,16 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+SET(SOURCES "")
+
+FILE(GLOB SOURCES *.cpp)
+
+SET(LIBRARIES kokkoscore)
+
+TRIBITS_ADD_EXECUTABLE(
+  multi_fem
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  )
+
diff --git a/packages/kokkos/example/multi_fem/Explicit.hpp b/packages/kokkos/example/multi_fem/Explicit.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..723cab861adb74b6c0d318bac457421530ed650d
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/Explicit.hpp
@@ -0,0 +1,452 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef EXPLICIT_DRIVER_HPP
+#define EXPLICIT_DRIVER_HPP
+
+#include <sys/time.h>
+#include <iostream>
+#include <iomanip>
+#include <cstdlib>
+#include <cmath>
+
+#include <impl/Kokkos_Timer.hpp>
+
+#include <ExplicitFunctors.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Explicit {
+
+struct PerformanceData {
+  double mesh_time ;
+  double init_time ;
+  double internal_force_time ;
+  double central_diff ;
+  double comm_time ;
+  size_t number_of_steps ;
+
+  PerformanceData()
+  : mesh_time(0)
+  , init_time(0)
+  , internal_force_time(0)
+  , central_diff(0)
+  , comm_time(0)
+  , number_of_steps(0)
+  {}
+
+  void best( const PerformanceData & rhs )
+  {
+    if ( rhs.mesh_time < mesh_time ) mesh_time = rhs.mesh_time ;
+    if ( rhs.init_time < init_time ) init_time = rhs.init_time ;
+    if ( rhs.internal_force_time < internal_force_time ) internal_force_time = rhs.internal_force_time ;
+    if ( rhs.central_diff < central_diff ) central_diff = rhs.central_diff ;
+    if ( rhs.comm_time < comm_time ) comm_time = rhs.comm_time ;
+  }
+};
+
+template< typename Scalar , class FixtureType >
+PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
+                     const int global_max_x ,
+                     const int global_max_y ,
+                     const int global_max_z ,
+                     const int steps ,
+                     const int print_sample )
+{
+  typedef Scalar                              scalar_type ;
+  typedef FixtureType                         fixture_type ;
+  typedef typename fixture_type::execution_space  execution_space ;
+  //typedef typename fixture_type::FEMeshType   mesh_type ; // unused
+
+  enum { ElementNodeCount = fixture_type::element_node_count };
+
+  const int NumStates = 2;
+
+  const int total_num_steps = steps ;
+
+  const Scalar user_dt = 5.0e-6;
+  //const Scalar  end_time = 0.0050;
+
+  // element block parameters
+  const Scalar  lin_bulk_visc = 0.0;
+  const Scalar  quad_bulk_visc = 0.0;
+
+  // const Scalar  lin_bulk_visc = 0.06;
+  // const Scalar  quad_bulk_visc = 1.2;
+  // const Scalar  hg_stiffness = 0.0;
+  // const Scalar  hg_viscosity = 0.0;
+  // const Scalar  hg_stiffness = 0.03;
+  // const Scalar  hg_viscosity = 0.001;
+
+  // material properties
+  const Scalar youngs_modulus=1.0e6;
+  const Scalar poissons_ratio=0.0;
+  const Scalar  density = 8.0e-4;
+
+  const comm::Machine machine = mesh.parallel_data_map.machine ;
+
+  PerformanceData perf_data ;
+
+  Kokkos::Timer wall_clock ;
+
+  //------------------------------------
+  // Generate fields
+
+  typedef Fields< scalar_type , execution_space > fields_type ;
+
+  fields_type mesh_fields( mesh ,
+                           lin_bulk_visc ,
+                           quad_bulk_visc ,
+                           youngs_modulus ,
+                           poissons_ratio ,
+                           density );
+
+  typename fields_type::node_coords_type::HostMirror
+    model_coords_h = Kokkos::create_mirror( mesh_fields.model_coords );
+
+  typename fields_type::geom_state_array_type::HostMirror
+    displacement_h = Kokkos::create_mirror( mesh_fields.displacement );
+
+  typename fields_type::geom_state_array_type::HostMirror
+    velocity_h = Kokkos::create_mirror( mesh_fields.velocity );
+
+  Kokkos::deep_copy( model_coords_h , mesh_fields.model_coords );
+
+  //------------------------------------
+  // Initialization
+
+  initialize_element<Scalar,execution_space>::apply( mesh_fields );
+  initialize_node<   Scalar,execution_space>::apply( mesh_fields );
+
+  const Scalar x_bc = global_max_x ;
+
+  // Initial condition on velocity to initiate a pulse along the X axis
+  {
+    const unsigned X = 0;
+    for (int inode = 0; inode< mesh_fields.num_nodes; ++inode) {
+      if ( model_coords_h(inode,X) == 0) {
+        velocity_h(inode,X,0) = 1.0e3;
+        velocity_h(inode,X,1) = 1.0e3;
+      }
+    }
+  }
+
+  Kokkos::deep_copy( mesh_fields.velocity , velocity_h );
+
+  //--------------------------------------------------------------------------
+  // We will call a sequence of functions.  These functions have been
+  // grouped into several functors to balance the number of global memory
+  // accesses versus requiring too many registers or too much L1 cache.
+  // Global memory accesses have read/write cost and memory subsystem contention cost.
+  //--------------------------------------------------------------------------
+
+  perf_data.init_time = comm::max( machine , wall_clock.seconds() );
+
+  // Parameters required for the internal force computations.
+
+  int current_state = 0;
+  int previous_state = 0;
+  int next_state = 0;
+
+  perf_data.number_of_steps = total_num_steps ;
+
+#if defined( KOKKOS_ENABLE_MPI )
+
+  typedef typename
+    fields_type::geom_state_array_type::value_type  comm_value_type ;
+
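+  // Six values are exchanged per node: 3 displacement + 3 velocity components.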
+  const unsigned comm_value_count = 6 ;
+
+  Kokkos::AsyncExchange< comm_value_type , execution_space ,
+                              Kokkos::ParallelDataMap >
+    comm_exchange( mesh.parallel_data_map , comm_value_count );
+
+#endif
+
+  for (int step = 0; step < total_num_steps; ++step) {
+
+    wall_clock.reset();
+
+    //------------------------------------------------------------------------
+#if defined( KOKKOS_ENABLE_MPI )
+    {
+      // Communicate "send" nodes' displacement and velocity next_state
+      // to the ghosted nodes.
+      // buffer packages: { { dx , dy , dz , vx , vy , vz }_node }
+
+      pack_state< Scalar , execution_space >
+        ::apply( comm_exchange.buffer() ,
+                 mesh.parallel_data_map.count_interior ,
+                 mesh.parallel_data_map.count_send ,
+                 mesh_fields , next_state );
+
+      comm_exchange.setup();
+
+      comm_exchange.send_receive();
+
+      unpack_state< Scalar , execution_space >
+        ::apply( mesh_fields , next_state ,
+                 comm_exchange.buffer() ,
+                 mesh.parallel_data_map.count_owned ,
+                 mesh.parallel_data_map.count_receive );
+
+      execution_space::fence();
+    }
+#endif
+
+    perf_data.comm_time += comm::max( machine , wall_clock.seconds() );
+
+    //------------------------------------------------------------------------
+    // rotate the states
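+    // (double-buffering between the NumStates = 2 stored geometry states)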
+
+    previous_state = current_state;
+    current_state = next_state;
+    ++next_state;
+    next_state %= NumStates;
+
+    wall_clock.reset();
+
+    // First kernel 'grad_hgop' combines two functions:
+    // gradient, velocity gradient
+    grad< Scalar , execution_space >::apply( mesh_fields ,
+                                         current_state ,
+                                         previous_state );
+
+    // Combine tensor decomposition and rotation functions.
+    decomp_rotate< Scalar , execution_space >::apply( mesh_fields ,
+                                                  current_state ,
+                                                  previous_state );
+
+    internal_force< Scalar , execution_space >::apply( mesh_fields ,
+                                                   user_dt ,
+                                                   current_state );
+
+    execution_space::fence();
+
+    perf_data.internal_force_time +=
+      comm::max( machine , wall_clock.seconds() );
+
+    wall_clock.reset();
+
+    // Assembly of elements' contributions to nodal force into
+    // a nodal force vector.  Update the accelerations, velocities,
+    // displacements.
+    // The same pattern can be used for matrix-free residual computations.
+    nodal_step< Scalar , execution_space >::apply( mesh_fields ,
+                                               x_bc ,
+                                               current_state,
+                                               next_state );
+    execution_space::fence();
+
+    perf_data.central_diff +=
+      comm::max( machine , wall_clock.seconds() );
+
+    if ( print_sample && 0 == step % 100 ) {
+      Kokkos::deep_copy( displacement_h , mesh_fields.displacement );
+      Kokkos::deep_copy( velocity_h ,     mesh_fields.velocity );
+
+      if ( 1 == print_sample ) {
+
+        std::cout << "step " << step
+                  << " : displacement(*,0,0) =" ;
+        for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
+          if ( model_coords_h(i,1) == 0 && model_coords_h(i,2) == 0 ) {
+            std::cout << " " << displacement_h(i,0,next_state);
+          }
+        }
+        std::cout << std::endl ;
+
+        const float tol = 1.0e-6 ;
+        const int yb = global_max_y ;
+        const int zb = global_max_z ;
+        std::cout << "step " << step
+                  << " : displacement(*," << yb << "," << zb << ") =" ;
+        for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
+          if ( fabs( model_coords_h(i,1) - yb ) < tol &&
+               fabs( model_coords_h(i,2) - zb ) < tol ) {
+            std::cout << " " << displacement_h(i,0,next_state);
+          }
+        }
+        std::cout << std::endl ;
+      }
+      else if ( 2 == print_sample ) {
+
+        const float tol = 1.0e-6 ;
+        const int xb = global_max_x / 2 ;
+        const int yb = global_max_y / 2 ;
+        const int zb = global_max_z / 2 ;
+
+        for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
+          if ( fabs( model_coords_h(i,0) - xb ) < tol &&
+               fabs( model_coords_h(i,1) - yb ) < tol &&
+               fabs( model_coords_h(i,2) - zb ) < tol ) {
+            std::cout << "step " << step
+                      << " : displacement("
+                      << xb << "," << yb << "," << zb << ") = {"
+                      << std::setprecision(6)
+                      << " " << displacement_h(i,0,next_state)
+                      << std::setprecision(2)
+                      << " " << displacement_h(i,1,next_state)
+                      << std::setprecision(2)
+                      << " " << displacement_h(i,2,next_state)
+                      << " }" << std::endl ;
+          }
+        }
+      }
+    }
+  }
+
+  return perf_data ;
+}
+
+
+template <typename Scalar, typename Device>
+static void driver( const char * const label ,
+                    comm::Machine machine ,
+                    const int gang_count ,
+                    const int elem_count_beg ,
+                    const int elem_count_end ,
+                    const int runs )
+{
+  typedef Scalar              scalar_type ;
+  typedef Device              execution_space ;
+  typedef double              coordinate_scalar_type ;
+  typedef FixtureElementHex8  fixture_element_type ;
+
+  typedef BoxMeshFixture< coordinate_scalar_type ,
+                          execution_space ,
+                          fixture_element_type > fixture_type ;
+
+  typedef typename fixture_type::FEMeshType mesh_type ;
+
+  const size_t proc_count = comm::size( machine );
+  const size_t proc_rank  = comm::rank( machine );
+
+  const int space = 15 ;
+  const int steps = 1000 ;
+  const int print_sample = 0 ;
+
+  if ( comm::rank( machine ) == 0 ) {
+
+    std::cout << std::endl ;
+    std::cout << "\"MiniExplicitDynamics with Kokkos " << label
+              << " time_steps(" << steps << ")"
+              << "\"" << std::endl;
+    std::cout << std::left << std::setw(space) << "\"Element\" , ";
+    std::cout << std::left << std::setw(space) << "\"Node\" , ";
+    std::cout << std::left << std::setw(space) << "\"Initialize\" , ";
+    std::cout << std::left << std::setw(space) << "\"ElemForce\" , ";
+    std::cout << std::left << std::setw(space) << "\"NodeUpdate\" , ";
+    std::cout << std::left << std::setw(space) << "\"NodeComm\" , ";
+    std::cout << std::left << std::setw(space) << "\"Time/Elem\" , ";
+    std::cout << std::left << std::setw(space) << "\"Time/Node\"";
+
+    std::cout << std::endl;
+
+    std::cout << std::left << std::setw(space) << "\"count\" , ";
+    std::cout << std::left << std::setw(space) << "\"count\" , ";
+    std::cout << std::left << std::setw(space) << "\"microsec\" , ";
+    std::cout << std::left << std::setw(space) << "\"microsec\" , ";
+    std::cout << std::left << std::setw(space) << "\"microsec\" , ";
+    std::cout << std::left << std::setw(space) << "\"microsec\" , ";
+    std::cout << std::left << std::setw(space) << "\"microsec\" , ";
+    std::cout << std::left << std::setw(space) << "\"microsec\"";
+
+    std::cout << std::endl;
+  }
+
+  for(int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
+  {
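+    // Choose a box with roughly a 2:1:1 aspect ratio in x:y:z whose element
+    // count approximates the requested size i.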
+    const int iz = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
+    const int iy = iz + 1 ;
+    const int ix = 2 * iy ;
+    const int nelem = ix * iy * iz ;
+    const int nnode = ( ix + 1 ) * ( iy + 1 ) * ( iz + 1 );
+
+    mesh_type mesh =
+      fixture_type::create( proc_count , proc_rank , gang_count ,
+                            ix , iy , iz );
+
+    mesh.parallel_data_map.machine = machine ;
+
+    PerformanceData perf , best ;
+
+    for(int j = 0; j < runs; j++){
+
+     perf = run<scalar_type,fixture_type>(mesh,ix,iy,iz,steps,print_sample);
+
+     if( j == 0 ) {
+       best = perf ;
+     }
+     else {
+       best.best( perf );
+     }
+   }
+
+   if ( comm::rank( machine ) == 0 ) {
+     double time_per_element =
+       ( best.internal_force_time ) / ( nelem * perf.number_of_steps );
+     double time_per_node =
+       ( best.comm_time + best.central_diff ) / ( nnode * perf.number_of_steps );
+
+   std::cout << std::setw(space-3) << nelem << " , "
+             << std::setw(space-3) << nnode << " , "
+             << std::setw(space-3) << best.number_of_steps << " , "
+             << std::setw(space-3) << best.init_time * 1000000 << " , "
+             << std::setw(space-3)
+             << ( best.internal_force_time * 1000000 ) / best.number_of_steps << " , "
+             << std::setw(space-3)
+             << ( best.central_diff * 1000000 ) / best.number_of_steps << " , "
+             << std::setw(space-3)
+             << ( best.comm_time * 1000000 ) / best.number_of_steps << " , "
+             << std::setw(space-3) << time_per_element * 1000000 << " , "
+             << std::setw(space-3) << time_per_node * 1000000
+             << std::endl ;
+    }
+  }
+}
+
+
+} // namespace Explicit
+
+#endif /* #ifndef EXPLICIT_DRIVER_HPP */
diff --git a/packages/kokkos/example/multi_fem/ExplicitFunctors.hpp b/packages/kokkos/example/multi_fem/ExplicitFunctors.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..43d21c6022d516ced0b0b2ce3c809e7be7eb1b60
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/ExplicitFunctors.hpp
@@ -0,0 +1,1471 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPLICITFUNCTORS_HPP
+#define KOKKOS_EXPLICITFUNCTORS_HPP
+
+#include <cmath>
+#include <Kokkos_Core.hpp>
+#include <FEMesh.hpp>
+
+namespace Explicit {
+
+template<typename Scalar , class Device >
+struct Fields {
+
+  static const int NumStates     = 2 ;
+  static const int SpatialDim    = 3 ;
+  static const int ElemNodeCount = 8 ;
+
+  // Indices for full 3x3 tensor:
+
+  static const int K_F_XX = 0 ;
+  static const int K_F_YY = 1 ;
+  static const int K_F_ZZ = 2 ;
+  static const int K_F_XY = 3 ;
+  static const int K_F_YZ = 4 ;
+  static const int K_F_ZX = 5 ;
+  static const int K_F_YX = 6 ;
+  static const int K_F_ZY = 7 ;
+  static const int K_F_XZ = 8 ;
+
+  //  Indices into a 3 by 3 symmetric tensor stored as a length 6 vector
+
+  static const int K_S_XX = 0 ;
+  static const int K_S_YY = 1 ;
+  static const int K_S_ZZ = 2 ;
+  static const int K_S_XY = 3 ;
+  static const int K_S_YZ = 4 ;
+  static const int K_S_ZX = 5 ;
+  static const int K_S_YX = 3 ;
+  static const int K_S_ZY = 4 ;
+  static const int K_S_XZ = 5 ;
+
+  //  Indices into a 3 by 3 skew symmetric tensor stored as a length 3 vector
+
+  static const int K_V_XY = 0 ;
+  static const int K_V_YZ = 1 ;
+  static const int K_V_ZX = 2 ;
+
+
+  typedef Device                           execution_space ;
+  typedef typename execution_space::size_type  size_type ;
+
+  typedef HybridFEM::FEMesh<double,ElemNodeCount,execution_space>  FEMesh ;
+
+  typedef typename FEMesh::node_coords_type    node_coords_type ;
+  typedef typename FEMesh::elem_node_ids_type  elem_node_ids_type ;
+  typedef typename FEMesh::node_elem_ids_type  node_elem_ids_type ;
+  typedef typename Kokkos::ParallelDataMap   parallel_data_map ;
+
+  typedef Kokkos::View< double[][ SpatialDim ][ NumStates ] , execution_space > geom_state_array_type ;
+  typedef Kokkos::View< Scalar[][ SpatialDim ] , execution_space > geom_array_type ;
+  typedef Kokkos::View< Scalar[] ,               execution_space > array_type ;
+  typedef Kokkos::View< Scalar ,                 execution_space >  scalar_type ;
+
+  typedef Kokkos::View< Scalar[][  6 ] ,    execution_space >  elem_sym_tensor_type ;
+  typedef Kokkos::View< Scalar[][  9 ] ,    execution_space >  elem_tensor_type ;
+  typedef Kokkos::View< Scalar[][  9 ][ NumStates ] , execution_space >  elem_tensor_state_type ;
+  typedef Kokkos::View< Scalar[][ SpatialDim ][ ElemNodeCount ] , execution_space > elem_node_geom_type ;
+
+  // Parameters:
+  const int num_nodes ;
+  const int num_nodes_owned ;
+  const int num_elements ;
+
+  const Scalar  lin_bulk_visc;
+  const Scalar  quad_bulk_visc;
+  const Scalar  two_mu;
+  const Scalar  bulk_modulus;
+  const Scalar  density;
+
+  // Mesh:
+  const elem_node_ids_type  elem_node_connectivity ;
+  const node_elem_ids_type  node_elem_connectivity ;
+  const node_coords_type    model_coords ;
+
+  // Compute:
+  const scalar_type                dt ;
+  const scalar_type                prev_dt ;
+  const geom_state_array_type      displacement ;
+  const geom_state_array_type      velocity ;
+  const geom_array_type            acceleration ;
+  const geom_array_type            internal_force ;
+  const array_type                 nodal_mass ;
+  const array_type                 elem_mass ;
+  const array_type                 internal_energy ;
+  const elem_sym_tensor_type       stress_new ;
+  const elem_tensor_state_type     rotation ;
+  const elem_node_geom_type        element_force ;
+  const elem_tensor_type           vel_grad ;
+  const elem_sym_tensor_type       stretch ;
+  const elem_sym_tensor_type       rot_stretch ;
+
+  Fields(
+      const FEMesh & mesh,
+      Scalar arg_lin_bulk_visc,
+      Scalar arg_quad_bulk_visc,
+      Scalar youngs_modulus,
+      Scalar poissons_ratio,
+      Scalar arg_density )
+    : num_nodes(       mesh.parallel_data_map.count_owned +
+                       mesh.parallel_data_map.count_receive )
+    , num_nodes_owned( mesh.parallel_data_map.count_owned )
+    , num_elements(    mesh.elem_node_ids.dimension_0() )
+    , lin_bulk_visc(  arg_lin_bulk_visc )
+    , quad_bulk_visc( arg_quad_bulk_visc )
+    , two_mu(youngs_modulus/(1.0+poissons_ratio))
+    , bulk_modulus(youngs_modulus/(3*(1.0-2.0*poissons_ratio)))
+    , density(arg_density)
+
+    // mesh
+
+    , elem_node_connectivity( mesh.elem_node_ids ) // ( num_elements , ElemNodeCount )
+    , node_elem_connectivity( mesh.node_elem_ids ) // ( num_nodes , ... )
+    , model_coords(  mesh.node_coords )            // ( num_nodes , 3 )
+
+    // compute with input/output
+
+    , dt(              "dt" )
+    , prev_dt(         "prev_dt" )
+    , displacement(    "displacement" ,   num_nodes )
+    , velocity(        "velocity" ,       num_nodes )
+    , acceleration(    "acceleration" ,   num_nodes_owned )
+    , internal_force(  "internal_force" , num_nodes_owned )
+    , nodal_mass(      "nodal_mass" ,     num_nodes_owned )
+    , elem_mass(       "elem_mass" ,       num_elements )
+    , internal_energy( "internal_energy" , num_elements )
+    , stress_new(      "stress_new" ,      num_elements )
+
+    // temporary arrays
+
+    , rotation(      "rotation" ,  num_elements )
+    , element_force( "element_force" ,  num_elements )
+    , vel_grad(      "vel_grad" , num_elements )
+    , stretch(       "stretch" , num_elements )
+    , rot_stretch(   "rot_stretch" , num_elements )
+  { }
+};
+
+
+//----------------------------------------------------------------------------
+
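+// Eight-term dot product (one term per hexahedron node); used to contract
+// nodal coordinate and velocity arrays with the gradient operators.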
+template< typename Scalar , class DeviceType >
+KOKKOS_INLINE_FUNCTION
+Scalar dot8( const Scalar * a , const Scalar * b )
+{ return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3] +
+         a[4] * b[4] + a[5] * b[5] + a[6] * b[6] + a[7] * b[7] ; }
+
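+// Gradient operator for an 8-node hexahedron: fills grad_x/y/z with per-node
+// derivative weights (the usual uniform-gradient hex formulation), so that
+// dot8( x , grad_x ) is proportional to the element volume.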
+template< typename Scalar , class DeviceType >
+KOKKOS_INLINE_FUNCTION
+void comp_grad( const Scalar * const x ,
+                const Scalar * const y ,
+                const Scalar * const z,
+                Scalar * const grad_x ,
+                Scalar * const grad_y ,
+                Scalar * const grad_z )
+{
+  //  calc X difference vectors
+
+  Scalar R42=(x[3] - x[1]);
+  Scalar R52=(x[4] - x[1]);
+  Scalar R54=(x[4] - x[3]);
+
+  Scalar R63=(x[5] - x[2]);
+  Scalar R83=(x[7] - x[2]);
+  Scalar R86=(x[7] - x[5]);
+
+  Scalar R31=(x[2] - x[0]);
+  Scalar R61=(x[5] - x[0]);
+  Scalar R74=(x[6] - x[3]);
+
+  Scalar R72=(x[6] - x[1]);
+  Scalar R75=(x[6] - x[4]);
+  Scalar R81=(x[7] - x[0]);
+
+  Scalar t1=(R63 + R54);
+  Scalar t2=(R61 + R74);
+  Scalar t3=(R72 + R81);
+
+  Scalar t4 =(R86 + R42);
+  Scalar t5 =(R83 + R52);
+  Scalar t6 =(R75 + R31);
+
+  //  Calculate Y gradient from X and Z data
+
+  grad_y[0] = (z[1] *  t1) - (z[2] * R42) - (z[3] *  t5)  + (z[4] *  t4) + (z[5] * R52) - (z[7] * R54);
+  grad_y[1] = (z[2] *  t2) + (z[3] * R31) - (z[0] *  t1)  - (z[5] *  t6) + (z[6] * R63) - (z[4] * R61);
+  grad_y[2] = (z[3] *  t3) + (z[0] * R42) - (z[1] *  t2)  - (z[6] *  t4) + (z[7] * R74) - (z[5] * R72);
+  grad_y[3] = (z[0] *  t5) - (z[1] * R31) - (z[2] *  t3)  + (z[7] *  t6) + (z[4] * R81) - (z[6] * R83);
+  grad_y[4] = (z[5] *  t3) + (z[6] * R86) - (z[7] *  t2)  - (z[0] *  t4) - (z[3] * R81) + (z[1] * R61);
+  grad_y[5] = (z[6] *  t5) - (z[4] *  t3)  - (z[7] * R75) + (z[1] *  t6) - (z[0] * R52) + (z[2] * R72);
+  grad_y[6] = (z[7] *  t1) - (z[5] *  t5)  - (z[4] * R86) + (z[2] *  t4) - (z[1] * R63) + (z[3] * R83);
+  grad_y[7] = (z[4] *  t2) - (z[6] *  t1)  + (z[5] * R75) - (z[3] *  t6) - (z[2] * R74) + (z[0] * R54);
+
+  //   calc Z difference vectors
+
+  R42=(z[3] - z[1]);
+  R52=(z[4] - z[1]);
+  R54=(z[4] - z[3]);
+
+  R63=(z[5] - z[2]);
+  R83=(z[7] - z[2]);
+  R86=(z[7] - z[5]);
+
+  R31=(z[2] - z[0]);
+  R61=(z[5] - z[0]);
+  R74=(z[6] - z[3]);
+
+  R72=(z[6] - z[1]);
+  R75=(z[6] - z[4]);
+  R81=(z[7] - z[0]);
+
+  t1=(R63 + R54);
+  t2=(R61 + R74);
+  t3=(R72 + R81);
+
+  t4 =(R86 + R42);
+  t5 =(R83 + R52);
+  t6 =(R75 + R31);
+
+  //  Calculate X gradient from Y and Z data
+
+  grad_x[0] = (y[1] *  t1) - (y[2] * R42) - (y[3] *  t5) + (y[4] *  t4) + (y[5] * R52) - (y[7] * R54);
+  grad_x[1] = (y[2] *  t2) + (y[3] * R31) - (y[0] *  t1) - (y[5] *  t6) + (y[6] * R63) - (y[4] * R61);
+  grad_x[2] = (y[3] *  t3) + (y[0] * R42) - (y[1] *  t2) - (y[6] *  t4) + (y[7] * R74) - (y[5] * R72);
+  grad_x[3] = (y[0] *  t5) - (y[1] * R31) - (y[2] *  t3) + (y[7] *  t6) + (y[4] * R81) - (y[6] * R83);
+  grad_x[4] = (y[5] *  t3) + (y[6] * R86) - (y[7] *  t2) - (y[0] *  t4) - (y[3] * R81) + (y[1] * R61);
+  grad_x[5] = (y[6] *  t5) - (y[4] *  t3) - (y[7] * R75) + (y[1] *  t6) - (y[0] * R52) + (y[2] * R72);
+  grad_x[6] = (y[7] *  t1) - (y[5] *  t5) - (y[4] * R86) + (y[2] *  t4) - (y[1] * R63) + (y[3] * R83);
+  grad_x[7] = (y[4] *  t2) - (y[6] *  t1) + (y[5] * R75) - (y[3] *  t6) - (y[2] * R74) + (y[0] * R54);
+
+  //  calc Y difference vectors
+
+  R42=(y[3] - y[1]);
+  R52=(y[4] - y[1]);
+  R54=(y[4] - y[3]);
+
+  R63=(y[5] - y[2]);
+  R83=(y[7] - y[2]);
+  R86=(y[7] - y[5]);
+
+  R31=(y[2] - y[0]);
+  R61=(y[5] - y[0]);
+  R74=(y[6] - y[3]);
+
+  R72=(y[6] - y[1]);
+  R75=(y[6] - y[4]);
+  R81=(y[7] - y[0]);
+
+  t1=(R63 + R54);
+  t2=(R61 + R74);
+  t3=(R72 + R81);
+
+  t4 =(R86 + R42);
+  t5 =(R83 + R52);
+  t6 =(R75 + R31);
+
+  //  Calculate Z gradient from X and Y data
+
+  grad_z[0] = (x[1] *  t1) - (x[2] * R42) - (x[3] *  t5)  + (x[4] *  t4) + (x[5] * R52) - (x[7] * R54);
+  grad_z[1] = (x[2] *  t2) + (x[3] * R31) - (x[0] *  t1)  - (x[5] *  t6) + (x[6] * R63) - (x[4] * R61);
+  grad_z[2] = (x[3] *  t3) + (x[0] * R42) - (x[1] *  t2)  - (x[6] *  t4) + (x[7] * R74) - (x[5] * R72);
+  grad_z[3] = (x[0] *  t5) - (x[1] * R31) - (x[2] *  t3)  + (x[7] *  t6) + (x[4] * R81) - (x[6] * R83);
+  grad_z[4] = (x[5] *  t3) + (x[6] * R86) - (x[7] *  t2)  - (x[0] *  t4) - (x[3] * R81) + (x[1] * R61);
+  grad_z[5] = (x[6] *  t5) - (x[4] *  t3)  - (x[7] * R75) + (x[1] *  t6) - (x[0] * R52) + (x[2] * R72);
+  grad_z[6] = (x[7] *  t1) - (x[5] *  t5)  - (x[4] * R86) + (x[2] *  t4) - (x[1] * R63) + (x[3] * R83);
+  grad_z[7] = (x[4] *  t2) - (x[6] *  t1)  + (x[5] * R75) - (x[3] *  t6) - (x[2] * R74) + (x[0] * R54);
+}
+
+//----------------------------------------------------------------------------
+
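+// One work-item per element: gathers nodal coordinates, initializes the
+// stretch and both rotation states to the identity, and computes the element
+// mass from the element volume.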
+template< typename Scalar , class DeviceType >
+struct initialize_element
+{
+  typedef DeviceType     execution_space ;
+
+  typedef Explicit::Fields< Scalar , execution_space > Fields ;
+
+  typename Fields::elem_node_ids_type      elem_node_connectivity ;
+  typename Fields::node_coords_type        model_coords ;
+  typename Fields::elem_sym_tensor_type    stretch ;
+  typename Fields::elem_tensor_state_type  rotation ;
+  typename Fields::array_type              elem_mass ;
+
+  const Scalar density ;
+
+  initialize_element( const Fields & mesh_fields )
+    : elem_node_connectivity( mesh_fields.elem_node_connectivity )
+    , model_coords(           mesh_fields.model_coords )
+    , stretch(                mesh_fields.stretch )
+    , rotation(               mesh_fields.rotation )
+    , elem_mass(              mesh_fields.elem_mass )
+    , density(                mesh_fields.density )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ielem )const
+  {
+    const int K_XX = 0 ;
+    const int K_YY = 1 ;
+    const int K_ZZ = 2 ;
+    const Scalar ONE12TH = 1.0 / 12.0 ;
+
+    Scalar x[ Fields::ElemNodeCount ];
+    Scalar y[ Fields::ElemNodeCount ];
+    Scalar z[ Fields::ElemNodeCount ];
+    Scalar grad_x[ Fields::ElemNodeCount ];
+    Scalar grad_y[ Fields::ElemNodeCount ];
+    Scalar grad_z[ Fields::ElemNodeCount ];
+
+    for ( int i = 0 ; i < Fields::ElemNodeCount ; ++i ) {
+      const int n = elem_node_connectivity( ielem , i );
+
+      x[i]  = model_coords( n , 0 );
+      y[i]  = model_coords( n , 1 );
+      z[i]  = model_coords( n , 2 );
+    }
+
+    comp_grad<Scalar,execution_space>( x, y, z, grad_x, grad_y, grad_z);
+
+    stretch(ielem,K_XX) = 1 ;
+    stretch(ielem,K_YY) = 1 ;
+    stretch(ielem,K_ZZ) = 1 ;
+
+    rotation(ielem,K_XX,0) = 1 ;
+    rotation(ielem,K_YY,0) = 1 ;
+    rotation(ielem,K_ZZ,0) = 1 ;
+
+    rotation(ielem,K_XX,1) = 1 ;
+    rotation(ielem,K_YY,1) = 1 ;
+    rotation(ielem,K_ZZ,1) = 1 ;
+
+    elem_mass(ielem) = ONE12TH * density *
+                                 dot8<Scalar,execution_space>( x , grad_x );
+  }
+
+  static void apply( const Fields & mesh_fields )
+  {
+    initialize_element op( mesh_fields );
+    Kokkos::parallel_for( mesh_fields.num_elements , op );
+  }
+};
+
+
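+// One work-item per owned node: sums the masses of the attached elements via
+// the CSR node->element map and lumps one ElemNodeCount-th of that sum onto
+// the node.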
+template<typename Scalar , class DeviceType >
+struct initialize_node
+{
+  typedef DeviceType     execution_space ;
+
+  typedef Explicit::Fields< Scalar , execution_space > Fields ;
+
+  typename Fields::node_elem_ids_type      node_elem_connectivity ;
+  typename Fields::array_type              nodal_mass ;
+  typename Fields::array_type              elem_mass ;
+
+  static const int ElemNodeCount = Fields::ElemNodeCount ;
+
+  initialize_node( const Fields & mesh_fields )
+    : node_elem_connectivity( mesh_fields.node_elem_connectivity )
+    , nodal_mass(             mesh_fields.nodal_mass )
+    , elem_mass(              mesh_fields.elem_mass )
+    {}
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int inode )const
+  {
+    const int begin = node_elem_connectivity.row_map[inode];
+    const int end   = node_elem_connectivity.row_map[inode+1];
+
+    Scalar node_mass = 0;
+
+    for(int i = begin; i != end; ++i) {
+      const int elem_id = node_elem_connectivity.entries( i , 0 );
+      node_mass += elem_mass(elem_id);
+    }
+
+    nodal_mass(inode) = node_mass / ElemNodeCount ;
+  }
+
+  static void apply( const Fields & mesh_fields )
+  {
+    initialize_node op( mesh_fields );
+    Kokkos::parallel_for( mesh_fields.num_nodes_owned , op );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+
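+// One work-item per element: gathers nodal velocities and the mid-step
+// coordinates, evaluates the gradient operator, and stores the 3x3 velocity
+// gradient tensor.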
+template<typename Scalar, class DeviceType >
+struct grad
+{
+  typedef DeviceType execution_space ;
+
+  typedef Explicit::Fields< Scalar , execution_space >  Fields ;
+
+  static const int ElemNodeCount = Fields::ElemNodeCount ;
+
+  static const int K_F_XX = Fields::K_F_XX ;
+  static const int K_F_YY = Fields::K_F_YY ;
+  static const int K_F_ZZ = Fields::K_F_ZZ ;
+  static const int K_F_XY = Fields::K_F_XY ;
+  static const int K_F_YZ = Fields::K_F_YZ ;
+  static const int K_F_ZX = Fields::K_F_ZX ;
+  static const int K_F_YX = Fields::K_F_YX ;
+  static const int K_F_ZY = Fields::K_F_ZY ;
+  static const int K_F_XZ = Fields::K_F_XZ ;
+
+  // Global arrays used by this functor.
+
+  const typename Fields::elem_node_ids_type     elem_node_connectivity ;
+  const typename Fields::node_coords_type       model_coords ;
+  const typename Fields::geom_state_array_type  displacement ;
+  const typename Fields::geom_state_array_type  velocity ;
+  const typename Fields::elem_tensor_type       vel_grad ;
+  const typename Fields::scalar_type            dt ;
+
+  const int  current_state;
+  const int  previous_state;
+
+  // Constructor on the Host to populate this device functor.
+  // All array view copies are shallow.
+  grad( const Fields &  fields,
+        const int arg_current_state,
+        const int arg_previous_state)
+    : elem_node_connectivity( fields.elem_node_connectivity)
+    , model_coords( fields.model_coords)
+    , displacement( fields.displacement)
+    , velocity( fields.velocity)
+    , vel_grad( fields.vel_grad)
+    , dt(  fields.dt)
+    , current_state(arg_current_state)
+    , previous_state(arg_previous_state)
+    { }
+
+  //--------------------------------------------------------------------------
+
+    //   Calculate Velocity Gradients
+    KOKKOS_INLINE_FUNCTION
+    void v_grad(  int ielem,
+      Scalar * vx,       Scalar * vy,       Scalar * vz,
+      Scalar * grad_x,     Scalar * grad_y,     Scalar * grad_z,
+      Scalar inv_vol) const
+    {
+      const int K_F_XX = Fields::K_F_XX ;
+      const int K_F_YY = Fields::K_F_YY ;
+      const int K_F_ZZ = Fields::K_F_ZZ ;
+      const int K_F_XY = Fields::K_F_XY ;
+      const int K_F_YZ = Fields::K_F_YZ ;
+      const int K_F_ZX = Fields::K_F_ZX ;
+      const int K_F_YX = Fields::K_F_YX ;
+      const int K_F_ZY = Fields::K_F_ZY ;
+      const int K_F_XZ = Fields::K_F_XZ ;
+
+      vel_grad(ielem, K_F_XX) = inv_vol * dot8<Scalar,execution_space>( vx , grad_x );
+      vel_grad(ielem, K_F_YX) = inv_vol * dot8<Scalar,execution_space>( vy , grad_x );
+      vel_grad(ielem, K_F_ZX) = inv_vol * dot8<Scalar,execution_space>( vz , grad_x );
+
+      vel_grad(ielem, K_F_XY) = inv_vol * dot8<Scalar,execution_space>( vx , grad_y );
+      vel_grad(ielem, K_F_YY) = inv_vol * dot8<Scalar,execution_space>( vy , grad_y );
+      vel_grad(ielem, K_F_ZY) = inv_vol * dot8<Scalar,execution_space>( vz , grad_y );
+
+      vel_grad(ielem, K_F_XZ) = inv_vol * dot8<Scalar,execution_space>( vx , grad_z );
+      vel_grad(ielem, K_F_YZ) = inv_vol * dot8<Scalar,execution_space>( vy , grad_z );
+      vel_grad(ielem, K_F_ZZ) = inv_vol * dot8<Scalar,execution_space>( vz , grad_z );
+    }
+
+  //--------------------------------------------------------------------------
+  // Functor operator(): gather nodal data, evaluate the gradient operator, and call v_grad.
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ielem )const
+  {
+    const int X = 0 ;
+    const int Y = 1 ;
+    const int Z = 2 ;
+    const Scalar dt_scale = -0.5 * *dt;
+
+    //  Declare local arrays for frequently accessed data to reduce
+    //  global memory reads and writes.
+
+    Scalar      x[8],      y[8],      z[8];
+    Scalar     vx[8],     vy[8],     vz[8];
+    Scalar grad_x[8], grad_y[8], grad_z[8];
+
+    // Read global coordinates and velocity once into local arrays
+    // (registers / L1 cache) and reuse them many times below.  The
+    // coordinates are pulled back a half time step so the gradient is
+    // evaluated on the mid-step configuration.
+
+    for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
+      const int n = elem_node_connectivity( ielem , i );
+
+      vx[i] = velocity( n , X , current_state );
+      vy[i] = velocity( n , Y , current_state );
+      vz[i] = velocity( n , Z , current_state );
+
+      x[i]  = model_coords( n , X ) +
+              displacement( n , X , current_state ) +
+              dt_scale * vx[i];
+
+      y[i]  = model_coords( n , Y ) +
+              displacement( n , Y , current_state ) +
+              dt_scale * vy[i];
+
+      z[i]  = model_coords( n , Z ) +
+              displacement( n , Z , current_state ) +
+              dt_scale * vz[i];
+    }
+
+    comp_grad<Scalar,execution_space>( x, y, z, grad_x, grad_y, grad_z);
+
+    //  Calculate hexahedral volume from x model_coords and gradient information
+
+    const Scalar inv_vol = 1.0 / dot8<Scalar,execution_space>( x , grad_x );
+
+    v_grad(ielem, vx, vy, vz, grad_x, grad_y, grad_z, inv_vol);
+  }
+
+  static void apply( const Fields & fields ,
+                     const int arg_current_state ,
+                     const int arg_previous_state )
+  {
+    grad op( fields, arg_current_state , arg_previous_state );
+    Kokkos::parallel_for( fields.num_elements , op );
+  }
+};
+
+//----------------------------------------------------------------------------
+
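+// One work-item per element: splits the velocity gradient into its symmetric
+// (stretching) and skew (spin) parts, incrementally updates the rotation and
+// stretch tensors, and stores the stretching tensor rotated back to the
+// unrotated configuration.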
+template<typename Scalar, class DeviceType >
+struct decomp_rotate
+{
+  typedef DeviceType execution_space ;
+
+  typedef Explicit::Fields< Scalar , execution_space >  Fields ;
+
+  static const int ElemNodeCount = Fields::ElemNodeCount ;
+
+  static const int K_F_XX = Fields::K_F_XX ;
+  static const int K_F_YY = Fields::K_F_YY ;
+  static const int K_F_ZZ = Fields::K_F_ZZ ;
+  static const int K_F_XY = Fields::K_F_XY ;
+  static const int K_F_YZ = Fields::K_F_YZ ;
+  static const int K_F_ZX = Fields::K_F_ZX ;
+  static const int K_F_YX = Fields::K_F_YX ;
+  static const int K_F_ZY = Fields::K_F_ZY ;
+  static const int K_F_XZ = Fields::K_F_XZ ;
+
+  static const int K_S_XX = Fields::K_S_XX ;
+  static const int K_S_YY = Fields::K_S_YY ;
+  static const int K_S_ZZ = Fields::K_S_ZZ ;
+  static const int K_S_XY = Fields::K_S_XY ;
+  static const int K_S_YZ = Fields::K_S_YZ ;
+  static const int K_S_ZX = Fields::K_S_ZX ;
+  static const int K_S_YX = Fields::K_S_YX ;
+  static const int K_S_ZY = Fields::K_S_ZY ;
+  static const int K_S_XZ = Fields::K_S_XZ ;
+
+  static const int K_V_XY = Fields::K_V_XY ;
+  static const int K_V_YZ = Fields::K_V_YZ ;
+  static const int K_V_ZX = Fields::K_V_ZX ;
+
+  // Global arrays used by this functor.
+
+  const typename Fields::elem_tensor_state_type     rotation ;
+  const typename Fields::elem_tensor_type           vel_grad ;
+  const typename Fields::elem_sym_tensor_type       stretch ;
+  const typename Fields::elem_sym_tensor_type       rot_stretch ;
+  const typename Fields::scalar_type                dt_value ;
+
+  const int  current_state;
+  const int  previous_state;
+
+  decomp_rotate( const Fields & mesh_fields ,
+                 const int arg_current_state,
+                 const int arg_previous_state)
+    : rotation(    mesh_fields.rotation )
+    , vel_grad(    mesh_fields.vel_grad )
+    , stretch(     mesh_fields.stretch )
+    , rot_stretch( mesh_fields.rot_stretch )
+    , dt_value(    mesh_fields.dt)
+    , current_state( arg_current_state)
+    , previous_state(arg_previous_state)
+    {}
+
+  static void apply( const Fields & mesh_fields ,
+                     const int arg_current_state ,
+                     const int arg_previous_state )
+  {
+    decomp_rotate op( mesh_fields , arg_current_state , arg_previous_state );
+    Kokkos::parallel_for( mesh_fields.num_elements , op );
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void additive_decomp(int ielem, Scalar * v_gr, Scalar * str_ten) const
+  {
+    //  In addition to calculating stretching_tensor,
+    //  use this as an opportunity to load global
+    //  variables into a local space
+
+    for ( int i = 0 ; i < 9 ; ++i ) {
+      v_gr[i] = vel_grad( ielem , i );
+    }
+
+    //
+    //  Symmetric part
+    //
+    str_ten[K_S_XX] = v_gr[K_F_XX];
+    str_ten[K_S_YY] = v_gr[K_F_YY];
+    str_ten[K_S_ZZ] = v_gr[K_F_ZZ];
+    str_ten[K_S_XY] = 0.5*(v_gr[K_F_XY] + v_gr[K_F_YX]);
+    str_ten[K_S_YZ] = 0.5*(v_gr[K_F_YZ] + v_gr[K_F_ZY]);
+    str_ten[K_S_ZX] = 0.5*(v_gr[K_F_ZX] + v_gr[K_F_XZ]);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void polar_decomp(int ielem, Scalar * v_gr, Scalar * str_ten, Scalar * str, Scalar * vort, Scalar * rot_old, Scalar * rot_new)const
+  {
+    const Scalar dt = *dt_value;
+    const Scalar dt_half = 0.5 * dt;
+
+    //  Skew Symmetric part
+    vort[K_V_XY] = 0.5*(v_gr[K_F_XY] - v_gr[K_F_YX]);
+    vort[K_V_YZ] = 0.5*(v_gr[K_F_YZ] - v_gr[K_F_ZY]);
+    vort[K_V_ZX] = 0.5*(v_gr[K_F_ZX] - v_gr[K_F_XZ]);
+
+    //   calculate the rates of rotation via gauss elimination.
+    for ( int i = 0 ; i < 6 ; ++i ) {
+      str[i] = stretch(ielem, i);
+    }
+
+    Scalar z1 = str_ten[K_S_XY] * str[K_S_ZX] -
+                str_ten[K_S_ZX] * str[K_S_XY] +
+                str_ten[K_S_YY] * str[K_S_YZ] -
+                str_ten[K_S_YZ] * str[K_S_YY] +
+                str_ten[K_S_YZ] * str[K_S_ZZ] -
+                str_ten[K_S_ZZ] * str[K_S_YZ];
+
+    Scalar z2 = str_ten[K_S_ZX] * str[K_S_XX] -
+                str_ten[K_S_XX] * str[K_S_ZX] +
+                str_ten[K_S_YZ] * str[K_S_XY] -
+                str_ten[K_S_XY] * str[K_S_YZ] +
+                str_ten[K_S_ZZ] * str[K_S_ZX] -
+                str_ten[K_S_ZX] * str[K_S_ZZ];
+
+    Scalar z3 = str_ten[K_S_XX] * str[K_S_XY] -
+                str_ten[K_S_XY] * str[K_S_XX] +
+                str_ten[K_S_XY] * str[K_S_YY] -
+                str_ten[K_S_YY] * str[K_S_XY] +
+                str_ten[K_S_ZX] * str[K_S_YZ] -
+                str_ten[K_S_YZ] * str[K_S_ZX];
+
+  //   forward elimination
+    const Scalar a1inv = 1.0 / (str[K_S_YY] + str[K_S_ZZ]);
+
+    const Scalar a4BYa1 = -1 * str[K_S_XY] * a1inv;
+
+    const Scalar a2inv = 1.0 / (str[K_S_ZZ] + str[K_S_XX] + str[K_S_XY] * a4BYa1);
+
+    const Scalar a5 =  -str[K_S_YZ] + str[K_S_ZX] * a4BYa1;
+
+    z2 -= z1 * a4BYa1;
+    Scalar a6BYa1 = -1 * str[K_S_ZX] * a1inv;
+    const Scalar a5BYa2 = a5 * a2inv;
+    z3 -= z1 * a6BYa1 - z2 * a5BYa2;
+
+  //   backward substitution -
+    z3 /= (str[K_S_XX] + str[K_S_YY] + str[K_S_ZX] * a6BYa1 + a5 * a5BYa2);
+    z2 = (z2 - a5 * z3) * a2inv;
+    z1 = (z1*a1inv - a6BYa1 * z3 -a4BYa1 * z2);
+
+  //   calculate rotation rates - recall that the spin rate is an antisymmetric
+  //   (skew-symmetric) tensor, so compute the spin rate vector as the dual of
+  //   the spin rate tensor, i.e.  w_i = e_ijk * spin_rate_jk
+    z1 += vort[K_V_YZ];
+    z2 += vort[K_V_ZX];
+    z3 += vort[K_V_XY];
+
+  //   update rotation tensor:
+  //  1) premultiply old rotation tensor to get right-hand side.
+
+    for ( int i = 0 ; i < 9 ; ++i ) {
+      rot_old[i] = rotation(ielem, i, previous_state);
+    }
+
+    Scalar r_XX = rot_old[K_F_XX] + dt_half*( z3 * rot_old[K_F_YX] - z2 * rot_old[K_F_ZX] );
+    Scalar r_YX = rot_old[K_F_YX] + dt_half*( z1 * rot_old[K_F_ZX] - z3 * rot_old[K_F_XX] );
+    Scalar r_ZX = rot_old[K_F_ZX] + dt_half*( z2 * rot_old[K_F_XX] - z1 * rot_old[K_F_YX] );
+    Scalar r_XY = rot_old[K_F_XY] + dt_half*( z3 * rot_old[K_F_YY] - z2 * rot_old[K_F_ZY] );
+    Scalar r_YY = rot_old[K_F_YY] + dt_half*( z1 * rot_old[K_F_ZY] - z3 * rot_old[K_F_XY] );
+    Scalar r_ZY = rot_old[K_F_ZY] + dt_half*( z2 * rot_old[K_F_XY] - z1 * rot_old[K_F_YY] );
+    Scalar r_XZ = rot_old[K_F_XZ] + dt_half*( z3 * rot_old[K_F_YZ] - z2 * rot_old[K_F_ZZ] );
+    Scalar r_YZ = rot_old[K_F_YZ] + dt_half*( z1 * rot_old[K_F_ZZ] - z3 * rot_old[K_F_XZ] );
+    Scalar r_ZZ = rot_old[K_F_ZZ] + dt_half*( z2 * rot_old[K_F_XZ] - z1 * rot_old[K_F_YZ] );
+
+
+  //  2) solve for new rotation tensor via gauss elimination.
+  //   forward elimination -
+    Scalar a12 = - dt_half * z3;
+    Scalar a13 =   dt_half * z2;
+    Scalar b32 = - dt_half * z1;
+    Scalar a22inv = 1.0 / (1.0 + a12 * a12);
+
+    Scalar a13a12 = a13*a12;
+    Scalar a23 = b32 + a13a12;
+    r_YX += r_XX * a12;
+    r_YY += r_XY * a12;
+    r_YZ += r_XZ * a12;
+
+
+    b32 = (b32 - a13a12) * a22inv;
+    r_ZX += r_XX * a13 + r_YX * b32;
+    r_ZY += r_XY * a13 + r_YY * b32;
+    r_ZZ += r_XZ * a13 + r_YZ * b32;
+
+
+  //   backward substitution -
+    const Scalar a33inv = 1.0 / (1.0 + a13 * a13 + a23 * b32);
+
+    rot_new[K_F_ZX] = r_ZX * a33inv;
+    rot_new[K_F_ZY] = r_ZY * a33inv;
+    rot_new[K_F_ZZ] = r_ZZ * a33inv;
+    rot_new[K_F_YX] = ( r_YX - rot_new[K_F_ZX] * a23 ) * a22inv;
+    rot_new[K_F_YY] = ( r_YY - rot_new[K_F_ZY] * a23 ) * a22inv;
+    rot_new[K_F_YZ] = ( r_YZ - rot_new[K_F_ZZ] * a23 ) * a22inv;
+    rot_new[K_F_XX] = r_XX - rot_new[K_F_ZX] * a13 - rot_new[K_F_YX] * a12;
+    rot_new[K_F_XY] = r_XY - rot_new[K_F_ZY] * a13 - rot_new[K_F_YY] * a12;
+    rot_new[K_F_XZ] = r_XZ - rot_new[K_F_ZZ] * a13 - rot_new[K_F_YZ] * a12;
+
+    for ( int i = 0 ; i < 9 ; ++i ) {
+      rotation(ielem, i, current_state) = rot_new[i] ;
+    }
+
+  //   update stretch tensor in the new configuration -
+    const Scalar a1 = str_ten[K_S_XY] + vort[K_V_XY];
+    const Scalar a2 = str_ten[K_S_YZ] + vort[K_V_YZ];
+    const Scalar a3 = str_ten[K_S_ZX] + vort[K_V_ZX];
+    const Scalar b1 = str_ten[K_S_ZX] - vort[K_V_ZX];
+    const Scalar b2 = str_ten[K_S_XY] - vort[K_V_XY];
+    const Scalar b3 = str_ten[K_S_YZ] - vort[K_V_YZ];
+
+    const Scalar s_XX = str[K_S_XX];
+    const Scalar s_YY = str[K_S_YY];
+    const Scalar s_ZZ = str[K_S_ZZ];
+    const Scalar s_XY = str[K_S_XY];
+    const Scalar s_YZ = str[K_S_YZ];
+    const Scalar s_ZX = str[K_S_ZX];
+
+    str[K_S_XX] += dt * (str_ten[K_S_XX] * s_XX + ( a1 + z3 ) * s_XY + ( b1 - z2 ) * s_ZX);
+    str[K_S_YY] += dt * (str_ten[K_S_YY] * s_YY + ( a2 + z1 ) * s_YZ + ( b2 - z3 ) * s_XY);
+    str[K_S_ZZ] += dt * (str_ten[K_S_ZZ] * s_ZZ + ( a3 + z2 ) * s_ZX + ( b3 - z1 ) * s_YZ);
+    str[K_S_XY] += dt * (str_ten[K_S_XX] * s_XY + ( a1 )      * s_YY + ( b1      ) * s_YZ - z3 * s_XX + z1 * s_ZX);
+    str[K_S_YZ] += dt * (str_ten[K_S_YY] * s_YZ + ( a2 )      * s_ZZ + ( b2      ) * s_ZX - z1 * s_YY + z2 * s_XY);
+    str[K_S_ZX] += dt * (str_ten[K_S_ZZ] * s_ZX + ( a3 )      * s_XX + ( b3      ) * s_XY - z2 * s_ZZ + z3 * s_YZ);
+
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void rotate_tensor(int ielem, Scalar * str_ten, Scalar * str, Scalar * rot_new)const {
+
+    Scalar t[9];
+    Scalar rot_str[6]; // Rotated stretch
+
+    t[0] = str_ten[K_S_XX]*rot_new[K_F_XX] +
+           str_ten[K_S_XY]*rot_new[K_F_YX] +
+           str_ten[K_S_XZ]*rot_new[K_F_ZX];
+
+    t[1] = str_ten[K_S_YX]*rot_new[K_F_XX] +
+           str_ten[K_S_YY]*rot_new[K_F_YX] +
+           str_ten[K_S_YZ]*rot_new[K_F_ZX];
+
+    t[2] = str_ten[K_S_ZX]*rot_new[K_F_XX] +
+           str_ten[K_S_ZY]*rot_new[K_F_YX] +
+           str_ten[K_S_ZZ]*rot_new[K_F_ZX];
+
+    t[3] = str_ten[K_S_XX]*rot_new[K_F_XY] +
+           str_ten[K_S_XY]*rot_new[K_F_YY] +
+           str_ten[K_S_XZ]*rot_new[K_F_ZY];
+
+    t[4] = str_ten[K_S_YX]*rot_new[K_F_XY] +
+           str_ten[K_S_YY]*rot_new[K_F_YY] +
+           str_ten[K_S_YZ]*rot_new[K_F_ZY];
+
+    t[5] = str_ten[K_S_ZX]*rot_new[K_F_XY] +
+           str_ten[K_S_ZY]*rot_new[K_F_YY] +
+           str_ten[K_S_ZZ]*rot_new[K_F_ZY];
+
+    t[6] = str_ten[K_S_XX]*rot_new[K_F_XZ] +
+           str_ten[K_S_XY]*rot_new[K_F_YZ] +
+           str_ten[K_S_XZ]*rot_new[K_F_ZZ];
+
+    t[7] = str_ten[K_S_YX]*rot_new[K_F_XZ] +
+           str_ten[K_S_YY]*rot_new[K_F_YZ] +
+           str_ten[K_S_YZ]*rot_new[K_F_ZZ];
+
+    t[8] = str_ten[K_S_ZX]*rot_new[K_F_XZ] +
+           str_ten[K_S_ZY]*rot_new[K_F_YZ] +
+           str_ten[K_S_ZZ]*rot_new[K_F_ZZ];
+
+
+    rot_str[ K_S_XX ] = rot_new[K_F_XX] * t[0] +
+                        rot_new[K_F_YX] * t[1] +
+                        rot_new[K_F_ZX] * t[2];
+    rot_str[ K_S_YY ] = rot_new[K_F_XY] * t[3] +
+                        rot_new[K_F_YY] * t[4] +
+                        rot_new[K_F_ZY] * t[5];
+    rot_str[ K_S_ZZ ] = rot_new[K_F_XZ] * t[6] +
+                        rot_new[K_F_YZ] * t[7] +
+                        rot_new[K_F_ZZ] * t[8];
+
+    rot_str[ K_S_XY ] = rot_new[K_F_XX] * t[3] +
+                        rot_new[K_F_YX] * t[4] +
+                        rot_new[K_F_ZX] * t[5];
+    rot_str[ K_S_YZ ] = rot_new[K_F_XY] * t[6] +
+                        rot_new[K_F_YY] * t[7] +
+                        rot_new[K_F_ZY] * t[8];
+    rot_str[ K_S_ZX ] = rot_new[K_F_XZ] * t[0] +
+                        rot_new[K_F_YZ] * t[1] +
+                        rot_new[K_F_ZZ] * t[2];
+
+    for ( int i = 0 ; i < 6 ; ++i ) {
+      rot_stretch(ielem, i) = rot_str[i] ;
+    }
+
+    for ( int i = 0 ; i < 6 ; ++i ) {
+      stretch(ielem, i) = str[i] ;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ielem )const {
+
+    //   Local scratch space to avoid multiple
+    //   accesses to global memory.
+    Scalar str_ten[6]; // Stretching tensor
+    Scalar str[6];     // Stretch
+    Scalar rot_old[9]; // Rotation old
+    Scalar rot_new[9]; // Rotation new
+    Scalar vort[3];    // Vorticity
+    Scalar v_gr[9];    // Velocity gradient
+
+    additive_decomp(ielem, v_gr, str_ten);
+
+    polar_decomp(ielem, v_gr, str_ten, str, vort, rot_old, rot_new);
+
+    rotate_tensor(ielem, str_ten, str, rot_new);
+  }
+};
+
+//----------------------------------------------------------------------------
+
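+// One work-item per element combined with a min-reduction: updates the element
+// stress, forms the per-element nodal force contributions and internal energy,
+// and reduces the minimum stable time step over all elements; final() stores
+// the result as the next global dt and saves the previous dt.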
+template<typename Scalar, class DeviceType >
+struct internal_force
+{
+  typedef DeviceType execution_space ;
+
+  typedef Explicit::Fields< Scalar , execution_space >  Fields ;
+
+  static const int ElemNodeCount = Fields::ElemNodeCount ;
+
+  static const int K_F_XX = Fields::K_F_XX ;
+  static const int K_F_YY = Fields::K_F_YY ;
+  static const int K_F_ZZ = Fields::K_F_ZZ ;
+  static const int K_F_XY = Fields::K_F_XY ;
+  static const int K_F_YZ = Fields::K_F_YZ ;
+  static const int K_F_ZX = Fields::K_F_ZX ;
+  static const int K_F_YX = Fields::K_F_YX ;
+  static const int K_F_ZY = Fields::K_F_ZY ;
+  static const int K_F_XZ = Fields::K_F_XZ ;
+
+  static const int K_S_XX = Fields::K_S_XX ;
+  static const int K_S_YY = Fields::K_S_YY ;
+  static const int K_S_ZZ = Fields::K_S_ZZ ;
+  static const int K_S_XY = Fields::K_S_XY ;
+  static const int K_S_YZ = Fields::K_S_YZ ;
+  static const int K_S_ZX = Fields::K_S_ZX ;
+  static const int K_S_YX = Fields::K_S_YX ;
+  static const int K_S_ZY = Fields::K_S_ZY ;
+  static const int K_S_XZ = Fields::K_S_XZ ;
+
+  //--------------------------------------------------------------------------
+  // Reduction:
+
+  typedef Scalar value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type &update) {
+    update = 1.0e32;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update,
+                    const volatile value_type & source )
+  {
+    update = update < source ? update : source;
+  }
+
+  // Final serial processing of reduction value:
+  KOKKOS_INLINE_FUNCTION
+  void final( value_type & result ) const
+  {
+    *prev_dt = *dt ;
+    *dt = result ;
+  }
+
+  //--------------------------------------------------------------------------
+
+  // Global arrays used by this functor.
+
+  const typename Fields::elem_node_ids_type      elem_node_connectivity ;
+  const typename Fields::node_coords_type        model_coords ;
+  const typename Fields::scalar_type             dt ;
+  const typename Fields::scalar_type             prev_dt ;
+  const typename Fields::geom_state_array_type   displacement ;
+  const typename Fields::geom_state_array_type   velocity ;
+  const typename Fields::array_type              elem_mass ;
+  const typename Fields::array_type              internal_energy ;
+  const typename Fields::elem_sym_tensor_type    stress_new ;
+  const typename Fields::elem_node_geom_type     element_force ;
+  const typename Fields::elem_tensor_state_type  rotation ;
+  const typename Fields::elem_sym_tensor_type    rot_stretch ;
+
+  const Scalar     two_mu;
+  const Scalar     bulk_modulus;
+  const Scalar     lin_bulk_visc;
+  const Scalar     quad_bulk_visc;
+  const Scalar     user_dt;
+  const int        current_state;
+
+  internal_force( const Fields & mesh_fields,
+                  const Scalar arg_user_dt,
+                  const int arg_current_state )
+    : elem_node_connectivity( mesh_fields.elem_node_connectivity )
+    , model_coords(           mesh_fields.model_coords )
+    , dt(                     mesh_fields.dt )
+    , prev_dt(                mesh_fields.prev_dt )
+    , displacement(           mesh_fields.displacement )
+    , velocity(               mesh_fields.velocity )
+    , elem_mass(              mesh_fields.elem_mass )
+    , internal_energy(        mesh_fields.internal_energy )
+    , stress_new(             mesh_fields.stress_new )
+    , element_force(          mesh_fields.element_force )
+    , rotation(               mesh_fields.rotation )
+    , rot_stretch(            mesh_fields.rot_stretch )
+    , two_mu(                 mesh_fields.two_mu )
+    , bulk_modulus(           mesh_fields.bulk_modulus )
+    , lin_bulk_visc(          mesh_fields.lin_bulk_visc )
+    , quad_bulk_visc(         mesh_fields.quad_bulk_visc )
+    , user_dt(       arg_user_dt )
+    , current_state( arg_current_state )
+  {}
+
+  static void apply( const Fields & mesh_fields ,
+                     const Scalar arg_user_dt,
+                     const int arg_current_state )
+  {
+    internal_force  op_force( mesh_fields , arg_user_dt , arg_current_state );
+
+    Kokkos::parallel_reduce( mesh_fields.num_elements, op_force );
+  }
+
+  //--------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  void rotate_tensor_backward(int ielem ,
+    const Scalar * const s_n ,
+    Scalar * const rot_stress )const
+  {
+    const int rot_state = current_state ; // 1 ;
+
+    //   t : temporary variables
+    //   s_n : stress_new in local memory space
+    //   r_n : rotation_new in local memory space
+    Scalar t[9], r_n[9];
+
+    r_n[0] = rotation(ielem, 0, rot_state );
+    r_n[1] = rotation(ielem, 1, rot_state );
+    r_n[2] = rotation(ielem, 2, rot_state );
+    r_n[3] = rotation(ielem, 3, rot_state );
+    r_n[4] = rotation(ielem, 4, rot_state );
+    r_n[5] = rotation(ielem, 5, rot_state );
+    r_n[6] = rotation(ielem, 6, rot_state );
+    r_n[7] = rotation(ielem, 7, rot_state );
+    r_n[8] = rotation(ielem, 8, rot_state );
+
+    t[0] = s_n[K_S_XX]*r_n[K_F_XX]+ s_n[K_S_XY]*r_n[K_F_XY]+ s_n[K_S_XZ]*r_n[K_F_XZ];
+    t[1] = s_n[K_S_YX]*r_n[K_F_XX]+ s_n[K_S_YY]*r_n[K_F_XY]+ s_n[K_S_YZ]*r_n[K_F_XZ];
+    t[2] = s_n[K_S_ZX]*r_n[K_F_XX]+ s_n[K_S_ZY]*r_n[K_F_XY]+ s_n[K_S_ZZ]*r_n[K_F_XZ];
+    t[3] = s_n[K_S_XX]*r_n[K_F_YX]+ s_n[K_S_XY]*r_n[K_F_YY]+ s_n[K_S_XZ]*r_n[K_F_YZ];
+    t[4] = s_n[K_S_YX]*r_n[K_F_YX]+ s_n[K_S_YY]*r_n[K_F_YY]+ s_n[K_S_YZ]*r_n[K_F_YZ];
+    t[5] = s_n[K_S_ZX]*r_n[K_F_YX]+ s_n[K_S_ZY]*r_n[K_F_YY]+ s_n[K_S_ZZ]*r_n[K_F_YZ];
+    t[6] = s_n[K_S_XX]*r_n[K_F_ZX]+ s_n[K_S_XY]*r_n[K_F_ZY]+ s_n[K_S_XZ]*r_n[K_F_ZZ];
+    t[7] = s_n[K_S_YX]*r_n[K_F_ZX]+ s_n[K_S_YY]*r_n[K_F_ZY]+ s_n[K_S_YZ]*r_n[K_F_ZZ];
+    t[8] = s_n[K_S_ZX]*r_n[K_F_ZX]+ s_n[K_S_ZY]*r_n[K_F_ZY]+ s_n[K_S_ZZ]*r_n[K_F_ZZ];
+
+    rot_stress[ K_S_XX ] = r_n[K_F_XX]*t[0] + r_n[K_F_XY]*t[1] + r_n[K_F_XZ]*t[2];
+    rot_stress[ K_S_YY ] = r_n[K_F_YX]*t[3] + r_n[K_F_YY]*t[4] + r_n[K_F_YZ]*t[5];
+    rot_stress[ K_S_ZZ ] = r_n[K_F_ZX]*t[6] + r_n[K_F_ZY]*t[7] + r_n[K_F_ZZ]*t[8];
+
+    rot_stress[ K_S_XY ] = r_n[K_F_XX]*t[3] + r_n[K_F_XY]*t[4] + r_n[K_F_XZ]*t[5];
+    rot_stress[ K_S_YZ ] = r_n[K_F_YX]*t[6] + r_n[K_F_YY]*t[7] + r_n[K_F_YZ]*t[8];
+    rot_stress[ K_S_ZX ] = r_n[K_F_ZX]*t[0] + r_n[K_F_ZY]*t[1] + r_n[K_F_ZZ]*t[2];
+  }
+
+  //--------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  void comp_force(int ielem,
+     const Scalar * const vx ,
+     const Scalar * const vy ,
+     const Scalar * const vz ,
+     const Scalar * const grad_x ,
+     const Scalar * const grad_y ,
+     const Scalar * const grad_z ,
+     Scalar * total_stress12th ) const
+  {
+    Scalar internal_energy_inc = 0 ;
+
+    for(int inode = 0; inode < 8; ++inode) {
+
+      const Scalar fx =
+        total_stress12th[K_S_XX] * grad_x[inode] +
+        total_stress12th[K_S_XY] * grad_y[inode] +
+        total_stress12th[K_S_XZ] * grad_z[inode] ;
+
+      element_force(ielem, 0, inode) = fx ;
+
+      const Scalar fy =
+        total_stress12th[K_S_YX] * grad_x[inode] +
+        total_stress12th[K_S_YY] * grad_y[inode] +
+        total_stress12th[K_S_YZ] * grad_z[inode] ;
+
+      element_force(ielem, 1, inode) = fy ;
+
+      const Scalar fz =
+        total_stress12th[K_S_ZX] * grad_x[inode] +
+        total_stress12th[K_S_ZY] * grad_y[inode] +
+        total_stress12th[K_S_ZZ] * grad_z[inode] ;
+
+      element_force(ielem, 2, inode) = fz ;
+
+      internal_energy_inc +=
+        fx * vx[inode] +
+        fy * vy[inode] +
+        fz * vz[inode] ;
+    }
+
+    internal_energy(ielem) = internal_energy_inc ;
+  }
+
+  //----------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  void get_stress(int ielem , Scalar * const s_n ) const
+    {
+      const int kxx = 0;
+      const int kyy = 1;
+      const int kzz = 2;
+      const int kxy = 3;
+      const int kyz = 4;
+      const int kzx = 5;
+
+      const Scalar e = (rot_stretch(ielem,kxx)+rot_stretch(ielem,kyy)+rot_stretch(ielem,kzz))/3.0;
+
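+      // e is the mean (volumetric) part of the rotated stretching; the update
+      // below applies two_mu to the deviatoric part and the bulk modulus to
+      // the volumetric part (a rate form of linear elasticity).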
+      s_n[kxx] = stress_new(ielem,kxx) += *dt * (two_mu * (rot_stretch(ielem,kxx)-e)+3*bulk_modulus*e);
+      s_n[kyy] = stress_new(ielem,kyy) += *dt * (two_mu * (rot_stretch(ielem,kyy)-e)+3*bulk_modulus*e);
+      s_n[kzz] = stress_new(ielem,kzz) += *dt * (two_mu * (rot_stretch(ielem,kzz)-e)+3*bulk_modulus*e);
+
+      s_n[kxy] = stress_new(ielem,kxy) += *dt * two_mu * rot_stretch(ielem,kxy);
+      s_n[kyz] = stress_new(ielem,kyz) += *dt * two_mu * rot_stretch(ielem,kyz);
+      s_n[kzx] = stress_new(ielem,kzx) += *dt * two_mu * rot_stretch(ielem,kzx);
+    }
+
+  //----------------------------------------------------------------------------
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ielem, value_type & update )const
+  {
+    const Scalar ONE12TH = 1.0 / 12.0 ;
+
+    Scalar x[8], y[8], z[8] ;
+    Scalar vx[8], vy[8], vz[8];
+    Scalar grad_x[8], grad_y[8], grad_z[8];
+
+    // Position and velocity:
+
+    for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
+      const int n = elem_node_connectivity(ielem,i);
+
+      x[i] = model_coords(n, 0) + displacement(n, 0, current_state) ;
+      y[i] = model_coords(n, 1) + displacement(n, 1, current_state) ;
+      z[i] = model_coords(n, 2) + displacement(n, 2, current_state) ;
+
+      vx[i] = velocity(n, 0, current_state);
+      vy[i] = velocity(n, 1, current_state);
+      vz[i] = velocity(n, 2, current_state);
+    }
+
+    // Gradient:
+
+    comp_grad<Scalar,execution_space>( x , y , z , grad_x , grad_y , grad_z );
+
+
+    const Scalar mid_vol = dot8<Scalar,execution_space>( x , grad_x );
+
+    const Scalar shr = two_mu ;
+    const Scalar dil = bulk_modulus + ((2.0*shr)/3.0);
+
+    const Scalar aspect = 6.0 * mid_vol /
+                          ( dot8<Scalar,execution_space>( grad_x , grad_x ) +
+                            dot8<Scalar,execution_space>( grad_y , grad_y ) +
+                            dot8<Scalar,execution_space>( grad_z , grad_z ) );
+
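+    // Stable time step estimate: dtrial is a CFL-like bound built from the
+    // element mass, a geometric aspect measure, and the dilatational modulus;
+    // traced (the volumetric strain rate) drives the linear/quadratic bulk
+    // viscosity term bulkq below.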
+    const Scalar dtrial = std::sqrt(elem_mass(ielem) * aspect / dil);
+    const Scalar traced = (rot_stretch(ielem, 0) + rot_stretch(ielem, 1) + rot_stretch(ielem, 2));
+
+    const Scalar eps = traced < 0 ? (lin_bulk_visc - quad_bulk_visc * traced * dtrial) : lin_bulk_visc ;
+
+    const Scalar bulkq = eps * dil * dtrial * traced;
+
+    Scalar cur_time_step = dtrial * ( std::sqrt( 1.0 + eps * eps) - eps);
+
+    // force fixed time step if input
+
+    cur_time_step = user_dt > 0 ? user_dt : cur_time_step;
+
+    update = update < cur_time_step ? update : cur_time_step;
+
+
+    Scalar s_n[ 6 ];
+
+    get_stress( ielem, s_n );
+
+    Scalar total_stress12th[6];
+
+    // Get rotated stress:
+
+    rotate_tensor_backward(ielem, s_n , total_stress12th );
+
+    total_stress12th[0] = ONE12TH*( total_stress12th[ 0 ] + bulkq );
+    total_stress12th[1] = ONE12TH*( total_stress12th[ 1 ] + bulkq );
+    total_stress12th[2] = ONE12TH*( total_stress12th[ 2 ] + bulkq );
+    total_stress12th[3] = ONE12TH*( total_stress12th[ 3 ] );
+    total_stress12th[4] = ONE12TH*( total_stress12th[ 4 ] );
+    total_stress12th[5] = ONE12TH*( total_stress12th[ 5 ] );
+
+    comp_force(ielem, vx, vy, vz,
+                      grad_x, grad_y, grad_z, total_stress12th);
+  }
+};
+
+//----------------------------------------------------------------------------
+
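+// One work-item per owned node: gather-sums the attached elements' force
+// contributions, computes the acceleration (or enforces the fixed boundary
+// condition on the x == x_bc plane), and advances velocity and displacement
+// with central differences.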
+template<typename Scalar, class DeviceType >
+struct nodal_step
+{
+  typedef DeviceType     execution_space ;
+  typedef typename execution_space::size_type  size_type;
+
+  typedef Explicit::Fields< Scalar , execution_space >  Fields ;
+
+  const typename Fields::scalar_type            dt ;
+  const typename Fields::scalar_type            prev_dt ;
+  const typename Fields::node_elem_ids_type     node_elem_connectivity ;
+  const typename Fields::node_coords_type       model_coords ;
+  const typename Fields::array_type             nodal_mass ;
+  const typename Fields::geom_state_array_type  displacement ;
+  const typename Fields::geom_state_array_type  velocity ;
+  const typename Fields::geom_array_type        acceleration ;
+  const typename Fields::geom_array_type        internal_force ;
+  const typename Fields::elem_node_geom_type    element_force ;
+
+  const Scalar   x_bc;
+  const int      current_state;
+  const int      next_state;
+
+
+  nodal_step( const Fields  & mesh_fields ,
+              const Scalar    arg_x_bc,
+              const int       arg_current_state,
+              const int       arg_next_state)
+   : dt(       mesh_fields.dt )
+   , prev_dt(  mesh_fields.prev_dt )
+   , node_elem_connectivity( mesh_fields.node_elem_connectivity )
+   , model_coords(   mesh_fields.model_coords )
+   , nodal_mass(     mesh_fields.nodal_mass )
+   , displacement(   mesh_fields.displacement )
+   , velocity(       mesh_fields.velocity )
+   , acceleration(   mesh_fields.acceleration )
+   , internal_force( mesh_fields.internal_force )
+   , element_force(  mesh_fields.element_force )
+   , x_bc(          arg_x_bc )
+   , current_state( arg_current_state )
+   , next_state(    arg_next_state )
+   {
+        //std::cout << "finish_step dt: " << dt << std::endl;
+        //std::cout << "finish_step prev_dt: " << prev_dt << std::endl;
+   }
+
+  static void apply( const Fields  & mesh_fields ,
+                     const Scalar    arg_x_bc ,
+                     const int       arg_current_state ,
+                     const int       arg_next_state )
+  {
+    nodal_step op( mesh_fields, arg_x_bc, arg_current_state, arg_next_state );
+
+    // Only update the owned nodes:
+
+    Kokkos::parallel_for( mesh_fields.num_nodes_owned , op );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int inode) const
+    {
+      // Row extents of this node in the CSR-like node->element map.
+      const int begin = node_elem_connectivity.row_map[inode];
+      const int end   = node_elem_connectivity.row_map[inode+1];
+
+      double local_force[] = {0.0, 0.0, 0.0};
+
+      // Gather-sum internal force from
+      // each element that a node is attached to.
+
+      for ( int i = begin; i < end ; ++i ){
+
+        //  row_map is cumulative, so row_map[inode] is the index at
+        //  which this node's element entries begin
+        const int nelem = node_elem_connectivity.entries( i, 0);
+
+        //  entries( i, 1 ) gives the local node index within that
+        //  element which corresponds to inode
+        const int elem_node_index = node_elem_connectivity.entries( i, 1);
+
+        local_force[0] += element_force(nelem, 0, elem_node_index);
+        local_force[1] += element_force(nelem, 1, elem_node_index);
+        local_force[2] += element_force(nelem, 2, elem_node_index);
+      }
+
+      internal_force(inode, 0) = local_force[0];
+      internal_force(inode, 1) = local_force[1];
+      internal_force(inode, 2) = local_force[2];
+
+      // Acceleration:
+
+      Scalar v_new[3];
+      Scalar a_current[3];
+
+      const Scalar tol = 1.0e-7;
+
+      // If not on the boundary then: a = F / m
+      if ( tol < fabs(model_coords(inode,0)-x_bc) ) {
+
+        const Scalar m = nodal_mass( inode );
+
+        acceleration(inode,0) = a_current[0] = -local_force[0] / m ;
+        acceleration(inode,1) = a_current[1] = -local_force[1] / m ;
+        acceleration(inode,2) = a_current[2] = -local_force[2] / m ;
+      }
+      else { //enforce fixed BC
+        acceleration(inode,0) = a_current[0] = 0;
+        acceleration(inode,1) = a_current[1] = 0;
+        acceleration(inode,2) = a_current[2] = 0;
+      }
+
+      // Central difference time integration:
+
+      const Scalar dt_disp = *dt ;
+      const Scalar dt_vel = ( *dt + *prev_dt ) / 2.0 ;
+
+      velocity(inode,0,next_state) = v_new[0] =
+        velocity(inode,0,current_state) + dt_vel * a_current[0];
+
+      velocity(inode,1,next_state) = v_new[1] =
+        velocity(inode,1,current_state) + dt_vel * a_current[1];
+
+      velocity(inode,2,next_state) = v_new[2] =
+        velocity(inode,2,current_state) + dt_vel * a_current[2];
+
+      displacement(inode,0,next_state) =
+        displacement(inode,0,current_state) + dt_disp * v_new[0];
+
+      displacement(inode,1,next_state) =
+        displacement(inode,1,current_state) + dt_disp * v_new[1];
+
+      displacement(inode,2,next_state) =
+        displacement(inode,2,current_state) + dt_disp * v_new[2];
+    }
+};
+
+//----------------------------------------------------------------------------
+
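+// Pack/unpack functors: copy the next-state displacement and velocity of a
+// contiguous node range (value_count = 6 scalars per node) into and out of a
+// flat buffer used for inter-process communication.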
+template< typename Scalar , class DeviceType >
+struct pack_state
+{
+  typedef DeviceType     execution_space ;
+  typedef typename execution_space::size_type  size_type ;
+
+  typedef Explicit::Fields< Scalar , execution_space >  Fields ;
+
+  typedef typename Fields::geom_state_array_type::value_type  value_type ;
+  typedef Kokkos::View< value_type* , execution_space >     buffer_type ;
+
+  static const unsigned value_count = 6 ;
+
+  const typename Fields::geom_state_array_type  displacement ;
+  const typename Fields::geom_state_array_type  velocity ;
+  const buffer_type  output ;
+  const size_type    inode_base ;
+  const size_type    state_next ;
+
+  pack_state( const buffer_type & arg_output ,
+              const Fields      & mesh_fields ,
+              const size_type     arg_begin ,
+              const size_type     arg_state )
+   : displacement( mesh_fields.displacement )
+   , velocity(     mesh_fields.velocity )
+   , output(       arg_output )
+   , inode_base(   arg_begin )
+   , state_next(   arg_state )
+   {}
+
+  static void apply( const buffer_type & arg_output ,
+                     const size_type     arg_begin ,
+                     const size_type     arg_count ,
+                     const Fields      & mesh_fields ,
+                     const size_type     arg_state )
+  {
+    pack_state op( arg_output , mesh_fields , arg_begin , arg_state );
+
+    Kokkos::parallel_for( arg_count , op );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i ) const
+  {
+    const size_type inode = inode_base + i ;
+
+    size_type j = i * value_count ;
+
+    output[j++] = displacement( inode , 0 , state_next );
+    output[j++] = displacement( inode , 1 , state_next );
+    output[j++] = displacement( inode , 2 , state_next );
+    output[j++] = velocity( inode , 0 , state_next );
+    output[j++] = velocity( inode , 1 , state_next );
+    output[j++] = velocity( inode , 2 , state_next );
+  }
+};
+
+template< typename Scalar , class DeviceType >
+struct unpack_state
+{
+  typedef DeviceType     execution_space ;
+  typedef typename execution_space::size_type  size_type ;
+
+  typedef Explicit::Fields< Scalar , execution_space >  Fields ;
+
+  typedef typename Fields::geom_state_array_type::value_type  value_type ;
+  typedef Kokkos::View< value_type* , execution_space >     buffer_type ;
+
+  static const unsigned value_count = 6 ;
+
+  const typename Fields::geom_state_array_type  displacement ;
+  const typename Fields::geom_state_array_type  velocity ;
+  const buffer_type  input ;
+  const size_type    inode_base ;
+  const size_type    state_next ;
+
+  unpack_state( const buffer_type & arg_input ,
+                const Fields      & mesh_fields ,
+                const size_type     arg_begin ,
+                const size_type     arg_state )
+   : displacement( mesh_fields.displacement )
+   , velocity(     mesh_fields.velocity )
+   , input(        arg_input )
+   , inode_base(   arg_begin )
+   , state_next(   arg_state )
+   {}
+
+  static void apply( const Fields      & mesh_fields ,
+                     const size_type     arg_state ,
+                     const buffer_type & arg_input ,
+                     const size_type     arg_begin ,
+                     const size_type     arg_count )
+  {
+    unpack_state op( arg_input , mesh_fields , arg_begin , arg_state );
+
+    Kokkos::parallel_for( arg_count , op );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i ) const
+  {
+    const size_type inode = inode_base + i ;
+
+    size_type j = i * value_count ;
+
+    displacement( inode , 0 , state_next ) = input[j++] ;
+    displacement( inode , 1 , state_next ) = input[j++] ;
+    displacement( inode , 2 , state_next ) = input[j++] ;
+    velocity( inode , 0 , state_next ) = input[j++] ;
+    velocity( inode , 1 , state_next ) = input[j++] ;
+    velocity( inode , 2 , state_next ) = input[j++] ;
+  }
+};
+
+} /* namespace Explicit */
+
+#endif /* #ifndef KOKKOS_EXPLICITFUNCTORS_HPP */
+
+
diff --git a/packages/kokkos/example/multi_fem/FEMesh.hpp b/packages/kokkos/example/multi_fem/FEMesh.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e836cac43fb301f6afd45f985bf76e89180fb3f1
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/FEMesh.hpp
@@ -0,0 +1,86 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_FEMESH_HPP
+#define KOKKOS_FEMESH_HPP
+
+#include <utility>
+#include <limits>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_StaticCrsGraph.hpp>
+
+#include <ParallelComm.hpp>
+#include <ParallelDataMap.hpp>
+
+namespace HybridFEM {
+
+//----------------------------------------------------------------------------
+/** \brief  Finite element mesh fixture for hybrid parallel performance tests.
+ */
+template< typename CoordScalarType , unsigned ElemNodeCount , class Device >
+struct FEMesh {
+
+  typedef typename Device::size_type size_type ;
+
+  static const size_type element_node_count = ElemNodeCount ;
+
+  typedef Kokkos::View< CoordScalarType*[3] , Device >       node_coords_type ;
+  typedef Kokkos::View< size_type*[ElemNodeCount], Device >  elem_node_ids_type ;
+  typedef Kokkos::StaticCrsGraph< size_type[2] ,  Device >   node_elem_ids_type ;
+
+  node_coords_type         node_coords ;
+  elem_node_ids_type       elem_node_ids ;
+  node_elem_ids_type       node_elem_ids ;
+  Kokkos::ParallelDataMap  parallel_data_map ;
+};
+
+//----------------------------------------------------------------------------
+
+} /* namespace HybridFEM */
+
+#endif /* #ifndef KOKKOS_FEMESH_HPP */
+
diff --git a/packages/kokkos/example/multi_fem/HexElement.hpp b/packages/kokkos/example/multi_fem/HexElement.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c1c045a29bf7e0e3660b21a4cf33677337d8bc9b
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/HexElement.hpp
@@ -0,0 +1,268 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef ELEMENTHEX_HPP
+#define ELEMENTHEX_HPP
+
+namespace HybridFEM {
+
+template< unsigned NodeCount >
+class HexElement_TensorData ;
+
+template< unsigned NodeCount , class Device >
+class HexElement_TensorEval ;
+
+//----------------------------------------------------------------------------
+/** \brief  Tensor-product shape function and quadrature data for the 8-node hex element on [-1,1]^3 */
+template<>
+class HexElement_TensorData< 8 > {
+public:
+
+  static const unsigned element_node_count    = 8 ;
+  static const unsigned spatial_dimension     = 3 ;
+  static const unsigned integration_count_1d  = 2 ;
+  static const unsigned function_count_1d     = 2 ;
+
+  float values_1d [ function_count_1d ][ integration_count_1d ];
+  float derivs_1d [ function_count_1d ][ integration_count_1d ];
+  float weights_1d[ integration_count_1d ];
+
+  unsigned char eval_map[ element_node_count ][4] ;
+
+  static float eval_value_1d( const unsigned jf , const float x )
+  {
+    return 0 == jf ? 0.5 * ( 1.0 - x ) : (
+           1 == jf ? 0.5 * ( 1.0 + x ) : 0 );
+  }
+
+  static float eval_deriv_1d( const unsigned jf , const float )
+  {
+    return 0 == jf ? -0.5 : (
+           1 == jf ?  0.5 : 0 );
+  }
+
+  HexElement_TensorData()
+  {
+    const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
+      { { 0 , 0 , 0 },
+        { 1 , 0 , 0 },
+        { 1 , 1 , 0 },
+        { 0 , 1 , 0 },
+        { 0 , 0 , 1 },
+        { 1 , 0 , 1 },
+        { 1 , 1 , 1 },
+        { 0 , 1 , 1 } };
+
+    weights_1d[0] = 1 ;
+    weights_1d[1] = 1 ;
+
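+    // Two-point Gauss-Legendre quadrature on [-1,1]: points +/- 1/sqrt(3), unit weights.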
+    const float points_1d[ integration_count_1d ] =
+      { -0.577350269 , 0.577350269 };
+
+    for ( unsigned i = 0 ; i < element_node_count ; ++i ) {
+      eval_map[i][0] = tmp_map[i][0];
+      eval_map[i][1] = tmp_map[i][1];
+      eval_map[i][2] = tmp_map[i][2];
+    }
+
+    for ( unsigned xp = 0 ; xp < integration_count_1d ; ++xp ) {
+    for ( unsigned xf = 0 ; xf < function_count_1d ; ++xf ) {
+      values_1d[xp][xf] = eval_value_1d( xf , points_1d[xp] );
+      derivs_1d[xp][xf] = eval_deriv_1d( xf , points_1d[xp] );
+    }}
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template<>
+class HexElement_TensorData< 27 > {
+public:
+
+  static const unsigned element_node_count    = 27 ;
+  static const unsigned spatial_dimension     = 3 ;
+  static const unsigned integration_count_1d  = 3 ;
+  static const unsigned function_count_1d     = 3 ;
+
+  float values_1d [ function_count_1d ][ integration_count_1d ];
+  float derivs_1d [ function_count_1d ][ integration_count_1d ];
+  float weights_1d[ integration_count_1d ];
+
+  unsigned char eval_map[ element_node_count ][4] ;
+
+  // sizeof(HexElement_TensorData<27>) = 192 bytes =
+  //   sizeof(float) * 9   ( values_1d  ) +
+  //   sizeof(float) * 9   ( derivs_1d  ) +
+  //   sizeof(float) * 3   ( weights_1d ) +
+  //   sizeof(unsigned char) * 27 * 4  ( eval_map )
+
+  static float eval_value_1d( const unsigned jf , const float p )
+  {
+    return 0 == jf ? 0.5 * p * ( p - 1 ) : (
+           1 == jf ? 1.0 - p * p : (
+           2 == jf ? 0.5 * p * ( p + 1 ) : 0 ));
+  }
+
+  static float eval_deriv_1d( const unsigned jf , const float p )
+  {
+    return 0 == jf ? p - 0.5 : (
+           1 == jf ? -2.0 * p : (
+           2 == jf ? p + 0.5 : 0 ));
+  }
+
+  HexElement_TensorData()
+  {
+    const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
+      { { 0 , 0 , 0 },
+        { 2 , 0 , 0 },
+        { 2 , 2 , 0 },
+        { 0 , 2 , 0 },
+        { 0 , 0 , 2 },
+        { 2 , 0 , 2 },
+        { 2 , 2 , 2 },
+        { 0 , 2 , 2 },
+        { 1 , 0 , 0 },
+        { 2 , 1 , 0 },
+        { 1 , 2 , 0 },
+        { 0 , 1 , 0 },
+        { 0 , 0 , 1 },
+        { 2 , 0 , 1 },
+        { 2 , 2 , 1 },
+        { 0 , 2 , 1 },
+        { 1 , 0 , 2 },
+        { 2 , 1 , 2 },
+        { 1 , 2 , 2 },
+        { 0 , 1 , 2 },
+        { 1 , 1 , 1 },
+        { 1 , 1 , 0 },
+        { 1 , 1 , 2 },
+        { 0 , 1 , 1 },
+        { 2 , 1 , 1 },
+        { 1 , 0 , 1 },
+        { 1 , 2 , 1 } };
+
+    // Interval [-1,1]
+
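+    // Three-point Gauss-Legendre quadrature: weights 5/9, 8/9, 5/9 at points -sqrt(3/5), 0, +sqrt(3/5).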
+    weights_1d[0] = 0.555555556 ;
+    weights_1d[1] = 0.888888889 ;
+    weights_1d[2] = 0.555555556 ;
+
+    const float points_1d[3] = { -0.774596669 ,
+                                  0.000000000 ,
+                                  0.774596669 };
+
+    for ( unsigned i = 0 ; i < element_node_count ; ++i ) {
+      eval_map[i][0] = tmp_map[i][0];
+      eval_map[i][1] = tmp_map[i][1];
+      eval_map[i][2] = tmp_map[i][2];
+    }
+
+    for ( unsigned xp = 0 ; xp < integration_count_1d ; ++xp ) {
+    for ( unsigned xf = 0 ; xf < function_count_1d ; ++xf ) {
+      values_1d[xp][xf] = eval_value_1d( xf , points_1d[xp] );
+      derivs_1d[xp][xf] = eval_deriv_1d( xf , points_1d[xp] );
+    }}
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< unsigned NodeCount >
+class HexElement_Data {
+public:
+  static const unsigned spatial_dimension   = 3 ;
+  static const unsigned element_node_count  = NodeCount ;
+  static const unsigned integration_count   = NodeCount ;
+  static const unsigned function_count      = NodeCount ;
+
+  float weights[   integration_count ] ;
+  float values[    integration_count ][ function_count ];
+  float gradients[ integration_count ][ spatial_dimension ][ function_count ];
+
+  HexElement_Data()
+  {
+    HexElement_TensorData< NodeCount > tensor_data ;
+
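+    // Build the 3-D tensor-product values, weights, and gradients from the 1-D
+    // data; eval_map gives the 1-D (x,y,z) indices of each point and function.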
+    for ( unsigned ip = 0 ; ip < integration_count ; ++ip ) {
+
+      const unsigned ipx = tensor_data.eval_map[ip][0] ;
+      const unsigned ipy = tensor_data.eval_map[ip][1] ;
+      const unsigned ipz = tensor_data.eval_map[ip][2] ;
+
+      weights[ip] = tensor_data.weights_1d[ ipx ] *
+                    tensor_data.weights_1d[ ipy ] *
+                    tensor_data.weights_1d[ ipz ] ;
+
+      for ( unsigned jf = 0 ; jf < function_count ; ++jf ) {
+
+        const unsigned jfx = tensor_data.eval_map[jf][0] ;
+        const unsigned jfy = tensor_data.eval_map[jf][1] ;
+        const unsigned jfz = tensor_data.eval_map[jf][2] ;
+
+        values[ip][jf] = tensor_data.values_1d[ ipx ][ jfx ] *
+                         tensor_data.values_1d[ ipy ][ jfy ] *
+                         tensor_data.values_1d[ ipz ][ jfz ] ;
+
+        gradients[ip][0][jf] = tensor_data.derivs_1d[ ipx ][ jfx ] *
+                               tensor_data.values_1d[ ipy ][ jfy ] *
+                               tensor_data.values_1d[ ipz ][ jfz ] ;
+
+        gradients[ip][1][jf] = tensor_data.values_1d[ ipx ][ jfx ] *
+                               tensor_data.derivs_1d[ ipy ][ jfy ] *
+                               tensor_data.values_1d[ ipz ][ jfz ] ;
+
+        gradients[ip][2][jf] = tensor_data.values_1d[ ipx ][ jfx ] *
+                               tensor_data.values_1d[ ipy ][ jfy ] *
+                               tensor_data.derivs_1d[ ipz ][ jfz ] ;
+      }
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+
+} /* namespace HybridFEM */
+
+#endif /* #ifndef ELEMENTHEX_HPP */
+
+
diff --git a/packages/kokkos/example/multi_fem/HexExplicitFunctions.hpp b/packages/kokkos/example/multi_fem/HexExplicitFunctions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..48f535f35b941c90339563040bbfc9491be0cf62
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/HexExplicitFunctions.hpp
@@ -0,0 +1,443 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HEXEXPLICITFUNCTIONS_HPP
+#define KOKKOS_HEXEXPLICITFUNCTIONS_HPP
+
+#include <cmath>
+
+namespace Explicit {
+
+struct Hex8Functions
+{
+  static const unsigned SpatialDim    = 3 ;
+  static const unsigned ElemNodeCount = 8 ;
+
+  // Indices for full 3x3 tensor:
+
+  static const unsigned K_F_XX = 0 ;
+  static const unsigned K_F_YY = 1 ;
+  static const unsigned K_F_ZZ = 2 ;
+  static const unsigned K_F_XY = 3 ;
+  static const unsigned K_F_YZ = 4 ;
+  static const unsigned K_F_ZX = 5 ;
+  static const unsigned K_F_YX = 6 ;
+  static const unsigned K_F_ZY = 7 ;
+  static const unsigned K_F_XZ = 8 ;
+  static const unsigned K_F_SIZE = 9 ;
+
+  //  Indexes into a 3 by 3 symmetric tensor stored as a length 6 vector
+
+  static const unsigned K_S_XX = 0 ;
+  static const unsigned K_S_YY = 1 ;
+  static const unsigned K_S_ZZ = 2 ;
+  static const unsigned K_S_XY = 3 ;
+  static const unsigned K_S_YZ = 4 ;
+  static const unsigned K_S_ZX = 5 ;
+  static const unsigned K_S_YX = 3 ;
+  static const unsigned K_S_ZY = 4 ;
+  static const unsigned K_S_XZ = 5 ;
+  static const unsigned K_S_SIZE = 6 ;
+
+  //  Indexes into a 3 by 3 skew symmetric tensor stored as a length 3 vector
+
+  static const unsigned K_V_XY = 0 ;
+  static const unsigned K_V_YZ = 1 ;
+  static const unsigned K_V_ZX = 2 ;
+  static const unsigned K_V_SIZE = 3 ;
+
+  //--------------------------------------------------------------------------
+
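+  // Dot product of two length-8 vectors.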
+  template< typename ScalarA , typename ScalarB >
+  KOKKOS_INLINE_FUNCTION static
+  double dot8( const ScalarA * const a , const ScalarB * const b )
+  { return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3] +
+           a[4] * b[4] + a[5] * b[5] + a[6] * b[6] + a[7] * b[7] ; }
+
+  //--------------------------------------------------------------------------
+
+  template< class ScalarPrecise ,
+            class ScalarCompact >
+  KOKKOS_INLINE_FUNCTION static
+  void grad( const ScalarPrecise x[] ,
+             const ScalarPrecise z[] ,
+                   ScalarCompact grad_y[] )
+  {
+    const ScalarCompact R42=(x[3] - x[1]);
+    const ScalarCompact R52=(x[4] - x[1]);
+    const ScalarCompact R54=(x[4] - x[3]);
+
+    const ScalarCompact R63=(x[5] - x[2]);
+    const ScalarCompact R83=(x[7] - x[2]);
+    const ScalarCompact R86=(x[7] - x[5]);
+
+    const ScalarCompact R31=(x[2] - x[0]);
+    const ScalarCompact R61=(x[5] - x[0]);
+    const ScalarCompact R74=(x[6] - x[3]);
+
+    const ScalarCompact R72=(x[6] - x[1]);
+    const ScalarCompact R75=(x[6] - x[4]);
+    const ScalarCompact R81=(x[7] - x[0]);
+
+    const ScalarCompact t1=(R63 + R54);
+    const ScalarCompact t2=(R61 + R74);
+    const ScalarCompact t3=(R72 + R81);
+
+    const ScalarCompact t4 =(R86 + R42);
+    const ScalarCompact t5 =(R83 + R52);
+    const ScalarCompact t6 =(R75 + R31);
+
+    //  Calculate Y gradient from X and Z data
+
+    grad_y[0] = (z[1] *  t1) - (z[2] * R42) - (z[3] *  t5)  + (z[4] *  t4) + (z[5] * R52) - (z[7] * R54);
+    grad_y[1] = (z[2] *  t2) + (z[3] * R31) - (z[0] *  t1)  - (z[5] *  t6) + (z[6] * R63) - (z[4] * R61);
+    grad_y[2] = (z[3] *  t3) + (z[0] * R42) - (z[1] *  t2)  - (z[6] *  t4) + (z[7] * R74) - (z[5] * R72);
+    grad_y[3] = (z[0] *  t5) - (z[1] * R31) - (z[2] *  t3)  + (z[7] *  t6) + (z[4] * R81) - (z[6] * R83);
+    grad_y[4] = (z[5] *  t3) + (z[6] * R86) - (z[7] *  t2)  - (z[0] *  t4) - (z[3] * R81) + (z[1] * R61);
+    grad_y[5] = (z[6] *  t5) - (z[4] *  t3)  - (z[7] * R75) + (z[1] *  t6) - (z[0] * R52) + (z[2] * R72);
+    grad_y[6] = (z[7] *  t1) - (z[5] *  t5)  - (z[4] * R86) + (z[2] *  t4) - (z[1] * R63) + (z[3] * R83);
+    grad_y[7] = (z[4] *  t2) - (z[6] *  t1)  + (z[5] * R75) - (z[3] *  t6) - (z[2] * R74) + (z[0] * R54);
+  }
+
+  template< class ScalarPrecise ,
+            class ScalarCompact >
+  static KOKKOS_INLINE_FUNCTION
+  void grad( const ScalarPrecise x[] ,
+             const ScalarPrecise y[] ,
+             const ScalarPrecise z[] ,
+                   ScalarCompact grad_x[] ,
+                   ScalarCompact grad_y[] ,
+                   ScalarCompact grad_z[] )
+  {
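+    // Each gradient component follows from the same kernel by a cyclic
+    // permutation of the coordinate axes.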
+    grad( x , z , grad_y );
+    grad( z , y , grad_x );
+    grad( y , x , grad_z );
+  }
+
+  //--------------------------------------------------------------------------
+
+  template< class ScalarPrecise ,
+            class ScalarCompact >
+  KOKKOS_INLINE_FUNCTION static
+  void polar_decomp( const float dt ,
+                     const ScalarCompact v_gr[] ,
+                           ScalarPrecise stretch[] /* INOUT */ ,
+                           ScalarCompact str_ten[] /* OUT */ ,
+                           ScalarCompact rot[]     /* OUT */ )
+  {
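+    // Incremental polar decomposition of the velocity gradient v_gr: split it
+    // into a symmetric rate of deformation (str_ten) and a skew-symmetric spin
+    // (vort), then advance the rotation and stretch tensors over the step dt.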
+    const float dt_half = 0.5 * dt;
+
+    ScalarCompact vort[ K_V_SIZE ];  // Vorticity
+
+    //  Symmetric part
+    str_ten[K_S_XX] = v_gr[K_F_XX];
+    str_ten[K_S_YY] = v_gr[K_F_YY];
+    str_ten[K_S_ZZ] = v_gr[K_F_ZZ];
+    str_ten[K_S_XY] = 0.5 * ( v_gr[K_F_XY] + v_gr[K_F_YX] );
+    str_ten[K_S_YZ] = 0.5 * ( v_gr[K_F_YZ] + v_gr[K_F_ZY] );
+    str_ten[K_S_ZX] = 0.5 * ( v_gr[K_F_ZX] + v_gr[K_F_XZ] );
+
+    //  Skew Symmetric part
+    vort[K_V_XY] = 0.5 * ( v_gr[K_F_XY] - v_gr[K_F_YX] );
+    vort[K_V_YZ] = 0.5 * ( v_gr[K_F_YZ] - v_gr[K_F_ZY] );
+    vort[K_V_ZX] = 0.5 * ( v_gr[K_F_ZX] - v_gr[K_F_XZ] );
+
+    //   Calculate the rates of rotation via Gauss elimination.
+
+    ScalarCompact z1 = str_ten[K_S_XY] * stretch[K_S_ZX] -
+                       str_ten[K_S_ZX] * stretch[K_S_XY] +
+                       str_ten[K_S_YY] * stretch[K_S_YZ] -
+                       str_ten[K_S_YZ] * stretch[K_S_YY] +
+                       str_ten[K_S_YZ] * stretch[K_S_ZZ] -
+                       str_ten[K_S_ZZ] * stretch[K_S_YZ];
+
+    ScalarCompact z2 = str_ten[K_S_ZX] * stretch[K_S_XX] -
+                       str_ten[K_S_XX] * stretch[K_S_ZX] +
+                       str_ten[K_S_YZ] * stretch[K_S_XY] -
+                       str_ten[K_S_XY] * stretch[K_S_YZ] +
+                       str_ten[K_S_ZZ] * stretch[K_S_ZX] -
+                       str_ten[K_S_ZX] * stretch[K_S_ZZ];
+
+    ScalarCompact z3 = str_ten[K_S_XX] * stretch[K_S_XY] -
+                       str_ten[K_S_XY] * stretch[K_S_XX] +
+                       str_ten[K_S_XY] * stretch[K_S_YY] -
+                       str_ten[K_S_YY] * stretch[K_S_XY] +
+                       str_ten[K_S_ZX] * stretch[K_S_YZ] -
+                       str_ten[K_S_YZ] * stretch[K_S_ZX];
+
+    {
+      //   forward elimination
+
+      const ScalarCompact a1inv  = 1.0 / (stretch[K_S_YY] + stretch[K_S_ZZ]);
+      const ScalarCompact a4BYa1 = -1 * stretch[K_S_XY] * a1inv;
+      const ScalarCompact a2inv  = 1.0 / (stretch[K_S_ZZ] + stretch[K_S_XX] + stretch[K_S_XY] * a4BYa1);
+
+      const ScalarCompact a5 = -stretch[K_S_YZ] + stretch[K_S_ZX] * a4BYa1;
+
+      z2 -= z1 * a4BYa1;
+      const ScalarCompact a6BYa1 = -1 * stretch[K_S_ZX] * a1inv;
+      const ScalarCompact a5BYa2 = a5 * a2inv;
+      z3 -= z1 * a6BYa1 - z2 * a5BYa2;
+
+      //   backward substitution -
+
+      z3 /= (stretch[K_S_XX] + stretch[K_S_YY] + stretch[K_S_ZX] * a6BYa1 + a5 * a5BYa2);
+      z2 = (z2 - a5 * z3) * a2inv;
+      z1 = (z1 * a1inv - a6BYa1 * z3 - a4BYa1 * z2);
+    }
+
+    //   Calculate rotation rates - recall that the spin rate is an antisymmetric tensor,
+    //   so compute the spin rate vector as the dual of the spin rate tensor,
+    //   i.e.  w_i = e_ijk * spin_rate_jk
+
+    z1 += vort[K_V_YZ];
+    z2 += vort[K_V_ZX];
+    z3 += vort[K_V_XY];
+
+    {
+      //   update rotation tensor:
+      //  1) premultiply old rotation tensor to get right-hand side.
+
+      ScalarCompact r_XX = rot[K_F_XX] + dt_half*( z3 * rot[K_F_YX] - z2 * rot[K_F_ZX] );
+      ScalarCompact r_YX = rot[K_F_YX] + dt_half*( z1 * rot[K_F_ZX] - z3 * rot[K_F_XX] );
+      ScalarCompact r_ZX = rot[K_F_ZX] + dt_half*( z2 * rot[K_F_XX] - z1 * rot[K_F_YX] );
+      ScalarCompact r_XY = rot[K_F_XY] + dt_half*( z3 * rot[K_F_YY] - z2 * rot[K_F_ZY] );
+      ScalarCompact r_YY = rot[K_F_YY] + dt_half*( z1 * rot[K_F_ZY] - z3 * rot[K_F_XY] );
+      ScalarCompact r_ZY = rot[K_F_ZY] + dt_half*( z2 * rot[K_F_XY] - z1 * rot[K_F_YY] );
+      ScalarCompact r_XZ = rot[K_F_XZ] + dt_half*( z3 * rot[K_F_YZ] - z2 * rot[K_F_ZZ] );
+      ScalarCompact r_YZ = rot[K_F_YZ] + dt_half*( z1 * rot[K_F_ZZ] - z3 * rot[K_F_XZ] );
+      ScalarCompact r_ZZ = rot[K_F_ZZ] + dt_half*( z2 * rot[K_F_XZ] - z1 * rot[K_F_YZ] );
+
+
+      //  2) solve for the new rotation tensor via Gauss elimination.
+      //   forward elimination -
+
+      const ScalarCompact a12 = - dt_half * z3;
+      const ScalarCompact a13 =   dt_half * z2;
+            ScalarCompact b32 = - dt_half * z1;
+      const ScalarCompact a22inv = 1.0 / (1.0 + a12 * a12);
+
+      const ScalarCompact a13a12 = a13*a12;
+      const ScalarCompact a23 = b32 + a13a12;
+
+      r_YX += r_XX * a12;
+      r_YY += r_XY * a12;
+      r_YZ += r_XZ * a12;
+
+      b32 = (b32 - a13a12) * a22inv;
+
+      r_ZX += r_XX * a13 + r_YX * b32;
+      r_ZY += r_XY * a13 + r_YY * b32;
+      r_ZZ += r_XZ * a13 + r_YZ * b32;
+
+      //   backward substitution -
+
+      const ScalarCompact a33inv = 1.0 / (1.0 + a13 * a13 + a23 * b32);
+
+      rot[K_F_ZX] = r_ZX * a33inv;
+      rot[K_F_ZY] = r_ZY * a33inv;
+      rot[K_F_ZZ] = r_ZZ * a33inv;
+      rot[K_F_YX] = ( r_YX - rot[K_F_ZX] * a23 ) * a22inv;
+      rot[K_F_YY] = ( r_YY - rot[K_F_ZY] * a23 ) * a22inv;
+      rot[K_F_YZ] = ( r_YZ - rot[K_F_ZZ] * a23 ) * a22inv;
+      rot[K_F_XX] = r_XX - rot[K_F_ZX] * a13 - rot[K_F_YX] * a12;
+      rot[K_F_XY] = r_XY - rot[K_F_ZY] * a13 - rot[K_F_YY] * a12;
+      rot[K_F_XZ] = r_XZ - rot[K_F_ZZ] * a13 - rot[K_F_YZ] * a12;
+    }
+
+    //   update stretch tensor in the new configuration -
+
+    const ScalarCompact a1 = str_ten[K_S_XY] + vort[K_V_XY];
+    const ScalarCompact a2 = str_ten[K_S_YZ] + vort[K_V_YZ];
+    const ScalarCompact a3 = str_ten[K_S_ZX] + vort[K_V_ZX];
+    const ScalarCompact b1 = str_ten[K_S_ZX] - vort[K_V_ZX];
+    const ScalarCompact b2 = str_ten[K_S_XY] - vort[K_V_XY];
+    const ScalarCompact b3 = str_ten[K_S_YZ] - vort[K_V_YZ];
+
+    const ScalarCompact s_XX = stretch[K_S_XX];
+    const ScalarCompact s_YY = stretch[K_S_YY];
+    const ScalarCompact s_ZZ = stretch[K_S_ZZ];
+    const ScalarCompact s_XY = stretch[K_S_XY];
+    const ScalarCompact s_YZ = stretch[K_S_YZ];
+    const ScalarCompact s_ZX = stretch[K_S_ZX];
+
+    stretch[K_S_XX] += dt * (str_ten[K_S_XX] * s_XX + ( a1 + z3 ) * s_XY + ( b1 - z2 ) * s_ZX);
+    stretch[K_S_YY] += dt * (str_ten[K_S_YY] * s_YY + ( a2 + z1 ) * s_YZ + ( b2 - z3 ) * s_XY);
+    stretch[K_S_ZZ] += dt * (str_ten[K_S_ZZ] * s_ZZ + ( a3 + z2 ) * s_ZX + ( b3 - z1 ) * s_YZ);
+    stretch[K_S_XY] += dt * (str_ten[K_S_XX] * s_XY + ( a1 )      * s_YY + ( b1      ) * s_YZ - z3 * s_XX + z1 * s_ZX);
+    stretch[K_S_YZ] += dt * (str_ten[K_S_YY] * s_YZ + ( a2 )      * s_ZZ + ( b2      ) * s_ZX - z1 * s_YY + z2 * s_XY);
+    stretch[K_S_ZX] += dt * (str_ten[K_S_ZZ] * s_ZX + ( a3 )      * s_XX + ( b3      ) * s_XY - z2 * s_ZZ + z3 * s_YZ);
+  }
+
+  //--------------------------------------------------------------------------
+
+  template< typename ScalarCompact >
+  static KOKKOS_INLINE_FUNCTION
+  void rotate_tensor( const ScalarCompact str_ten[] ,
+                      const ScalarCompact rot[] ,
+                            ScalarCompact rot_str[] )
+  {
+    ScalarCompact t[9];
+
+    t[0] = str_ten[K_S_XX]*rot[K_F_XX] + str_ten[K_S_XY]*rot[K_F_YX] + str_ten[K_S_XZ]*rot[K_F_ZX];
+    t[1] = str_ten[K_S_YX]*rot[K_F_XX] + str_ten[K_S_YY]*rot[K_F_YX] + str_ten[K_S_YZ]*rot[K_F_ZX];
+    t[2] = str_ten[K_S_ZX]*rot[K_F_XX] + str_ten[K_S_ZY]*rot[K_F_YX] + str_ten[K_S_ZZ]*rot[K_F_ZX];
+
+    t[3] = str_ten[K_S_XX]*rot[K_F_XY] + str_ten[K_S_XY]*rot[K_F_YY] + str_ten[K_S_XZ]*rot[K_F_ZY];
+    t[4] = str_ten[K_S_YX]*rot[K_F_XY] + str_ten[K_S_YY]*rot[K_F_YY] + str_ten[K_S_YZ]*rot[K_F_ZY];
+    t[5] = str_ten[K_S_ZX]*rot[K_F_XY] + str_ten[K_S_ZY]*rot[K_F_YY] + str_ten[K_S_ZZ]*rot[K_F_ZY];
+
+    t[6] = str_ten[K_S_XX]*rot[K_F_XZ] + str_ten[K_S_XY]*rot[K_F_YZ] + str_ten[K_S_XZ]*rot[K_F_ZZ];
+    t[7] = str_ten[K_S_YX]*rot[K_F_XZ] + str_ten[K_S_YY]*rot[K_F_YZ] + str_ten[K_S_YZ]*rot[K_F_ZZ];
+    t[8] = str_ten[K_S_ZX]*rot[K_F_XZ] + str_ten[K_S_ZY]*rot[K_F_YZ] + str_ten[K_S_ZZ]*rot[K_F_ZZ];
+
+
+    rot_str[ K_S_XX ] = rot[K_F_XX] * t[0] + rot[K_F_YX] * t[1] + rot[K_F_ZX] * t[2];
+    rot_str[ K_S_YY ] = rot[K_F_XY] * t[3] + rot[K_F_YY] * t[4] + rot[K_F_ZY] * t[5];
+    rot_str[ K_S_ZZ ] = rot[K_F_XZ] * t[6] + rot[K_F_YZ] * t[7] + rot[K_F_ZZ] * t[8];
+
+    rot_str[ K_S_XY ] = rot[K_F_XX] * t[3] + rot[K_F_YX] * t[4] + rot[K_F_ZX] * t[5];
+    rot_str[ K_S_YZ ] = rot[K_F_XY] * t[6] + rot[K_F_YY] * t[7] + rot[K_F_ZY] * t[8];
+    rot_str[ K_S_ZX ] = rot[K_F_XZ] * t[0] + rot[K_F_YZ] * t[1] + rot[K_F_ZZ] * t[2];
+  }
+
+  //--------------------------------------------------------------------------
+
+  template< class ScalarPrecise ,
+            class ScalarCompact >
+  static KOKKOS_INLINE_FUNCTION
+  void rotate_tensor_backward( const ScalarPrecise stress[] ,
+                               const ScalarCompact rot[] ,
+                                     ScalarCompact rot_stress[] )
+  {
+    ScalarCompact t[9] ;
+
+    t[0] = stress[K_S_XX]*rot[K_F_XX]+ stress[K_S_XY]*rot[K_F_XY]+ stress[K_S_XZ]*rot[K_F_XZ];
+    t[1] = stress[K_S_YX]*rot[K_F_XX]+ stress[K_S_YY]*rot[K_F_XY]+ stress[K_S_YZ]*rot[K_F_XZ];
+    t[2] = stress[K_S_ZX]*rot[K_F_XX]+ stress[K_S_ZY]*rot[K_F_XY]+ stress[K_S_ZZ]*rot[K_F_XZ];
+    t[3] = stress[K_S_XX]*rot[K_F_YX]+ stress[K_S_XY]*rot[K_F_YY]+ stress[K_S_XZ]*rot[K_F_YZ];
+    t[4] = stress[K_S_YX]*rot[K_F_YX]+ stress[K_S_YY]*rot[K_F_YY]+ stress[K_S_YZ]*rot[K_F_YZ];
+    t[5] = stress[K_S_ZX]*rot[K_F_YX]+ stress[K_S_ZY]*rot[K_F_YY]+ stress[K_S_ZZ]*rot[K_F_YZ];
+    t[6] = stress[K_S_XX]*rot[K_F_ZX]+ stress[K_S_XY]*rot[K_F_ZY]+ stress[K_S_XZ]*rot[K_F_ZZ];
+    t[7] = stress[K_S_YX]*rot[K_F_ZX]+ stress[K_S_YY]*rot[K_F_ZY]+ stress[K_S_YZ]*rot[K_F_ZZ];
+    t[8] = stress[K_S_ZX]*rot[K_F_ZX]+ stress[K_S_ZY]*rot[K_F_ZY]+ stress[K_S_ZZ]*rot[K_F_ZZ];
+
+    rot_stress[ K_S_XX ] = rot[K_F_XX]*t[0] + rot[K_F_XY]*t[1] + rot[K_F_XZ]*t[2];
+    rot_stress[ K_S_YY ] = rot[K_F_YX]*t[3] + rot[K_F_YY]*t[4] + rot[K_F_YZ]*t[5];
+    rot_stress[ K_S_ZZ ] = rot[K_F_ZX]*t[6] + rot[K_F_ZY]*t[7] + rot[K_F_ZZ]*t[8];
+
+    rot_stress[ K_S_XY ] = rot[K_F_XX]*t[3] + rot[K_F_XY]*t[4] + rot[K_F_XZ]*t[5];
+    rot_stress[ K_S_YZ ] = rot[K_F_YX]*t[6] + rot[K_F_YY]*t[7] + rot[K_F_YZ]*t[8];
+    rot_stress[ K_S_ZX ] = rot[K_F_ZX]*t[0] + rot[K_F_ZY]*t[1] + rot[K_F_ZZ]*t[2];
+  }
+
+  //--------------------------------------------------------------------------
+
+  template< class ScalarPrecise ,
+            class ScalarCompact >
+  KOKKOS_INLINE_FUNCTION static
+  void update_stress( const float dt ,
+                      const float two_mu ,
+                      const float bulk_modulus ,
+                      const ScalarCompact rot_str[] ,
+                            ScalarPrecise stress[] )
+  {
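+    // Isotropic hypoelastic stress update: the deviatoric part of the rotated
+    // strain rate is scaled by 2*mu and the volumetric part by the bulk modulus.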
+    const ScalarCompact e = rot_str[ K_S_XX ] + rot_str[ K_S_YY ] + rot_str[ K_S_ZZ ] ;
+    const ScalarCompact eb = e * bulk_modulus ;
+    const ScalarCompact e3 = e / 3.0 ;
+
+    stress[K_S_XX] += dt * ( two_mu * ( rot_str[K_S_XX] - e3 ) + eb );
+    stress[K_S_YY] += dt * ( two_mu * ( rot_str[K_S_YY] - e3 ) + eb );
+    stress[K_S_ZZ] += dt * ( two_mu * ( rot_str[K_S_ZZ] - e3 ) + eb );
+
+    stress[K_S_XY] += dt * two_mu * rot_str[K_S_XY];
+    stress[K_S_YZ] += dt * two_mu * rot_str[K_S_YZ];
+    stress[K_S_ZX] += dt * two_mu * rot_str[K_S_ZX];
+  }
+
+  //--------------------------------------------------------------------------
+
+  template< class ScalarPrecise ,
+            class ScalarCompact >
+  static KOKKOS_INLINE_FUNCTION
+  void comp_force( const ScalarPrecise vx[] ,
+                   const ScalarPrecise vy[] ,
+                   const ScalarPrecise vz[] ,
+                   const ScalarCompact grad_x[] ,
+                   const ScalarCompact grad_y[] ,
+                   const ScalarCompact grad_z[] ,
+                   const ScalarCompact total_stress12th[] ,
+                         ScalarCompact force[][ SpatialDim ] ,
+                         ScalarCompact & energy )
+  {
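+    // Nodal forces are the stress contracted with the shape-function gradients;
+    // the internal energy accumulates force dotted with the nodal velocities.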
+    ScalarPrecise internal_energy = 0 ;
+
+    for ( unsigned inode = 0; inode < ElemNodeCount ; ++inode ) {
+
+      force[inode][0] = total_stress12th[K_S_XX] * grad_x[inode] +
+                        total_stress12th[K_S_XY] * grad_y[inode] +
+                        total_stress12th[K_S_XZ] * grad_z[inode] ;
+
+      force[inode][1] = total_stress12th[K_S_YX] * grad_x[inode] +
+                        total_stress12th[K_S_YY] * grad_y[inode] +
+                        total_stress12th[K_S_YZ] * grad_z[inode] ;
+
+      force[inode][2] = total_stress12th[K_S_ZX] * grad_x[inode] +
+                        total_stress12th[K_S_ZY] * grad_y[inode] +
+                        total_stress12th[K_S_ZZ] * grad_z[inode] ;
+
+      internal_energy += force[inode][0] * vx[inode] +
+                         force[inode][1] * vy[inode] +
+                         force[inode][2] * vz[inode] ;
+    }
+
+    energy = internal_energy ;
+  }
+
+  //--------------------------------------------------------------------------
+};
+
+} // namespace Explicit
+
+#endif /* #ifndef KOKKOS_HEXEXPLICITFUNCTIONS_HPP */
+
diff --git a/packages/kokkos/example/multi_fem/Implicit.hpp b/packages/kokkos/example/multi_fem/Implicit.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bc8659942e2f969f4a63c9a2b718ba1aea984ca4
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/Implicit.hpp
@@ -0,0 +1,341 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef HYBRIDFEM_IMPLICIT_HPP
+#define HYBRIDFEM_IMPLICIT_HPP
+
+#include <utility>
+#include <iostream>
+#include <iomanip>
+
+#include <Kokkos_Core.hpp>
+#include <SparseLinearSystem.hpp>
+#include <SparseLinearSystemFill.hpp>
+#include <ImplicitFunctors.hpp>
+#include <FEMesh.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace HybridFEM {
+namespace Implicit {
+
+struct PerformanceData {
+  double mesh_time ;
+  double graph_time ;
+  double elem_time ;
+  double matrix_gather_fill_time ;
+  double matrix_boundary_condition_time ;
+  double cg_iteration_time ;
+
+  PerformanceData()
+    : mesh_time(0)
+    , graph_time(0)
+    , elem_time(0)
+    , matrix_gather_fill_time(0)
+    , matrix_boundary_condition_time(0)
+    , cg_iteration_time(0)
+    {}
+
+  void best( const PerformanceData & rhs )
+  {
+    mesh_time = std::min( mesh_time , rhs.mesh_time );
+    graph_time = std::min( graph_time , rhs.graph_time );
+    elem_time = std::min( elem_time , rhs.elem_time );
+    matrix_gather_fill_time = std::min( matrix_gather_fill_time , rhs.matrix_gather_fill_time );
+    matrix_boundary_condition_time = std::min( matrix_boundary_condition_time , rhs.matrix_boundary_condition_time );
+    cg_iteration_time = std::min( cg_iteration_time , rhs.cg_iteration_time );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< typename Scalar , class FixtureType >
+PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
+                     const int , // global_max_x ,
+                     const int , // global_max_y ,
+                     const int global_max_z ,
+                     const bool print_sample )
+{
+  typedef Scalar                              scalar_type ;
+  typedef FixtureType                         fixture_type ;
+  typedef typename fixture_type::execution_space  execution_space;
+  //typedef typename execution_space::size_type     size_type ; // unused
+
+  typedef typename fixture_type::FEMeshType mesh_type ;
+  typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;
+
+  enum { ElementNodeCount = fixture_type::element_node_count };
+
+  const comm::Machine machine = mesh.parallel_data_map.machine ;
+
+  const size_t element_count = mesh.elem_node_ids.dimension_0();
+
+  const size_t iteration_limit = 200 ;
+  const double residual_tolerance = 1e-14 ;
+
+  size_t iteration_count = 0 ;
+  double residual_norm = 0 ;
+
+  PerformanceData perf_data ;
+
+  //------------------------------------
+  // Sparse linear system types:
+
+  typedef Kokkos::View< scalar_type* , execution_space >   vector_type ;
+  typedef Kokkos::CrsMatrix< scalar_type , execution_space >     matrix_type ;
+  typedef typename matrix_type::graph_type         matrix_graph_type ;
+  typedef typename matrix_type::coefficients_type  matrix_coefficients_type ;
+
+  typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;
+
+  //------------------------------------
+  // Problem setup types:
+
+  typedef ElementComputation< scalar_type , scalar_type , execution_space > ElementFunctor ;
+  typedef DirichletBoundary< scalar_type , scalar_type , execution_space > BoundaryFunctor ;
+
+  typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
+  typedef typename ElementFunctor::elem_vectors_type  elem_vectors_type ;
+
+  typedef GatherFill< matrix_type ,
+                      mesh_type ,
+                      elem_matrices_type ,
+                      elem_vectors_type > GatherFillFunctor ;
+
+  //------------------------------------
+
+  const scalar_type elem_coeff_K = 2 ;
+  const scalar_type elem_load_Q  = 1 ;
+
+  matrix_type linsys_matrix ;
+  vector_type linsys_rhs ;
+  vector_type linsys_solution ;
+
+  typename graph_factory::element_map_type element_map ;
+
+  Kokkos::Timer wall_clock ;
+
+  //------------------------------------
+  // Generate sparse matrix graph and element->graph map.
+
+  graph_factory::create( mesh , linsys_matrix.graph , element_map );
+
+  execution_space::fence();
+  perf_data.graph_time = comm::max( machine , wall_clock.seconds() );
+
+  //------------------------------------
+  // Allocate linear system coefficients and rhs:
+
+  const size_t local_owned_length =
+    linsys_matrix.graph.row_map.dimension_0() - 1 ;
+
+  linsys_matrix.coefficients =
+    matrix_coefficients_type( "coeff" , linsys_matrix.graph.entries.dimension_0() );
+
+  linsys_rhs      = vector_type( "rhs" , local_owned_length );
+  linsys_solution = vector_type( "solution" , local_owned_length );
+
+  //------------------------------------
+  // Fill linear system
+  {
+    elem_matrices_type elem_matrices ;
+    elem_vectors_type  elem_vectors ;
+
+    if ( element_count ) {
+      elem_matrices = elem_matrices_type( std::string("elem_matrices"), element_count );
+      elem_vectors  = elem_vectors_type ( std::string("elem_vectors"), element_count );
+    }
+
+    //------------------------------------
+    // Compute element matrices and vectors:
+
+    wall_clock.reset();
+
+    ElementFunctor::apply( mesh ,
+                           elem_matrices , elem_vectors ,
+                           elem_coeff_K , elem_load_Q );
+
+    execution_space::fence();
+    perf_data.elem_time = comm::max( machine , wall_clock.seconds() );
+
+    //------------------------------------
+    // Fill linear system coefficients:
+
+    wall_clock.reset();
+
+    GatherFillFunctor::apply( linsys_matrix , linsys_rhs ,
+               mesh , element_map , elem_matrices , elem_vectors );
+
+    execution_space::fence();
+    perf_data.matrix_gather_fill_time = comm::max( machine , wall_clock.seconds() );
+
+    // Apply boundary conditions:
+
+    wall_clock.reset();
+
+    BoundaryFunctor::apply( linsys_matrix , linsys_rhs , mesh ,
+                            0 , global_max_z , 0 , global_max_z );
+
+    execution_space::fence();
+    perf_data.matrix_boundary_condition_time = comm::max( machine , wall_clock.seconds() );
+  }
+
+  //------------------------------------
+  // Solve linear system
+
+  cgsolve( mesh.parallel_data_map ,
+           linsys_matrix , linsys_rhs , linsys_solution ,
+           iteration_count , residual_norm ,
+           perf_data.cg_iteration_time ,
+           iteration_limit , residual_tolerance );
+
+  //------------------------------------
+
+  if ( print_sample ) {
+
+    typename mesh_type::node_coords_type::HostMirror coords_h =
+      Kokkos::create_mirror( mesh.node_coords );
+
+    typename vector_type::HostMirror X_h =
+      Kokkos::create_mirror( linsys_solution );
+
+    Kokkos::deep_copy( coords_h , mesh.node_coords );
+    Kokkos::deep_copy( X_h , linsys_solution );
+
+    for ( size_t i = 0 ; i < mesh.parallel_data_map.count_owned ; ++i ) {
+      const coordinate_scalar_type x = coords_h(i,0);
+      const coordinate_scalar_type y = coords_h(i,1);
+      const coordinate_scalar_type z = coords_h(i,2);
+
+      if ( x <= 0 && y <= 0 ) {
+        std::cout << "  node( " << x << " " << y << " " << z << " ) = "
+                  << X_h(i) << std::endl ;
+      }
+    }
+  }
+
+  return perf_data ;
+}
+
+//----------------------------------------------------------------------------
+
+template< typename Scalar , class Device >
+void driver( const char * const label ,
+             comm::Machine machine ,
+             const int gang_count ,
+             const int elem_count_beg ,
+             const int elem_count_end ,
+             const int runs )
+{
+  typedef Scalar              scalar_type ;
+  typedef Device              execution_space ;
+  typedef double              coordinate_scalar_type ;
+  typedef FixtureElementHex8  fixture_element_type ;
+
+  typedef BoxMeshFixture< coordinate_scalar_type ,
+                          execution_space ,
+                          fixture_element_type > fixture_type ;
+
+  typedef typename fixture_type::FEMeshType mesh_type ;
+
+  const size_t proc_count = comm::size( machine );
+  const size_t proc_rank  = comm::rank( machine );
+
+  if ( elem_count_beg == 0 || elem_count_end == 0 || runs == 0 ) return ;
+
+  if ( comm::rank( machine ) == 0 ) {
+    std::cout << std::endl ;
+    std::cout << "\"Kokkos::HybridFE::Implicit " << label << "\"" << std::endl;
+    std::cout << "\"Size\" ,  \"Graphing\" , \"Element\" , \"Fill\" ,   \"Boundary\" ,  \"CG-Iter\"" << std::endl
+              << "\"elems\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\"" << std::endl ;
+  }
+
+  for(int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
+  {
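+    // Choose box dimensions so the total element count ix * iy * iz is roughly i.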
+    const int ix = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
+    const int iy = ix + 1 ;
+    const int iz = 2 * iy ;
+    const int n  = ix * iy * iz ;
+
+    mesh_type mesh =
+      fixture_type::create( proc_count , proc_rank , gang_count ,
+                            ix , iy , iz );
+
+    mesh.parallel_data_map.machine = machine ;
+
+    PerformanceData perf_data , perf_best ;
+
+    for ( int j = 0 ; j < runs ; j++ ) {
+
+      perf_data = run<scalar_type,fixture_type>( mesh , ix , iy , iz , false );
+
+      if ( j == 0 ) {
+        perf_best = perf_data ;
+      }
+      else {
+        perf_best.best( perf_data );
+      }
+    }
+
+    if ( comm::rank( machine ) == 0 ) {
+
+      std::cout << std::setw(8) << n << " , "
+                << std::setw(10) << perf_best.graph_time * 1000 << " , "
+                << std::setw(10) << perf_best.elem_time * 1000 << " , "
+                << std::setw(10) << perf_best.matrix_gather_fill_time * 1000 << " , "
+                << std::setw(10) << perf_best.matrix_boundary_condition_time * 1000 << " , "
+                << std::setw(10) << perf_best.cg_iteration_time * 1000
+                << std::endl ;
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Implicit */
+} /* namespace HybridFEM */
+
+
+#endif /* #ifndef HYBRIDFEM_IMPLICIT_HPP */
+
diff --git a/packages/kokkos/example/multi_fem/ImplicitFunctors.hpp b/packages/kokkos/example/multi_fem/ImplicitFunctors.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4855939bb878bd651eab086d5a9dd2d27ee68426
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/ImplicitFunctors.hpp
@@ -0,0 +1,585 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <cstdlib>
+#include <cmath>
+
+namespace HybridFEM {
+namespace Implicit {
+
+//----------------------------------------------------------------------------
+
+template< typename Scalar , unsigned Dim , unsigned N >
+struct TensorIntegration ;
+
+template<typename Scalar >
+struct TensorIntegration<Scalar,1,1> {
+  Scalar pts[1] ;
+  Scalar wts[1] ;
+
+  TensorIntegration() { pts[0] = 0 ; wts[0] = 2 ; }
+};
+
+template<typename Scalar >
+struct TensorIntegration<Scalar,1,2>
+{
+  Scalar pts[2] ;
+  Scalar wts[2] ;
+
+  TensorIntegration()
+  {
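+    // Two-point Gauss-Legendre rule on [-1,1]: points +/- 1/sqrt(3), unit weights.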
+    const Scalar x2 = 0.577350269 ;
+    pts[0] = -x2; wts[0] = 1.0;
+    pts[1] =  x2; wts[1] = 1.0;
+  }
+};
+
+template<typename Scalar >
+struct TensorIntegration<Scalar,1,3>
+{
+  Scalar pts[3] ;
+  Scalar wts[3] ;
+
+  TensorIntegration()
+  {
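+    // Three-point Gauss-Legendre rule on [-1,1]: points 0 and +/- sqrt(3/5), weights 8/9 and 5/9.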
+    const Scalar x3 = 0.774596669 ;
+    const Scalar w1 = 0.555555556 ;
+    const Scalar w2 = 0.888888889 ;
+    pts[0] =  -x3 ;  wts[0] = w1 ;
+    pts[1] =    0 ;  wts[1] = w2 ;
+    pts[2] =   x3 ;  wts[2] = w1 ;
+  }
+};
+
+template< typename Scalar , unsigned Order >
+struct TensorIntegration<Scalar,3,Order>
+{
+  static const unsigned N = Order * Order * Order ;
+
+  Scalar pts[N][3] ;
+  Scalar wts[N];
+
+  TensorIntegration()
+  {
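+    // Form the 3-D rule as the tensor product of the 1-D rule in x, y, and z.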
+    TensorIntegration<Scalar,1,Order> oneD ;
+
+    unsigned n = 0 ;
+    for ( unsigned k = 0 ; k < Order ; ++k ) {
+    for ( unsigned j = 0 ; j < Order ; ++j ) {
+    for ( unsigned i = 0 ; i < Order ; ++i , ++n ) {
+      pts[n][0] = oneD.pts[i] ;
+      pts[n][1] = oneD.pts[j] ;
+      pts[n][2] = oneD.pts[k] ;
+      wts[n] = oneD.wts[i] * oneD.wts[j] * oneD.wts[k] ;
+    }}}
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< typename Scalar >
+struct ShapeFunctionEvaluation {
+
+  static const unsigned FunctionCount = 8 ;
+  static const unsigned SpatialDimension = 3 ;
+  static const unsigned IntegrationOrder = 2 ;
+
+  typedef TensorIntegration< Scalar , SpatialDimension , IntegrationOrder > 
+    TensorIntegrationType ;
+
+  static const unsigned PointCount = TensorIntegrationType::N ;
+
+  Scalar value   [ PointCount ][ FunctionCount ] ;
+  Scalar gradient[ PointCount ][ FunctionCount * SpatialDimension ];
+  Scalar weight  [ PointCount ];
+
+  ShapeFunctionEvaluation()
+  {
+    const TensorIntegration< Scalar , SpatialDimension , IntegrationOrder > 
+      integration ;
+
+    const Scalar ONE8TH = 0.125 ;
+
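+    // Evaluate the eight trilinear shape functions
+    // N = 1/8 * (1 +/- xi) * (1 +/- eta) * (1 +/- zeta)
+    // and their parent-coordinate gradients at each integration point.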
+    for ( unsigned i = 0 ; i < PointCount ; ++i ) {
+
+      const Scalar u = 1.0 - integration.pts[i][0];
+      const Scalar v = 1.0 - integration.pts[i][1];
+      const Scalar w = 1.0 - integration.pts[i][2];
+
+      const Scalar up1 = 1.0 + integration.pts[i][0];
+      const Scalar vp1 = 1.0 + integration.pts[i][1];
+      const Scalar wp1 = 1.0 + integration.pts[i][2];
+
+      weight[i] = integration.wts[i] ;
+
+      // Values:
+      value[i][0] = ONE8TH *   u *   v *  w ;
+      value[i][1] = ONE8TH * up1 *   v *  w ;
+      value[i][2] = ONE8TH * up1 * vp1 *  w ;
+      value[i][3] = ONE8TH *   u * vp1 *  w ;
+
+      value[i][4] = ONE8TH *   u *   v *  wp1 ;
+      value[i][5] = ONE8TH * up1 *   v *  wp1 ;
+      value[i][6] = ONE8TH * up1 * vp1 *  wp1 ;
+      value[i][7] = ONE8TH *   u * vp1 *  wp1 ;
+
+      //fn 0 = u * v * w
+      gradient[i][ 0] = ONE8TH * -1  *  v  *  w  ;
+      gradient[i][ 1] = ONE8TH *  u  * -1  *  w  ;
+      gradient[i][ 2] = ONE8TH *  u  *  v  * -1  ;
+
+      //fn 1 = up1 * v * w
+      gradient[i][ 3] = ONE8TH *  1  *  v  *  w  ;
+      gradient[i][ 4] = ONE8TH * up1 * -1  *  w  ;
+      gradient[i][ 5] = ONE8TH * up1 *  v  * -1  ;
+
+      //fn 2 = up1 * vp1 * w
+      gradient[i][ 6] = ONE8TH *  1  * vp1 *  w ;
+      gradient[i][ 7] = ONE8TH * up1 *  1  *  w ;
+      gradient[i][ 8] = ONE8TH * up1 * vp1 * -1 ;
+
+      //fn 3 = u * vp1 * w
+      gradient[i][ 9] = ONE8TH * -1 * vp1 *  w ;
+      gradient[i][10] = ONE8TH *  u *  1  *  w ;
+      gradient[i][11] = ONE8TH *  u * vp1 * -1 ;
+
+      //fn 4 = u * v * wp1
+      gradient[i][12] = ONE8TH * -1  *  v  * wp1 ;
+      gradient[i][13] = ONE8TH *  u  * -1  * wp1 ;
+      gradient[i][14] = ONE8TH *  u  *  v  *  1  ;
+
+      //fn 5 = up1 * v * wp1
+      gradient[i][15] = ONE8TH *  1  *  v  * wp1 ;
+      gradient[i][16] = ONE8TH * up1 * -1  * wp1 ;
+      gradient[i][17] = ONE8TH * up1 *  v  *  1  ;
+
+      //fn 6 = up1 * vp1 * wp1
+      gradient[i][18] = ONE8TH *  1  * vp1 * wp1 ;
+      gradient[i][19] = ONE8TH * up1 *  1  * wp1 ;
+      gradient[i][20] = ONE8TH * up1 * vp1 *  1 ;
+
+      //fn 7 = u * vp1 * wp1
+      gradient[i][21] = ONE8TH * -1 * vp1 * wp1 ;
+      gradient[i][22] = ONE8TH *  u *  1  * wp1 ;
+      gradient[i][23] = ONE8TH *  u * vp1 *  1 ;
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< typename ScalarType , typename ScalarCoordType , class DeviceType >
+struct ElementComputation
+{
+  typedef DeviceType     execution_space;
+  typedef ScalarType              scalar_type ;
+  typedef typename execution_space::size_type  size_type ;
+
+  static const size_type ElementNodeCount = 8 ;
+
+  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
+  typedef Kokkos::View< scalar_type[][ElementNodeCount][ElementNodeCount] , execution_space > elem_matrices_type ;
+  typedef Kokkos::View< scalar_type[][ElementNodeCount] , execution_space > elem_vectors_type ;
+
+  typedef ShapeFunctionEvaluation< scalar_type > shape_function_data ;
+
+  static const unsigned SpatialDim    = shape_function_data::SpatialDimension ;
+  static const unsigned FunctionCount = shape_function_data::FunctionCount ;
+
+private:
+
+  const shape_function_data               shape_eval ;
+  typename mesh_type::elem_node_ids_type  elem_node_ids ;
+  typename mesh_type::node_coords_type    node_coords ;
+  elem_matrices_type                      element_matrices ;
+  elem_vectors_type                       element_vectors ;
+  scalar_type                             coeff_K ;
+  scalar_type                             coeff_Q ;
+
+  ElementComputation( const mesh_type   & arg_mesh ,
+                      const elem_matrices_type  & arg_element_matrices , 
+                      const elem_vectors_type   & arg_element_vectors ,
+                      const scalar_type   arg_coeff_K ,
+                      const scalar_type   arg_coeff_Q )
+  : shape_eval()
+  , elem_node_ids( arg_mesh.elem_node_ids )
+  , node_coords(   arg_mesh.node_coords )
+  , element_matrices( arg_element_matrices )
+  , element_vectors( arg_element_vectors )
+  , coeff_K( arg_coeff_K )
+  , coeff_Q( arg_coeff_Q )
+  {}
+
+public:
+
+  static void apply( const mesh_type  & mesh ,
+                     const elem_matrices_type & elem_matrices ,
+                     const elem_vectors_type  & elem_vectors ,
+                     const scalar_type  elem_coeff_K ,
+                     const scalar_type  elem_coeff_Q )
+  {
+    ElementComputation comp( mesh , elem_matrices , elem_vectors , elem_coeff_K , elem_coeff_Q );
+    const size_t elem_count = mesh.elem_node_ids.dimension_0();
+
+    parallel_for( elem_count , comp );
+  }
+
+  //------------------------------------
+
+  static const unsigned FLOPS_jacobian =
+    FunctionCount * SpatialDim * SpatialDim * 2 ;
+
+  KOKKOS_INLINE_FUNCTION
+  void jacobian( const ScalarCoordType * x, 
+                 const ScalarCoordType * y, 
+                 const ScalarCoordType * z, 
+                 const scalar_type * grad_vals, 
+                 scalar_type * J) const
+  {
+    int i_grad = 0 ;
+
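+    // Accumulate the Jacobian of the isoparametric map:
+    // J[3*r+c] = sum over nodes i of dN_i/dxi_r * coordinate_c of node i.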
+    for( unsigned i = 0; i < ElementNodeCount ; ++i , i_grad += SpatialDim ) {
+      const scalar_type g0 = grad_vals[ i_grad ];
+      const scalar_type g1 = grad_vals[ i_grad + 1 ];
+      const scalar_type g2 = grad_vals[ i_grad + 2 ];
+      const scalar_type x0 = x[i] ;
+      const scalar_type x1 = y[i] ;
+      const scalar_type x2 = z[i] ;
+
+      J[0] += g0 * x0 ;
+      J[1] += g0 * x1 ;
+      J[2] += g0 * x2 ;
+
+      J[3] += g1 * x0 ;
+      J[4] += g1 * x1 ;
+      J[5] += g1 * x2 ;
+
+      J[6] += g2 * x0 ;
+      J[7] += g2 * x1 ;
+      J[8] += g2 * x2 ;
+    }
+  }
+
+  //------------------------------------
+
+  static const unsigned FLOPS_inverse_and_det = 46 ;
+
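+  // Replace J with its inverse (computed via cofactors) and return the determinant.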
+  KOKKOS_INLINE_FUNCTION
+  scalar_type inverse_and_determinant3x3( scalar_type * const J ) const
+  {
+    const scalar_type J00 = J[0];
+    const scalar_type J01 = J[1];
+    const scalar_type J02 = J[2];
+
+    const scalar_type J10 = J[3];
+    const scalar_type J11 = J[4];
+    const scalar_type J12 = J[5];
+
+    const scalar_type J20 = J[6];
+    const scalar_type J21 = J[7];
+    const scalar_type J22 = J[8];
+
+    const scalar_type term0 = J22*J11 - J21*J12;
+    const scalar_type term1 = J22*J01 - J21*J02;
+    const scalar_type term2 = J12*J01 - J11*J02;
+
+    const scalar_type detJ = J00*term0 - J10*term1 + J20*term2;
+    const scalar_type inv_detJ = 1.0/detJ;
+
+    J[0] =  term0*inv_detJ;
+    J[1] = -term1*inv_detJ;
+    J[2] =  term2*inv_detJ;
+
+    J[3] = -(J22*J10 - J20*J12)*inv_detJ;
+    J[4] =  (J22*J00 - J20*J02)*inv_detJ;
+    J[5] = -(J12*J00 - J10*J02)*inv_detJ;
+
+    J[6] =  (J21*J10 - J20*J11)*inv_detJ;
+    J[7] = -(J21*J00 - J20*J01)*inv_detJ;
+    J[8] =  (J11*J00 - J10*J01)*inv_detJ;
+
+    return detJ ;
+  }
+
+  //------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  void matTransMat3x3_X_3xn( const scalar_type * A, int n,
+                             const scalar_type * B,
+                             scalar_type * C ) const
+  {
+    //A is 3x3, B is 3xn. So C is also 3xn.
+    //A,B,C are all assumed to be ordered such that columns are contiguous.
+
+    scalar_type * Cj = C;
+    const scalar_type * Bj = B;
+
+    for(int j=0; j<n; ++j) {
+      Cj[0] = A[0]*Bj[0] + A[1]*Bj[1] + A[2]*Bj[2];
+      Cj[1] = A[3]*Bj[0] + A[4]*Bj[1] + A[5]*Bj[2];
+      Cj[2] = A[6]*Bj[0] + A[7]*Bj[1] + A[8]*Bj[2];
+      Bj += 3;
+      Cj += 3;
+    }
+
+  }
+  //------------------------------------
+
+  static const unsigned FLOPS_contributeDiffusionMatrix = FunctionCount * ( 3 * 5 + FunctionCount * 7 ) ;
+
+  KOKKOS_INLINE_FUNCTION
+  void contributeDiffusionMatrix(
+    const scalar_type weight ,
+    const scalar_type grad_vals[] ,
+    const scalar_type invJ[] ,
+    scalar_type elem_mat[][8] ) const
+  {
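+    // Map parent-space gradients to physical space with the inverse Jacobian,
+    // then accumulate weight * grad(psi_m) . grad(psi_n) into the element matrix.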
+    scalar_type dpsidx[8], dpsidy[8], dpsidz[8];
+
+    int i_grad = 0 ;
+    for( unsigned i = 0; i < FunctionCount ; ++i , i_grad += 3 ) {
+      const scalar_type g0 = grad_vals[i_grad+0];
+      const scalar_type g1 = grad_vals[i_grad+1];
+      const scalar_type g2 = grad_vals[i_grad+2];
+
+      dpsidx[i] = g0 * invJ[0] + g1 * invJ[1] + g2 * invJ[2];
+      dpsidy[i] = g0 * invJ[3] + g1 * invJ[4] + g2 * invJ[5];
+      dpsidz[i] = g0 * invJ[6] + g1 * invJ[7] + g2 * invJ[8];
+    }
+
+    for( unsigned m = 0; m < FunctionCount; m++) {
+      for( unsigned n = 0; n < FunctionCount; n++) {
+
+        elem_mat[m][n] += weight * 
+          ((dpsidx[m] * dpsidx[n]) + 
+           (dpsidy[m] * dpsidy[n]) +
+           (dpsidz[m] * dpsidz[n]));            
+      }
+    }
+  }
+
+  //------------------------------------
+
+  static const unsigned FLOPS_contributeSourceVector = FunctionCount * 2 ;
+
+  KOKKOS_INLINE_FUNCTION
+  void contributeSourceVector( const scalar_type term ,
+                               const scalar_type psi[] ,
+                               scalar_type elem_vec[] ) const
+  {
+     for( unsigned i=0; i< FunctionCount ; ++i) {
+       elem_vec[i] += psi[i] * term ;
+     }
+  }
+
+
+  static const unsigned FLOPS_operator =
+           shape_function_data::PointCount * ( 3
+             + FLOPS_jacobian
+             + FLOPS_inverse_and_det
+             + FLOPS_contributeDiffusionMatrix
+             + FLOPS_contributeSourceVector ) ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ielem )const {
+
+    scalar_type elem_vec[8] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
+    scalar_type elem_mat[8][8] =
+      { { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
+        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
+        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
+        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
+        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
+        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
+        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
+        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } };
+
+    ScalarCoordType x[8], y[8], z[8];
+
+    for ( int i = 0 ; i < 8 ; ++i ) {
+      const int node_index = elem_node_ids( ielem , i );
+      x[i] = node_coords( node_index , 0 );
+      y[i] = node_coords( node_index , 1 );
+      z[i] = node_coords( node_index , 2 );
+    }
+
+    // This loop could be parallelized; however,
+    // it would require additional per-thread temporaries
+    // of 'elem_vec' and 'elem_mat' which would
+    // consume more local memory and have to be reduced.
+
+    for ( unsigned i = 0 ; i < shape_function_data::PointCount ; ++i ) {
+
+      scalar_type J[SpatialDim*SpatialDim] = { 0, 0, 0,  0, 0, 0,  0, 0, 0 };
+
+      jacobian( x, y, z, shape_eval.gradient[i] , J );
+
+      // Overwrite J with its inverse to save scratch memory space.
+      const scalar_type detJ_w   = shape_eval.weight[i] * inverse_and_determinant3x3(J);
+      const scalar_type k_detJ_w = coeff_K * detJ_w ;
+      const scalar_type Q_detJ_w = coeff_Q * detJ_w ;
+
+      contributeDiffusionMatrix( k_detJ_w , shape_eval.gradient[i] , J , elem_mat );
+
+      contributeSourceVector( Q_detJ_w , shape_eval.value[i] , elem_vec );
+    }
+
+    for( size_type i=0; i< ElementNodeCount ; ++i) {
+      element_vectors(ielem, i) = elem_vec[i] ;
+    }
+
+    for( size_type i = 0; i < ElementNodeCount ; i++){
+      for( size_type j = 0; j < ElementNodeCount ; j++){
+        element_matrices(ielem, i, j) = elem_mat[i][j] ;
+      }
+    }
+  }
+}; /* ElementComputation */
+
+//----------------------------------------------------------------------------
+
+template< typename ScalarType , typename ScalarCoordType , class DeviceType >
+struct DirichletBoundary
+{
+  typedef DeviceType     execution_space;
+  typedef typename execution_space::size_type  size_type ;
+
+  static const size_type ElementNodeCount = 8 ;
+
+  typedef Kokkos::CrsMatrix< ScalarType , execution_space >    matrix_type ;
+  typedef Kokkos::View< ScalarType[] , execution_space >  vector_type ;
+
+  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
+
+  typename mesh_type::node_coords_type node_coords ;
+  matrix_type     matrix ;
+  vector_type     rhs ;
+  ScalarCoordType bc_lower_z ;
+  ScalarCoordType bc_upper_z ;
+  ScalarType      bc_lower_value ;
+  ScalarType      bc_upper_value ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type inode ) const
+  {
+    //  Apply a Dirichlet boundary condition to row 'inode'.
+    //  To maintain the symmetry of the original
+    //  global stiffness matrix, zero out the columns
+    //  that correspond to boundary conditions, and
+    //  adjust the load vector accordingly.
+
+    const size_type iBeg = matrix.graph.row_map[inode];
+    const size_type iEnd = matrix.graph.row_map[inode+1];
+
+    const ScalarCoordType z = node_coords(inode,2);
+    const bool bc_lower = z <= bc_lower_z ;
+    const bool bc_upper = bc_upper_z <= z ;
+
+    if ( bc_lower || bc_upper ) {
+      const ScalarType bc_value = bc_lower ? bc_lower_value
+                                           : bc_upper_value ;
+
+      rhs(inode) = bc_value ; //  set the rhs vector
+
+      //  zero each value on the row, and leave a one
+      //  on the diagonal
+
+      for( size_type i = iBeg ; i < iEnd ; i++) {
+        matrix.coefficients(i) =
+          (int) inode == matrix.graph.entries(i) ? 1 : 0 ;
+      }
+    }
+    else {
+      //  Find any columns that are boundary conditions.
+      //  Clear them and adjust the load vector
+
+      for( size_type i = iBeg ; i < iEnd ; i++ ) {
+        const size_type cnode = matrix.graph.entries(i) ;
+
+        const ScalarCoordType zc = node_coords(cnode,2);
+        const bool c_bc_lower = zc <= bc_lower_z ;
+        const bool c_bc_upper = bc_upper_z <= zc ;
+
+        if ( c_bc_lower || c_bc_upper ) {
+
+          const ScalarType c_bc_value = c_bc_lower ? bc_lower_value
+                                                   : bc_upper_value ;
+
+          rhs( inode ) -= c_bc_value * matrix.coefficients(i);
+
+          matrix.coefficients(i) = 0 ;
+        }
+      }
+    }
+  }
+
+
+  static void apply( const matrix_type & linsys_matrix ,
+                     const vector_type & linsys_rhs ,
+                     const mesh_type   & mesh ,
+                     const ScalarCoordType  bc_lower_z ,
+                     const ScalarCoordType  bc_upper_z ,
+                     const ScalarType       bc_lower_value ,
+                     const ScalarType       bc_upper_value )
+  {
+    const size_t row_count = linsys_matrix.graph.row_map.dimension_0() - 1 ;
+    DirichletBoundary op ;
+    op.node_coords    = mesh.node_coords ;
+    op.matrix         = linsys_matrix ;
+    op.rhs            = linsys_rhs ;
+    op.bc_lower_z     = bc_lower_z ;
+    op.bc_upper_z     = bc_upper_z ;
+    op.bc_lower_value = bc_lower_value ;
+    op.bc_upper_value = bc_upper_value ;
+    parallel_for( row_count , op );
+  }
+};
+
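+// A minimal usage sketch (illustrative only; the template arguments and the
+// matrix, rhs, mesh, and boundary values are hypothetical caller-side names):
+//
+//   DirichletBoundary< Scalar , ScalarCoord , Device >
+//     ::apply( linsys_matrix , linsys_rhs , mesh ,
+//              /* bc_lower_z */ 0.0 , /* bc_upper_z */ zmax ,
+//              /* bc_lower_value */ T0 , /* bc_upper_value */ TL );
+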
+//----------------------------------------------------------------------------
+
+} /* namespace Implicit */
+} /* namespace HybridFEM */
+
diff --git a/packages/kokkos/example/multi_fem/LinAlgBLAS.hpp b/packages/kokkos/example/multi_fem/LinAlgBLAS.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..fa03e200800fdf3eceb5147aab55d6a1d5b8a17b
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/LinAlgBLAS.hpp
@@ -0,0 +1,567 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef USESCASES_LINALG_BLAS_HPP
+#define USESCASES_LINALG_BLAS_HPP
+
+#include <cmath>
+#include <utility>
+#include <ParallelComm.hpp>
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class Scalar , class Layout , class DeviceType > struct Dot ;
+
+template< class Scalar , class Layout , class DeviceType > struct Dot1 ;
+
+template< typename ScalarA ,
+          typename ScalarY ,
+          class Layout , class Device >
+struct Scale ;
+
+template< typename ScalarA ,
+          typename ScalarY ,
+          class Layout , class Device >
+struct Fill ;
+
+template< typename ScalarA ,
+          typename ScalarX ,
+          typename ScalarY ,
+          class Layout , class Device >
+struct AXPY ;
+
+template< typename ScalarX ,
+          typename ScalarB ,
+          typename ScalarY ,
+          class Layout , class Device >
+struct XPBY ;
+
+template< typename ScalarA ,
+          typename ScalarX ,
+          typename ScalarB ,
+          typename ScalarY ,
+          typename ScalarW ,
+          class Layout , class Device >
+struct WAXPBY ;
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_MPI )
+
+template< typename ScalarX /* Allow mix of const and non-const */ ,
+          typename ScalarY /* Allow mix of const and non-const */ ,
+          class L , class D ,
+          class MX /* Allow any management type */ ,
+          class MY /* Allow any management type */ >
+inline
+double dot( const size_t n ,
+            const View< ScalarX * , L , D , MX > & x ,
+            const View< ScalarY * , L , D , MY > & y ,
+            comm::Machine machine )
+{
+  double global_result = 0 ;
+  double local_result = 0 ;
+
+  Impl::Dot< ScalarX , L , D >( n , x , y , local_result );
+
+  MPI_Allreduce( & local_result , & global_result , 1 ,
+                 MPI_DOUBLE , MPI_SUM , machine.mpi_comm );
+
+  return global_result ;
+}
+
+#else
+
+template< typename ScalarX /* Allow mix of const and non-const */ ,
+          typename ScalarY /* Allow mix of const and non-const */ ,
+          class L , class D ,
+          class MX /* Allow any management type */ ,
+          class MY /* Allow any management type */ >
+inline
+double dot( const size_t n ,
+            const View< ScalarX * , L , D , MX > & x ,
+            const View< ScalarY * , L , D , MY > & y ,
+            comm::Machine )
+{
+  double global_result = 0 ;
+
+  Impl::Dot< ScalarX , L , D >( n , x , y , global_result );
+
+  return global_result ;
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_MPI )
+
+template< typename ScalarX /* Allow mix of const and non-const */ ,
+          class L , class D ,
+          class MX /* Allow any management type */ >
+inline
+double dot( const size_t n ,
+            const View< ScalarX * , L , D , MX > & x ,
+            comm::Machine machine )
+{
+  double global_result = 0 ;
+  double local_result = 0 ;
+
+  Impl::Dot1< ScalarX , L , D >( n , x , local_result );
+
+  MPI_Allreduce( & local_result , & global_result , 1 ,
+                 MPI_DOUBLE , MPI_SUM , machine.mpi_comm );
+
+  return global_result ;
+}
+
+#else
+
+template< typename ScalarX /* Allow mix of const and non-const */ ,
+          class L , class D ,
+          class MX /* Allow any management type */ >
+inline
+double dot( const size_t n ,
+            const View< ScalarX * , L , D , MX > & x ,
+            comm::Machine )
+{
+  double global_result = 0 ;
+
+  Impl::Dot1< ScalarX , L , D >( n , x , global_result );
+
+  return global_result ;
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+template< typename ScalarX /* Allow mix of const and non-const */ ,
+          class L , class D ,
+          class MX /* Allow any management type */ >
+inline
+double norm2( const size_t n ,
+              const View< ScalarX * , L , D , MX > & x ,
+              comm::Machine machine )
+{
+  return std::sqrt( dot( n , x , machine ) );
+}
+
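+// Usage sketch (illustrative; 'n', 'x', 'y', and 'machine' are hypothetical
+// caller-side names, e.g. device Views of length n and a comm::Machine handle):
+//
+//   const double xy    = Kokkos::dot( n , x , y , machine );   // global sum of x(i) * y(i)
+//   const double xnorm = Kokkos::norm2( n , x , machine );     // sqrt of the global sum of x(i) * x(i)
+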
+//----------------------------------------------------------------------------
+
+template< typename ScalarA ,
+          typename ScalarX ,
+          class L ,
+          class D ,
+          class MX >
+void scale( const size_t n ,
+            const ScalarA & alpha ,
+            const View< ScalarX * , L , D , MX > & x )
+{
+  Impl::Scale< ScalarA , ScalarX , L , D >( n , alpha , x );
+}
+
+template< typename ScalarA ,
+          typename ScalarX ,
+          class L ,
+          class D ,
+          class MX >
+void fill( const size_t n ,
+           const ScalarA & alpha ,
+           const View< ScalarX * , L , D , MX > & x )
+{
+  Impl::Fill< ScalarA , ScalarX , L , D >( n , alpha , x );
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ScalarA ,
+          typename ScalarX ,
+          typename ScalarY ,
+          class L ,
+          class D ,
+          class MX ,
+          class MY >
+void axpy( const size_t n ,
+           const ScalarA & alpha ,
+           const View< ScalarX *, L , D , MX > & x ,
+           const View< ScalarY *, L , D , MY > & y )
+{
+  Impl::AXPY< ScalarA, ScalarX, ScalarY , L , D >( n, alpha, x, y );
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ScalarX ,
+          typename ScalarB ,
+          typename ScalarY ,
+          class L ,
+          class D ,
+          class MX ,
+          class MY >
+void xpby( const size_t n ,
+           const View< ScalarX *, L , D , MX > & x ,
+           const ScalarB & beta ,
+           const View< ScalarY *, L , D , MY > & y )
+{
+  Impl::XPBY< ScalarX, ScalarB, ScalarY , L , D >( n, x, beta, y );
+}
+
+//----------------------------------------------------------------------------
+// w = alpha * x + beta * y
+
+template< typename ScalarA ,
+          typename ScalarX ,
+          typename ScalarB ,
+          typename ScalarY ,
+          typename ScalarW ,
+          class L , class D ,
+          class MX , class MY , class MW >
+void waxpby( const size_t n ,
+             const ScalarA & alpha ,
+             const View< ScalarX * , L , D , MX > & x ,
+             const ScalarB & beta ,
+             const View< ScalarY * , L , D , MY > & y ,
+             const View< ScalarW * , L , D , MW > & w )
+{
+  Impl::WAXPBY<ScalarA,ScalarX,ScalarB,ScalarY,ScalarW,L,D>
+    ( n , alpha , x , beta , y , w );
+}
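+
+// Sketch combining these BLAS-1 helpers (illustrative; 'n', 'x', 'y', and 'w'
+// are hypothetical device Views of length n allocated by the caller):
+//
+//   Kokkos::fill(   n , 0.0 , w );                // w(i)  = 0
+//   Kokkos::axpy(   n , 2.0 , x , y );            // y(i) += 2.0 * x(i)
+//   Kokkos::xpby(   n , x , 0.5 , y );            // y(i)  = x(i) + 0.5 * y(i)
+//   Kokkos::waxpby( n , 2.0 , x , 0.5 , y , w );  // w(i)  = 2.0 * x(i) + 0.5 * y(i)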
+
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< typename Scalar , class L , class D >
+struct Dot
+{
+private:
+
+  typedef View< const Scalar*, L, D, MemoryUnmanaged >  vector_const_type ;
+
+  const vector_const_type x ;
+  const vector_const_type y ;
+
+public:
+
+  typedef typename vector_const_type::execution_space  execution_space ; // Manycore device
+  typedef double      value_type ;  // Reduction value
+
+  template< class ArgX , class ArgY >
+  inline
+  Dot( const size_t n , const ArgX & arg_x , const ArgY & arg_y , double & result )
+    : x( arg_x ), y( arg_y )
+  {
+    parallel_reduce( n , *this , result );
+  }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const iType & i , value_type & update ) const
+  { update += x(i) * y(i); }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & source )
+  { update += source;    }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  { update = 0 ; }
+}; // Dot
+
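+// Note: parallel_reduce drives the functor-style reduction interface above:
+// 'init' zeroes each thread's partial result, 'operator()' accumulates
+// x(i) * y(i) into it, and 'join' merges partial results into the final sum.
+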
+//----------------------------------------------------------------------------
+
+template< typename Scalar , class L , class D >
+struct Dot1
+{
+private:
+
+  typedef View< const Scalar*, L, D , MemoryUnmanaged >  vector_const_type ;
+
+  const vector_const_type x ;
+
+public:
+
+  typedef typename vector_const_type::execution_space  execution_space ; // Manycore device
+  typedef double      value_type ;  // Reduction value
+
+  template< class ArgX >
+  inline
+  Dot1( const size_t n , const ArgX & arg_x , double & result )
+    : x( arg_x )
+  {
+    parallel_reduce( n , *this , result );
+  }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const iType & i , value_type & update ) const
+  { update += x(i) * x(i) ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & source )
+  { update += source ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  { update = 0 ; }
+}; // Dot1
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template < typename ScalarA ,
+           typename ScalarX ,
+           typename ScalarB ,
+           typename ScalarY ,
+           typename ScalarW ,
+           class L , class D >
+struct WAXPBY
+{
+private:
+
+  typedef View<       ScalarW *, L , D , MemoryUnmanaged > ViewW ;
+  typedef View< const ScalarX *, L , D , MemoryUnmanaged > ViewX ;
+  typedef View< const ScalarY *, L , D , MemoryUnmanaged > ViewY ;
+
+  const ViewW    w ;
+  const ViewX    x ;
+  const ViewY    y ;
+  const ScalarA  alpha ;
+  const ScalarB  beta ;
+
+public:
+
+  typedef typename ViewW::execution_space  execution_space ;
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const iType inode ) const
+  {
+    w(inode) = alpha * x(inode) + beta * y(inode);
+  }
+
+  template< class ArgX , class ArgY , class ArgW >
+  inline
+  WAXPBY( const size_t  n ,
+          const ScalarA & arg_alpha ,
+          const ArgX    & arg_x ,
+          const ScalarB & arg_beta ,
+          const ArgY    & arg_y ,
+          const ArgW    & arg_w )
+    : w( arg_w ), x( arg_x ), y( arg_y )
+    , alpha( arg_alpha ), beta( arg_beta )
+  {
+    parallel_for( n , *this );
+  }
+}; // WAXPBY
+
+//----------------------------------------------------------------------------
+
+template < typename ScalarB ,
+           typename ScalarW ,
+           class L , class D >
+struct Scale
+{
+private:
+
+  typedef View< ScalarW *, L , D , MemoryUnmanaged >  ViewW ;
+  const ViewW    w ;
+  const ScalarB  beta ;
+
+public:
+
+  typedef typename ViewW::execution_space  execution_space ;
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const iType & i ) const
+  { w(i) *= beta ; }
+
+  template< class ArgW >
+  inline
+  Scale( const size_t n , const ScalarB & arg_beta , const ArgW & arg_w )
+    : w( arg_w )
+    , beta( arg_beta )
+  {
+    parallel_for( n , *this );
+  }
+};
+
+template < typename ScalarB ,
+           typename ScalarW ,
+           class L , class D >
+struct Fill
+{
+private:
+
+  typedef View< ScalarW *, L , D , MemoryUnmanaged >  ViewW ;
+  const ViewW    w ;
+  const ScalarB  beta ;
+
+public:
+
+  typedef typename ViewW::execution_space  execution_space ;
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const iType & i ) const
+  { w(i) = beta ; }
+
+  template< class ArgW >
+  inline
+  Fill( const size_t n , const ScalarB & arg_beta , const ArgW & arg_w )
+    : w( arg_w )
+    , beta( arg_beta )
+  {
+    parallel_for( n , *this );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template < typename ScalarA ,
+           typename ScalarX ,
+           typename ScalarW ,
+           class L , class D >
+struct AXPY
+{
+private:
+
+  typedef View<       ScalarW *, L , D , MemoryUnmanaged >  ViewW ;
+  typedef View< const ScalarX *, L , D , MemoryUnmanaged >  ViewX ;
+
+  const ViewW    w ;
+  const ViewX    x ;
+  const ScalarA  alpha ;
+
+public:
+
+  typedef typename ViewW::execution_space  execution_space ;
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const iType & i ) const
+  { w(i) += alpha * x(i); }
+
+  template< class ArgX , class ArgW >
+  inline
+  AXPY( const size_t  n ,
+        const ScalarA & arg_alpha ,
+        const ArgX    & arg_x ,
+        const ArgW    & arg_w )
+    : w( arg_w ), x( arg_x )
+    , alpha( arg_alpha )
+  {
+    parallel_for( n , *this );
+  }
+}; // AXPY
+
+template< typename ScalarX ,
+          typename ScalarB ,
+          typename ScalarW ,
+          class L , class D >
+struct XPBY
+{
+private:
+
+  typedef View<       ScalarW *, L , D , MemoryUnmanaged >  ViewW ;
+  typedef View< const ScalarX *, L , D , MemoryUnmanaged >  ViewX ;
+
+  const ViewW    w ;
+  const ViewX    x ;
+  const ScalarB  beta ;
+
+public:
+
+  typedef typename ViewW::execution_space  execution_space ;
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const iType & i ) const
+  { w(i) = x(i) + beta * w(i); }
+
+  template< class ArgX , class ArgW >
+  inline
+  XPBY( const size_t  n ,
+        const ArgX    & arg_x ,
+        const ScalarB & arg_beta ,
+        const ArgW    & arg_w )
+    : w( arg_w ), x( arg_x )
+    , beta( arg_beta )
+  {
+    parallel_for( n , *this );
+  }
+}; // XPBY
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef USESCASES_LINALG_BLAS_HPP */
+
+
diff --git a/packages/kokkos/example/multi_fem/Makefile b/packages/kokkos/example/multi_fem/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..4b114b56255f152206adee8dbc8979ae9015050f
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/Makefile
@@ -0,0 +1,49 @@
+KOKKOS_PATH ?= ../..
+
+MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+SRC_DIR := $(dir $(MAKEFILE_PATH))
+
+SRC = $(wildcard $(SRC_DIR)/*.cpp)
+OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
+
+#SRC = $(wildcard *.cpp)
+#OBJ = $(SRC:%.cpp=%.o)
+
+default: build
+	echo "Start Build"
+
+CXXFLAGS = -O3 -I$(SRC_DIR)
+LDFLAGS ?=
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
+  EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
+  CXXFLAGS += -I$(SRC_DIR) -I$(CUDA_PATH) -O3
+  LDFLAGS += -L$(CUDA_PATH)/lib64 -lcusparse
+else
+  CXX = g++
+  EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
+endif
+
+LINK ?= $(CXX)
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+DEPFLAGS = -M
+
+LIB =
+
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: 
+	rm -f *.a *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
diff --git a/packages/kokkos/example/multi_fem/Nonlinear.hpp b/packages/kokkos/example/multi_fem/Nonlinear.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7e91529e1c53b92e912a0a6597aa784952ecd8ba
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/Nonlinear.hpp
@@ -0,0 +1,573 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef HYBRIDFEM_NONLINEAR_HPP
+#define HYBRIDFEM_NONLINEAR_HPP
+
+#include <utility>
+#include <iostream>
+#include <iomanip>
+
+#include <Kokkos_Core.hpp>
+#include <SparseLinearSystem.hpp>
+#include <SparseLinearSystemFill.hpp>
+#include <NonlinearFunctors.hpp>
+
+#include <FEMesh.hpp>
+#include <HexElement.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace HybridFEM {
+namespace Nonlinear {
+
+struct PerformanceData {
+  double mesh_time ;
+  double graph_time ;
+  double elem_time ;
+  double matrix_gather_fill_time ;
+  double matrix_boundary_condition_time ;
+  double cg_iteration_time ;
+  size_t cg_iteration_count ;
+  size_t newton_iteration_count ;
+  double error_max ;
+
+  PerformanceData()
+    : mesh_time(0)
+    , graph_time(0)
+    , elem_time(0)
+    , matrix_gather_fill_time(0)
+    , matrix_boundary_condition_time(0)
+    , cg_iteration_time(0)
+    , cg_iteration_count(0)
+    , newton_iteration_count(0)
+    , error_max(0)
+    {}
+
+  void best( const PerformanceData & rhs )
+  {
+    mesh_time = std::min( mesh_time , rhs.mesh_time );
+    graph_time = std::min( graph_time , rhs.graph_time );
+    elem_time = std::min( elem_time , rhs.elem_time );
+    matrix_gather_fill_time = std::min( matrix_gather_fill_time , rhs.matrix_gather_fill_time );
+    matrix_boundary_condition_time = std::min( matrix_boundary_condition_time , rhs.matrix_boundary_condition_time );
+    cg_iteration_time = std::min( cg_iteration_time , rhs.cg_iteration_time );
+    cg_iteration_count = std::min( cg_iteration_count , rhs.cg_iteration_count );
+    newton_iteration_count = std::min( newton_iteration_count , rhs.newton_iteration_count );
+    error_max = std::min( error_max , rhs.error_max );
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+class ManufacturedSolution {
+public:
+
+  // Manufactured solution for a one-dimensional nonlinear PDE
+  //
+  //  -K T_zz + T^2 = 0 ; T(zmin) = T_zmin ; T(zmax) = T_zmax
+  //
+  //  Has an analytic solution of the form:
+  //
+  //    T(z) = ( a ( z - zmin ) + b )^(-2) where K = 1 / ( 6 a^2 )
+  //
+  //  Given T(zmin) and T(zmax), compute K for this analytic solution.
+  //
+  //  Two analytic solutions:
+  //
+  //    Solution with singularity:
+  //    , a( ( 1.0 / sqrt(T_zmax) + 1.0 / sqrt(T_zmin) ) / ( zmax - zmin ) )
+  //    , b( -1.0 / sqrt(T_zmin) )
+  //
+  //    Solution without singularity:
+  //    , a( ( 1.0 / sqrt(T_zmax) - 1.0 / sqrt(T_zmin) ) / ( zmax - zmin ) )
+  //    , b( 1.0 / sqrt(T_zmin) )
+
+  const double zmin ;
+  const double zmax ;
+  const double T_zmin ;
+  const double T_zmax ;
+  const double a ;
+  const double b ;
+  const double K ;
+
+  ManufacturedSolution( const double arg_zmin ,
+                        const double arg_zmax ,
+                        const double arg_T_zmin ,
+                        const double arg_T_zmax )
+    : zmin( arg_zmin )
+    , zmax( arg_zmax )
+    , T_zmin( arg_T_zmin )
+    , T_zmax( arg_T_zmax )
+    , a( ( 1.0 / std::sqrt(T_zmax) - 1.0 / std::sqrt(T_zmin) ) / ( zmax - zmin ) )
+    , b( 1.0 / std::sqrt(T_zmin) )
+    , K( 1.0 / ( 6.0 * a * a ) )
+    {}
+
+  double operator()( const double z ) const
+  {
+    const double tmp = a * ( z - zmin ) + b ;
+    return 1.0 / ( tmp * tmp );
+  }
+};
+
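+// Sketch of how the manufactured solution is used (the values match those
+// chosen in run() below; 'global_max_z' is supplied by the caller):
+//
+//   const ManufacturedSolution exact( /* zmin */ 0 , /* zmax */ global_max_z ,
+//                                     /* T(zmin) */ 1 , /* T(zmax) */ 20 );
+//   const double T_mid = exact( 0.5 * ( exact.zmin + exact.zmax ) );
+//   // exact.K is the diffusion coefficient for which T(z) satisfies
+//   //   -K T_zz + T^2 = 0  with these boundary values.
+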
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template< typename Scalar , class FixtureType >
+PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
+                     const int , // global_max_x ,
+                     const int , // global_max_y ,
+                     const int global_max_z ,
+                     const bool print_error )
+{
+  typedef Scalar                              scalar_type ;
+  typedef FixtureType                         fixture_type ;
+  typedef typename fixture_type::execution_space  execution_space;
+  //typedef typename execution_space::size_type     size_type ; // unused
+
+  typedef typename fixture_type::FEMeshType mesh_type ;
+  typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;
+
+  enum { ElementNodeCount = fixture_type::element_node_count };
+
+  const comm::Machine machine = mesh.parallel_data_map.machine ;
+
+  const size_t element_count = mesh.elem_node_ids.dimension_0();
+
+  //------------------------------------
+  // The amount of nonlinearity is proportional to the ratio
+  // between T(zmax) and T(zmin).  The manufactured solution
+  // requires 0 < T(zmin) and 0 < T(zmax).
+
+  const ManufacturedSolution
+    exact_solution( /* zmin */ 0 ,
+                    /* zmax */ global_max_z ,
+                    /* T(zmin) */ 1 ,
+                    /* T(zmax) */ 20 );
+
+  //-----------------------------------
+  // Convergence criteria and performance data:
+
+  const size_t cg_iteration_limit = 200 ;
+  const double cg_tolerance = 1e-14 ;
+
+  const size_t newton_iteration_limit = 150 ;
+  const double newton_tolerance = 1e-14 ;
+
+  size_t cg_iteration_count_total = 0 ;
+  double cg_iteration_time = 0 ;
+
+  size_t newton_iteration_count = 0 ;
+  double residual_norm_init = 0 ;
+  double residual_norm = 0 ;
+
+  PerformanceData perf_data ;
+
+  //------------------------------------
+  // Sparse linear system types:
+
+  typedef Kokkos::View< scalar_type* , execution_space >     vector_type ;
+  typedef Kokkos::CrsMatrix< scalar_type , execution_space >  matrix_type ;
+  typedef typename matrix_type::graph_type                matrix_graph_type ;
+  typedef typename matrix_type::coefficients_type         matrix_coefficients_type ;
+
+  typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;
+
+  //------------------------------------
+  // Problem setup types:
+
+  typedef ElementComputation < mesh_type , scalar_type > ElementFunctor ;
+  typedef DirichletSolution  < mesh_type , scalar_type > DirichletSolutionFunctor ;
+  typedef DirichletResidual  < mesh_type , scalar_type > DirichletResidualFunctor ;
+
+  typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
+  typedef typename ElementFunctor::elem_vectors_type  elem_vectors_type ;
+
+  typedef GatherFill< matrix_type ,
+                      mesh_type ,
+                      elem_matrices_type ,
+                      elem_vectors_type > GatherFillFunctor ;
+
+  //------------------------------------
+
+  matrix_type jacobian ;
+  vector_type residual ;
+  vector_type delta ;
+  vector_type nodal_solution ;
+
+  typename graph_factory::element_map_type element_map ;
+
+  //------------------------------------
+  // Generate mesh and corresponding sparse matrix graph
+
+  Kokkos::Timer wall_clock ;
+
+  //------------------------------------
+  // Generate sparse matrix graph and element->graph map.
+
+  wall_clock.reset();
+
+  graph_factory::create( mesh , jacobian.graph , element_map );
+
+  execution_space::fence();
+
+  perf_data.graph_time = comm::max( machine , wall_clock.seconds() );
+
+  //------------------------------------
+  // Allocate linear system coefficients and rhs:
+
+  const size_t local_owned_length = jacobian.graph.row_map.dimension_0() - 1 ;
+  const size_t local_total_length = mesh.node_coords.dimension_0();
+
+  jacobian.coefficients =
+    matrix_coefficients_type( "jacobian_coeff" , jacobian.graph.entries.dimension_0() );
+
+  // Nonlinear residual for owned nodes:
+  residual = vector_type( "residual" , local_owned_length );
+
+  // Nonlinear solution for owned and ghosted nodes:
+  nodal_solution = vector_type( "solution" , local_total_length );
+
+  // Nonlinear solution update for owned nodes:
+  delta = vector_type( "delta" , local_owned_length );
+
+  //------------------------------------
+  // Allocation of arrays to fill the linear system
+
+  elem_matrices_type elem_matrices ; // Jacobian matrices
+  elem_vectors_type  elem_vectors ;  // Residual vectors
+
+  if ( element_count ) {
+    elem_matrices = elem_matrices_type( std::string("elem_matrices"), element_count );
+    elem_vectors = elem_vectors_type( std::string("elem_vectors"), element_count );
+  }
+
+  //------------------------------------
+  // For boundary condition set the correct values in the solution vector
+  //   The 'zmin' face is assigned to 'T_zmin'.
+  //   The 'zmax' face is assigned to 'T_zmax'.
+  //   The resulting solution is one dimensional along the 'Z' axis.
+
+  DirichletSolutionFunctor::apply( nodal_solution , mesh ,
+                                   exact_solution.zmin ,
+                                   exact_solution.zmax ,
+                                   exact_solution.T_zmin ,
+                                   exact_solution.T_zmax );
+
+  for(;;) { // Nonlinear loop
+
+#if defined( KOKKOS_ENABLE_MPI )
+
+    { //------------------------------------
+      // Import off-processor nodal solution values
+      // for residual and jacobian computations
+
+      Kokkos::AsyncExchange< typename vector_type::value_type , execution_space ,
+                                  Kokkos::ParallelDataMap >
+        exchange( mesh.parallel_data_map , 1 );
+
+      Kokkos::PackArray< vector_type >
+        ::pack( exchange.buffer() ,
+                mesh.parallel_data_map.count_interior ,
+                mesh.parallel_data_map.count_send ,
+                nodal_solution );
+
+      exchange.setup();
+
+      exchange.send_receive();
+
+      Kokkos::UnpackArray< vector_type >
+        ::unpack( nodal_solution , exchange.buffer() ,
+                  mesh.parallel_data_map.count_owned ,
+                  mesh.parallel_data_map.count_receive );
+    }
+
+#endif
+
+    //------------------------------------
+    // Compute element matrices and vectors:
+
+    wall_clock.reset();
+
+    ElementFunctor( mesh ,
+                    elem_matrices ,
+                    elem_vectors ,
+                    nodal_solution ,
+                    exact_solution.K );
+
+    execution_space::fence();
+    perf_data.elem_time += comm::max( machine , wall_clock.seconds() );
+
+    //------------------------------------
+    // Fill linear system coefficients:
+
+    wall_clock.reset();
+
+    fill( jacobian.coefficients.dimension_0(), 0 , jacobian.coefficients );
+    fill( residual.dimension_0() , 0 , residual );
+
+    GatherFillFunctor::apply( jacobian ,
+                              residual ,
+                              mesh ,
+                              element_map ,
+                              elem_matrices ,
+                              elem_vectors );
+
+    execution_space::fence();
+    perf_data.matrix_gather_fill_time += comm::max( machine , wall_clock.seconds() );
+
+    // Apply boundary conditions:
+
+    wall_clock.reset();
+
+    // For boundary-condition rows, update the Jacobian to 1 on the diagonal
+    // and 0 elsewhere, and set the residual to 0, since the solution vector
+    // already holds the correct boundary value.
+    DirichletResidualFunctor::apply( jacobian, residual, mesh ,
+                                     exact_solution.zmin ,
+                                     exact_solution.zmax );
+
+    execution_space::fence();
+    perf_data.matrix_boundary_condition_time +=
+      comm::max( machine , wall_clock.seconds() );
+
+    //------------------------------------
+    // Has the residual converged?
+
+    residual_norm = norm2( mesh.parallel_data_map.count_owned,
+                           residual,
+                           mesh.parallel_data_map.machine );
+
+    if ( 0 == newton_iteration_count ) {
+      residual_norm_init = residual_norm ;
+    }
+
+    if ( residual_norm / residual_norm_init < newton_tolerance ) {
+      break ;
+    }
+
+    //------------------------------------
+    // Solve linear system
+
+    size_t cg_iteration_count = 0 ;
+    double cg_residual_norm = 0 ;
+
+    cgsolve( mesh.parallel_data_map ,
+             jacobian , residual , delta ,
+             cg_iteration_count ,
+             cg_residual_norm ,
+             cg_iteration_time ,
+             cg_iteration_limit , cg_tolerance ) ;
+
+    perf_data.cg_iteration_time += cg_iteration_time ;
+    cg_iteration_count_total += cg_iteration_count ;
+
+    // Update the nonlinear solution with delta.
+    // The CG solve computes delta = [Jacobian]^{-1} * Residual = -Dx,
+    // i.e. the negative of the Newton update, hence the -1.0 scaling below.
+    // LaTeX:
+    // \vec{x}_{n+1} = \vec{x}_{n} - ( - \Delta \vec{x}_{n} )
+    // text:
+    // x[n+1] = x[n] + Dx
+
+    axpy( mesh.parallel_data_map.count_owned ,
+          -1.0, delta, nodal_solution);
+
+    ++newton_iteration_count ;
+
+    if ( newton_iteration_limit < newton_iteration_count ) {
+      break ;
+    }
+  }
+
+  if ( newton_iteration_count ) {
+    perf_data.elem_time /= newton_iteration_count ;
+    perf_data.matrix_gather_fill_time /= newton_iteration_count ;
+    perf_data.matrix_boundary_condition_time /= newton_iteration_count ;
+  }
+
+  if ( cg_iteration_count_total ) {
+    perf_data.cg_iteration_time /= cg_iteration_count_total ;
+  }
+
+  perf_data.newton_iteration_count = newton_iteration_count ;
+  perf_data.cg_iteration_count = cg_iteration_count_total ;
+
+  //------------------------------------
+
+  {
+    // For extracting the nodal solution and its coordinates:
+
+    typename mesh_type::node_coords_type::HostMirror node_coords_host =
+      Kokkos::create_mirror( mesh.node_coords );
+
+    typename vector_type::HostMirror nodal_solution_host =
+      Kokkos::create_mirror( nodal_solution );
+
+    Kokkos::deep_copy( node_coords_host , mesh.node_coords );
+    Kokkos::deep_copy( nodal_solution_host , nodal_solution );
+
+    double tmp = 0 ;
+
+    for ( size_t i = 0 ; i < mesh.parallel_data_map.count_owned ; ++i ) {
+      const coordinate_scalar_type x = node_coords_host(i,0);
+      const coordinate_scalar_type y = node_coords_host(i,1);
+      const coordinate_scalar_type z = node_coords_host(i,2);
+
+      const double Tx = exact_solution(z);
+      const double Ts = nodal_solution_host(i);
+      const double Te = std::abs( Tx - Ts ) / std::abs( Tx );
+
+      tmp = std::max( tmp , Te );
+
+      if ( print_error && 0.02 < Te ) {
+        std::cout << "  node( " << x << " " << y << " " << z << " ) = "
+                  << Ts << " != exact_solution " << Tx
+                  << std::endl ;
+      }
+    }
+    perf_data.error_max = comm::max( machine , tmp );
+  }
+
+  return perf_data ;
+}
+
+//----------------------------------------------------------------------------
+
+template< typename Scalar , class Device , class FixtureElement >
+void driver( const char * const label ,
+             comm::Machine machine ,
+             const int gang_count ,
+             const int elem_count_beg ,
+             const int elem_count_end ,
+             const int runs )
+{
+  typedef Scalar          scalar_type ;
+  typedef Device          execution_space ;
+  typedef double          coordinate_scalar_type ;
+  typedef FixtureElement  fixture_element_type ;
+
+  typedef BoxMeshFixture< coordinate_scalar_type ,
+                          execution_space ,
+                          fixture_element_type > fixture_type ;
+
+  typedef typename fixture_type::FEMeshType mesh_type ;
+
+  const size_t proc_count = comm::size( machine );
+  const size_t proc_rank  = comm::rank( machine );
+
+  if ( elem_count_beg == 0 || elem_count_end == 0 || runs == 0 ) return ;
+
+  if ( comm::rank( machine ) == 0 ) {
+    std::cout << std::endl ;
+    std::cout << "\"Kokkos::HybridFE::Nonlinear " << label << "\"" << std::endl;
+    std::cout
+      << "\"Size\" ,  \"Size\" ,  \"Graphing\" , \"Element\" ,  \"Fill\" ,     \"Boundary\" , \"CG-Iter\" , \"CG-Iter\" ,      \"Newton-Iter\" , \"Max-node-error\""
+      << std::endl
+      << "\"elems\" , \"nodes\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\" , \"total-count\" , \"total-count\" , \"ratio\""
+      << std::endl ;
+  }
+
+  const bool print_sample = 0 ;
+  const double x_curve = 1.0 ;
+  const double y_curve = 1.0 ;
+  const double z_curve = 0.8 ;
+
+  for(int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
+  {
+    const int ix = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
+    const int iy = 1 + ix ;
+    const int iz = 2 * iy ;
+    const int global_elem_count = ix * iy * iz ;
+    const int global_node_count = ( 2 * ix + 1 ) *
+                                  ( 2 * iy + 1 ) *
+                                  ( 2 * iz + 1 );
+
+    mesh_type mesh =
+      fixture_type::create( proc_count , proc_rank , gang_count ,
+                            ix , iy , iz ,
+                            x_curve , y_curve , z_curve );
+
+    mesh.parallel_data_map.machine = machine ;
+
+
+    PerformanceData perf_data , perf_best ;
+
+    for(int j = 0; j < runs; j++){
+
+      perf_data = run<scalar_type,fixture_type>(mesh,ix,iy,iz, print_sample );
+
+      if( j == 0 ) {
+        perf_best = perf_data ;
+      }
+      else {
+        perf_best.best( perf_data );
+      }
+    }
+
+    if ( comm::rank( machine ) == 0 ) {
+
+      std::cout << std::setw(8) << global_elem_count << " , "
+                << std::setw(8) << global_node_count << " , "
+                << std::setw(10) << perf_best.graph_time * 1000 << " , "
+                << std::setw(10) << perf_best.elem_time * 1000 << " , "
+                << std::setw(10) << perf_best.matrix_gather_fill_time * 1000 << " , "
+                << std::setw(10) << perf_best.matrix_boundary_condition_time * 1000 << " , "
+                << std::setw(10) << perf_best.cg_iteration_time * 1000 << " , "
+                << std::setw(7) << perf_best.cg_iteration_count << " , "
+                << std::setw(3) << perf_best.newton_iteration_count << " , "
+                << std::setw(10) << perf_best.error_max
+                << std::endl ;
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Nonlinear */
+} /* namespace HybridFEM */
+
+
+#endif /* #ifndef HYBRIDFEM_NONLINEAR_HPP */
+
diff --git a/packages/kokkos/example/multi_fem/NonlinearElement_Cuda.hpp b/packages/kokkos/example/multi_fem/NonlinearElement_Cuda.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2f6e94520477f214851664a9114df14e59145c8a
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/NonlinearElement_Cuda.hpp
@@ -0,0 +1,390 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+#include <cstdio>
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <cstdlib>
+#include <cmath>
+
+#include <Kokkos_Core.hpp>
+#include <HexElement.hpp>
+#include <FEMesh.hpp>
+
+namespace HybridFEM {
+namespace Nonlinear {
+
+template< class MeshType , typename ScalarType > struct ElementComputation ;
+
+//----------------------------------------------------------------------------
+
+template<>
+struct ElementComputation< FEMesh< double , 27 , Kokkos::Cuda > , double >
+{
+  typedef Kokkos::Cuda    execution_space ;
+
+  static const unsigned ElementNodeCount = 27 ;
+
+  typedef HexElement_Data< ElementNodeCount >                element_data_type ;
+  typedef FEMesh< double , ElementNodeCount , execution_space >  mesh_type ;
+
+  static const unsigned SpatialDim       = element_data_type::spatial_dimension ;
+  static const unsigned FunctionCount    = element_data_type::function_count ;
+  static const unsigned IntegrationCount = element_data_type::integration_count ;
+  static const unsigned TensorDim        = SpatialDim * SpatialDim ;
+
+  typedef Kokkos::View< double[][FunctionCount][FunctionCount] , execution_space > elem_matrices_type ;
+  typedef Kokkos::View< double[][FunctionCount] , execution_space > elem_vectors_type ;
+  typedef Kokkos::View< double[] , execution_space > value_vector_type ;
+
+private:
+
+  const element_data_type                       elem_data ;
+  const typename mesh_type::elem_node_ids_type  elem_node_ids ;
+  const typename mesh_type::node_coords_type    node_coords ;
+  const value_vector_type                       nodal_values ;
+  const elem_matrices_type                      element_matrices ;
+  const elem_vectors_type                       element_vectors ;
+  const float                                   coeff_K ;
+  const unsigned                                elem_count ;
+        unsigned                                invJacIndex[9][4] ;
+
+  static const unsigned j11 = 0 , j12 = 1 , j13 = 2 ,
+                        j21 = 3 , j22 = 4 , j23 = 5 ,
+                        j31 = 6 , j32 = 7 , j33 = 8 ;
+
+  // Can only handle up to 16 warps:
+  static const unsigned BlockDimX = 32 ;
+  static const unsigned BlockDimY = 7 ;
+
+  struct WorkSpace {
+    double sum[ BlockDimY ][ BlockDimX ];
+
+    double  value_at_integ[ IntegrationCount ];
+    double  gradx_at_integ[ IntegrationCount ];
+    double  grady_at_integ[ IntegrationCount ];
+    double  gradz_at_integ[ IntegrationCount ];
+
+    float  spaceJac[    BlockDimY ][ 9 ];
+    float  spaceInvJac[ BlockDimY ][ 9 ];
+
+    float  detJweight[ IntegrationCount ];
+
+    float  dpsidx[ FunctionCount ][ IntegrationCount ];
+    float  dpsidy[ FunctionCount ][ IntegrationCount ];
+    float  dpsidz[ FunctionCount ][ IntegrationCount ];
+  };
+
+public:
+
+  ElementComputation ( const mesh_type          & arg_mesh ,
+                       const elem_matrices_type & arg_element_matrices ,
+                       const elem_vectors_type  & arg_element_vectors ,
+                       const value_vector_type  & arg_nodal_values ,
+                       const float                arg_coeff_K )
+  : elem_data()
+  , elem_node_ids(    arg_mesh.elem_node_ids )
+  , node_coords(      arg_mesh.node_coords )
+  , nodal_values(     arg_nodal_values )
+  , element_matrices( arg_element_matrices )
+  , element_vectors(  arg_element_vectors )
+  , coeff_K(          arg_coeff_K )
+  , elem_count(       arg_mesh.elem_node_ids.dimension_0() )
+  {
+    const unsigned jInvJ[9][4] =
+     { { j22 , j33 , j23 , j32 } ,
+       { j13 , j32 , j12 , j33 } ,
+       { j12 , j23 , j13 , j22 } ,
+
+       { j23 , j31 , j21 , j33 } ,
+       { j11 , j33 , j13 , j31 } ,
+       { j13 , j21 , j11 , j23 } ,
+
+       { j21 , j32 , j22 , j31 } ,
+       { j12 , j31 , j11 , j32 } ,
+       { j11 , j22 , j12 , j21 } };
+
+    for ( unsigned i = 0 ; i < 9 ; ++i ) {
+    for ( unsigned j = 0 ; j < 4 ; ++j ) {
+      invJacIndex[i][j] = jInvJ[i][j] ;
+    }
+    }
+
+    const unsigned shmem = sizeof(WorkSpace);
+    const unsigned grid_max = 65535 ;
+    const unsigned grid_count = std::min( grid_max , elem_count );
+
+    // For compute capability 2.x up to 1024 threads per block
+    const dim3 block( BlockDimX , BlockDimY , 1 );
+    const dim3 grid( grid_count , 1 , 1 );
+
+    Kokkos::Impl::CudaParallelLaunch< ElementComputation >( *this , grid , block , shmem );
+  }
+
+public:
+
+  //------------------------------------
+  // Sum among the threadIdx.x
+
+  template< typename Type >
+  __device__ inline static
+  void sum_x( Type & result , const double value )
+  {
+    extern __shared__ WorkSpace work_data[] ;
+
+    volatile double * const base_sum =
+      & work_data->sum[ threadIdx.y ][ threadIdx.x ] ;
+
+    base_sum[ 0] = value ;
+
+    if ( threadIdx.x < 16 ) {
+      base_sum[0] += base_sum[16];
+      base_sum[0] += base_sum[ 8];
+      base_sum[0] += base_sum[ 4];
+      base_sum[0] += base_sum[ 2];
+      base_sum[0] += base_sum[ 1];
+    }
+
+    if ( 0 == threadIdx.x ) {
+      result = base_sum[0] ;
+    }
+  }
+
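+  // Note: the tree sum in sum_x assumes BlockDimX == 32 (one full warp per
+  // row of 'sum') and relies on warp-synchronous execution together with the
+  // 'volatile' base pointer, so no __syncthreads() is needed within the warp.
+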
+  __device__ inline static
+  void sum_x_clear()
+  {
+    extern __shared__ WorkSpace work_data[] ;
+
+    work_data->sum[ threadIdx.y ][ threadIdx.x ] = 0 ;
+  }
+
+  //------------------------------------
+  //------------------------------------
+
+  __device__ inline
+  void evaluateFunctions( const unsigned ielem ) const
+  {
+    extern __shared__ WorkSpace work_data[] ;
+
+    // Each warp (threadIdx.y) computes an integration point
+    // Each thread is responsible for a node / function.
+
+    const unsigned iFunc = threadIdx.x ;
+    const bool     hasFunc = iFunc < FunctionCount ;
+
+    //------------------------------------
+    // Each warp gathers a different variable into 'elem_mat' shared memory.
+
+    if ( hasFunc ) {
+
+      const unsigned node = elem_node_ids( ielem , iFunc );
+
+      for ( unsigned iy = threadIdx.y ; iy < 4 ; iy += blockDim.y ) {
+      switch( iy ) {
+      case 0 : work_data->sum[0][iFunc] = node_coords(node,0); break ;
+      case 1 : work_data->sum[1][iFunc] = node_coords(node,1); break ;
+      case 2 : work_data->sum[2][iFunc] = node_coords(node,2); break ;
+      case 3 : work_data->sum[3][iFunc] = nodal_values(node); break ;
+      default: break ;
+      }
+      }
+    }
+
+    __syncthreads(); // Wait for all warps to finish gathering
+
+    // now get local 'const' copies in register space:
+
+    const double x       = work_data->sum[0][ iFunc ];
+    const double y       = work_data->sum[1][ iFunc ];
+    const double z       = work_data->sum[2][ iFunc ];
+    const double dof_val = work_data->sum[3][ iFunc ];
+
+    __syncthreads(); // Wait for all warps to finish extracting
+
+    sum_x_clear(); // Make sure summation scratch is zero
+
+    //------------------------------------
+    // Each warp is now on its own computing an integration point
+    // so no further explicit synchronizations are required.
+
+    if ( hasFunc ) {
+
+      float * const J    = work_data->spaceJac[    threadIdx.y ];
+      float * const invJ = work_data->spaceInvJac[ threadIdx.y ];
+
+      for ( unsigned iInt = threadIdx.y ;
+                     iInt < IntegrationCount ; iInt += blockDim.y ) {
+
+        const float val = elem_data.values[iInt][iFunc] ;
+        const float gx  = elem_data.gradients[iInt][0][iFunc] ;
+        const float gy  = elem_data.gradients[iInt][1][iFunc] ;
+        const float gz  = elem_data.gradients[iInt][2][iFunc] ;
+
+        sum_x( J[j11], gx * x );
+        sum_x( J[j12], gx * y );
+        sum_x( J[j13], gx * z );
+
+        sum_x( J[j21], gy * x );
+        sum_x( J[j22], gy * y );
+        sum_x( J[j23], gy * z );
+
+        sum_x( J[j31], gz * x );
+        sum_x( J[j32], gz * y );
+        sum_x( J[j33], gz * z );
+
+        // Inverse Jacobian; only enough parallel work for 9 threads in the warp
+
+        if ( iFunc < TensorDim ) {
+
+          invJ[ iFunc ] =
+            J[ invJacIndex[iFunc][0] ] * J[ invJacIndex[iFunc][1] ] -
+            J[ invJacIndex[iFunc][2] ] * J[ invJacIndex[iFunc][3] ] ;
+
+          // Let all threads in the warp compute determinant into a register
+
+          const float detJ = J[j11] * invJ[j11] +
+                             J[j21] * invJ[j12] +
+                             J[j31] * invJ[j13] ;
+
+          invJ[ iFunc ] /= detJ ;
+
+          if ( 0 == iFunc ) {
+            work_data->detJweight[ iInt ] = detJ * elem_data.weights[ iInt ] ;
+          }
+        }
+
+        // Transform basis gradients and compute value and gradient
+
+        const float dx = gx * invJ[j11] + gy * invJ[j12] + gz * invJ[j13];
+        const float dy = gx * invJ[j21] + gy * invJ[j22] + gz * invJ[j23];
+        const float dz = gx * invJ[j31] + gy * invJ[j32] + gz * invJ[j33];
+
+        work_data->dpsidx[iFunc][iInt] = dx ;
+        work_data->dpsidy[iFunc][iInt] = dy ;
+        work_data->dpsidz[iFunc][iInt] = dz ;
+
+        sum_x( work_data->gradx_at_integ[iInt] , dof_val * dx );
+        sum_x( work_data->grady_at_integ[iInt] , dof_val * dy );
+        sum_x( work_data->gradz_at_integ[iInt] , dof_val * dz );
+        sum_x( work_data->value_at_integ[iInt] , dof_val * val );
+      }
+    }
+
+    __syncthreads(); // All shared data must be populated at return.
+  }
+
+  __device__ inline
+  void contributeResidualJacobian( const unsigned ielem ) const
+  {
+    extern __shared__ WorkSpace work_data[] ;
+
+    sum_x_clear(); // Make sure summation scratch is zero
+
+    // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$
+    // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$
+
+    const unsigned iInt = threadIdx.x ;
+
+    if ( iInt < IntegrationCount ) {
+
+      const double value_at_integ = work_data->value_at_integ[ iInt ] ;
+      const double gradx_at_integ = work_data->gradx_at_integ[ iInt ] ;
+      const double grady_at_integ = work_data->grady_at_integ[ iInt ] ;
+      const double gradz_at_integ = work_data->gradz_at_integ[ iInt ] ;
+
+      const float detJweight     = work_data->detJweight[ iInt ] ;
+      const float coeff_K_detJweight = coeff_K * detJweight ;
+
+      for ( unsigned iRow = threadIdx.y ;
+                     iRow < FunctionCount ; iRow += blockDim.y ) {
+
+        const float value_row  = elem_data.values[ iInt ][ iRow ] * detJweight ;
+        const float dpsidx_row = work_data->dpsidx[ iRow ][ iInt ] * coeff_K_detJweight ;
+        const float dpsidy_row = work_data->dpsidy[ iRow ][ iInt ] * coeff_K_detJweight ;
+        const float dpsidz_row = work_data->dpsidz[ iRow ][ iInt ] * coeff_K_detJweight ;
+
+        const double res_del = dpsidx_row * gradx_at_integ +
+                               dpsidy_row * grady_at_integ +
+                               dpsidz_row * gradz_at_integ ;
+
+        const double res_val = value_at_integ * value_at_integ * value_row ;
+        const double jac_val_row = 2 * value_at_integ * value_row ;
+
+        sum_x( element_vectors( ielem , iRow ) , res_del + res_val );
+
+        for ( unsigned iCol = 0 ; iCol < FunctionCount ; ++iCol ) {
+
+          const float jac_del =
+            dpsidx_row * work_data->dpsidx[iCol][iInt] +
+            dpsidy_row * work_data->dpsidy[iCol][iInt] +
+            dpsidz_row * work_data->dpsidz[iCol][iInt] ;
+
+          const double jac_val =
+            jac_val_row * elem_data.values[ iInt ][ iCol ] ;
+
+          sum_x( element_matrices( ielem , iRow , iCol ) , jac_del + jac_val );
+        }
+      }
+    }
+
+    __syncthreads(); // All warps finish before refilling shared data
+  }
+
+  __device__ inline
+  void operator()(void) const
+  {
+    extern __shared__ WorkSpace work_data[] ;
+
+    for ( unsigned ielem = blockIdx.x ; ielem < elem_count ; ielem += gridDim.x ) {
+
+      evaluateFunctions( ielem );
+
+      contributeResidualJacobian( ielem );
+    }
+  }
+
+}; /* ElementComputation */
+
+} /* namespace Nonlinear */
+} /* namespace HybridFEM */
+
diff --git a/packages/kokkos/example/multi_fem/NonlinearFunctors.hpp b/packages/kokkos/example/multi_fem/NonlinearFunctors.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ef831a569878ae494590f09d59547c88083398b8
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/NonlinearFunctors.hpp
@@ -0,0 +1,482 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_NONLINEARFUNCTORS_HPP
+#define KOKKOS_NONLINEARFUNCTORS_HPP
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <cstdlib>
+#include <cmath>
+
+namespace HybridFEM {
+namespace Nonlinear {
+
+template< class MeshType , typename ScalarType > struct ElementComputation ;
+template< class MeshType , typename ScalarType > struct DirichletSolution ;
+template< class MeshType , typename ScalarType > struct DirichletResidual ;
+
+}
+}
+
+/* A Cuda-specific specialization for the element computation functor. */
+#if defined( __CUDACC__ )
+#include <NonlinearElement_Cuda.hpp>
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace HybridFEM {
+namespace Nonlinear {
+
+template< typename ScalarCoordType , unsigned ElemNode , class DeviceType ,
+          typename ScalarType >
+struct ElementComputation<
+  FEMesh< ScalarCoordType , ElemNode , DeviceType > , ScalarType >
+{
+  typedef DeviceType  execution_space;
+  typedef ScalarType           scalar_type ;
+
+  static const unsigned ElementNodeCount = ElemNode ;
+
+  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
+
+  typedef HexElement_Data< ElementNodeCount > element_data_type ;
+
+  static const unsigned SpatialDim       = element_data_type::spatial_dimension ;
+  static const unsigned FunctionCount    = element_data_type::function_count ;
+  static const unsigned IntegrationCount = element_data_type::integration_count ;
+  static const unsigned TensorDim        = SpatialDim * SpatialDim ;
+
+  typedef Kokkos::View< scalar_type[][FunctionCount][FunctionCount] , execution_space > elem_matrices_type ;
+  typedef Kokkos::View< scalar_type[][FunctionCount] , execution_space > elem_vectors_type ;
+  typedef Kokkos::View< scalar_type[] , execution_space > value_vector_type ;
+
+
+private:
+
+  const element_data_type                 elem_data ;
+  typename mesh_type::elem_node_ids_type  elem_node_ids ;
+  typename mesh_type::node_coords_type    node_coords ;
+  value_vector_type                       nodal_values ;
+  elem_matrices_type                      element_matrices ;
+  elem_vectors_type                       element_vectors ;
+  scalar_type                             coeff_K ;
+
+public:
+
+  ElementComputation( const mesh_type   & arg_mesh ,
+                      const elem_matrices_type  & arg_element_matrices ,
+                      const elem_vectors_type   & arg_element_vectors ,
+                      const value_vector_type   & arg_nodal_values ,
+                      const scalar_type   arg_coeff_K )
+  : elem_data()
+  , elem_node_ids( arg_mesh.elem_node_ids )
+  , node_coords(   arg_mesh.node_coords )
+  , nodal_values(   arg_nodal_values )
+  , element_matrices( arg_element_matrices )
+  , element_vectors( arg_element_vectors )
+  , coeff_K( arg_coeff_K )
+  {
+    const size_t elem_count = arg_mesh.elem_node_ids.dimension_0();
+
+    parallel_for( elem_count , *this );
+  }
+
+  //------------------------------------
+
+  static const unsigned FLOPS_transform_gradients =
+     /* Jacobian */           FunctionCount * TensorDim * 2 +
+     /* Inverse jacobian */   TensorDim * 6 + 6 +
+     /* Gradient transform */ FunctionCount * 15 ;
+
+  KOKKOS_INLINE_FUNCTION
+  float transform_gradients(
+    const float grad[][ FunctionCount ] , // Gradient of bases master element
+    const double x[] ,
+    const double y[] ,
+    const double z[] ,
+    float dpsidx[] ,
+    float dpsidy[] ,
+    float dpsidz[] ) const
+  {
+    enum { j11 = 0 , j12 = 1 , j13 = 2 ,
+           j21 = 3 , j22 = 4 , j23 = 5 ,
+           j31 = 6 , j32 = 7 , j33 = 8 };
+
+    // Jacobian accumulation:
+
+    double J[ TensorDim ] = { 0, 0, 0,  0, 0, 0,  0, 0, 0 };
+
+    for( unsigned i = 0; i < FunctionCount ; ++i ) {
+      const double x1 = x[i] ;
+      const double x2 = y[i] ;
+      const double x3 = z[i] ;
+
+      const float g1 = grad[0][i] ;
+      const float g2 = grad[1][i] ;
+      const float g3 = grad[2][i] ;
+
+      J[j11] += g1 * x1 ;
+      J[j12] += g1 * x2 ;
+      J[j13] += g1 * x3 ;
+
+      J[j21] += g2 * x1 ;
+      J[j22] += g2 * x2 ;
+      J[j23] += g2 * x3 ;
+
+      J[j31] += g3 * x1 ;
+      J[j32] += g3 * x2 ;
+      J[j33] += g3 * x3 ;
+    }
+
+    // Inverse jacobian:
+
+    float invJ[ TensorDim ] = {
+      static_cast<float>( J[j22] * J[j33] - J[j23] * J[j32] ) ,
+      static_cast<float>( J[j13] * J[j32] - J[j12] * J[j33] ) ,
+      static_cast<float>( J[j12] * J[j23] - J[j13] * J[j22] ) ,
+
+      static_cast<float>( J[j23] * J[j31] - J[j21] * J[j33] ) ,
+      static_cast<float>( J[j11] * J[j33] - J[j13] * J[j31] ) ,
+      static_cast<float>( J[j13] * J[j21] - J[j11] * J[j23] ) ,
+
+      static_cast<float>( J[j21] * J[j32] - J[j22] * J[j31] ) ,
+      static_cast<float>( J[j12] * J[j31] - J[j11] * J[j32] ) ,
+      static_cast<float>( J[j11] * J[j22] - J[j12] * J[j21] ) };
+
+    const float detJ = J[j11] * invJ[j11] +
+                       J[j21] * invJ[j12] +
+                       J[j31] * invJ[j13] ;
+
+    const float detJinv = 1.0 / detJ ;
+
+    for ( unsigned i = 0 ; i < TensorDim ; ++i ) { invJ[i] *= detJinv ; }
+
+    // Transform gradients:
+
+    for( unsigned i = 0; i < FunctionCount ; ++i ) {
+      const float g0 = grad[0][i];
+      const float g1 = grad[1][i];
+      const float g2 = grad[2][i];
+
+      dpsidx[i] = g0 * invJ[j11] + g1 * invJ[j12] + g2 * invJ[j13];
+      dpsidy[i] = g0 * invJ[j21] + g1 * invJ[j22] + g2 * invJ[j23];
+      dpsidz[i] = g0 * invJ[j31] + g1 * invJ[j32] + g2 * invJ[j33];
+    }
+
+    return detJ ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void contributeResidualJacobian(
+    const float coeff_k ,
+    const double dof_values[] ,
+    const float dpsidx[] ,
+    const float dpsidy[] ,
+    const float dpsidz[] ,
+    const float detJ ,
+    const float integ_weight ,
+    const float bases_vals[] ,
+    double elem_res[] ,
+    double elem_mat[][ FunctionCount ] ) const
+  {
+    double value_at_pt = 0 ;
+    double gradx_at_pt = 0 ;
+    double grady_at_pt = 0 ;
+    double gradz_at_pt = 0 ;
+
+    for ( unsigned m = 0 ; m < FunctionCount ; m++ ) {
+      value_at_pt += dof_values[m] * bases_vals[m] ;
+      gradx_at_pt += dof_values[m] * dpsidx[m] ;
+      grady_at_pt += dof_values[m] * dpsidy[m] ;
+      gradz_at_pt += dof_values[m] * dpsidz[m] ;
+    }
+
+    const scalar_type k_detJ_weight = coeff_k        * detJ * integ_weight ;
+    const double res_val = value_at_pt * value_at_pt * detJ * integ_weight ;
+    const double mat_val = 2.0 * value_at_pt         * detJ * integ_weight ;
+
+    // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$
+    // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$
+
+    for ( unsigned m = 0; m < FunctionCount; m++) {
+      double * const mat = elem_mat[m] ;
+      const float bases_val_m = bases_vals[m];
+      const float dpsidx_m    = dpsidx[m] ;
+      const float dpsidy_m    = dpsidy[m] ;
+      const float dpsidz_m    = dpsidz[m] ;
+
+      elem_res[m] += k_detJ_weight * ( dpsidx_m * gradx_at_pt +
+                                       dpsidy_m * grady_at_pt +
+                                       dpsidz_m * gradz_at_pt ) +
+                     res_val * bases_val_m ;
+
+      for( unsigned n = 0; n < FunctionCount; n++) {
+
+        mat[n] += k_detJ_weight * ( dpsidx_m * dpsidx[n] +
+                                    dpsidy_m * dpsidy[n] +
+                                    dpsidz_m * dpsidz[n] ) +
+                  mat_val * bases_val_m * bases_vals[n];
+      }
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned ielem ) const
+  {
+    // Gather nodal coordinates and solution vector:
+
+    double x[ FunctionCount ] ;
+    double y[ FunctionCount ] ;
+    double z[ FunctionCount ] ;
+    double val[ FunctionCount ] ;
+
+    for ( unsigned i = 0 ; i < ElementNodeCount ; ++i ) {
+      const unsigned node_index = elem_node_ids( ielem , i );
+
+      x[i] = node_coords( node_index , 0 );
+      y[i] = node_coords( node_index , 1 );
+      z[i] = node_coords( node_index , 2 );
+
+      val[i] = nodal_values( node_index );
+    }
+
+    double elem_vec[ FunctionCount ] ;
+    double elem_mat[ FunctionCount ][ FunctionCount ] ;
+
+    for( unsigned i = 0; i < FunctionCount ; i++ ) {
+      elem_vec[i] = 0 ;
+      for( unsigned j = 0; j < FunctionCount ; j++){
+        elem_mat[i][j] = 0 ;
+      }
+    }
+
+    for ( unsigned i = 0 ; i < IntegrationCount ; ++i ) {
+      float dpsidx[ FunctionCount ] ;
+      float dpsidy[ FunctionCount ] ;
+      float dpsidz[ FunctionCount ] ;
+
+      const float detJ =
+        transform_gradients( elem_data.gradients[i] , x , y , z ,
+                             dpsidx , dpsidy , dpsidz );
+
+      contributeResidualJacobian( coeff_K ,
+                                  val , dpsidx , dpsidy , dpsidz ,
+                                  detJ ,
+                                  elem_data.weights[i] ,
+                                  elem_data.values[i] ,
+                                  elem_vec , elem_mat );
+    }
+
+    for( unsigned i = 0; i < FunctionCount ; i++){
+      element_vectors(ielem, i) = elem_vec[i] ;
+      for( unsigned j = 0; j < FunctionCount ; j++){
+        element_matrices(ielem, i, j) = elem_mat[i][j] ;
+      }
+    }
+  }
+
+}; /* ElementComputation */
+
+//----------------------------------------------------------------------------
+
+template< typename ScalarCoordType , unsigned ElemNode , class DeviceType ,
+          typename ScalarType >
+struct DirichletSolution<
+  FEMesh< ScalarCoordType , ElemNode , DeviceType > ,
+  ScalarType >
+{
+  typedef DeviceType  execution_space;
+
+  static const unsigned ElementNodeCount = ElemNode ;
+
+  typedef Kokkos::View< ScalarType[] , execution_space >  vector_type ;
+
+  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
+
+  typename mesh_type::node_coords_type node_coords ;
+
+  vector_type     solution ;
+  ScalarCoordType bc_lower_z ;
+  ScalarCoordType bc_upper_z ;
+  ScalarType      bc_lower_value ;
+  ScalarType      bc_upper_value ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned inode ) const
+  {
+
+    // Apply the Dirichlet boundary condition to the solution vector.
+    // Boundary node values are set to either bc_lower_value or
+    // bc_upper_value, depending on which boundary face they lie on.
+    // Non-boundary entries are left at their previous value.
+
+    const ScalarCoordType z = node_coords(inode,2);
+    const bool bc_lower = z <= bc_lower_z ;
+    const bool bc_upper = bc_upper_z <= z ;
+
+    if ( bc_lower || bc_upper ) {
+      const ScalarType bc_value = bc_lower ? bc_lower_value
+                                           : bc_upper_value ;
+
+      solution(inode) = bc_value ; //  set the solution vector
+    }
+  }
+
+  static void apply( const vector_type    & solution ,
+                     const mesh_type      & mesh ,
+                     const ScalarCoordType  bc_lower_z ,
+                     const ScalarCoordType  bc_upper_z ,
+                     const ScalarType       bc_lower_value ,
+                     const ScalarType       bc_upper_value )
+  {
+    DirichletSolution op ;
+    op.node_coords    = mesh.node_coords ;
+    op.solution       = solution ;
+    op.bc_lower_z     = bc_lower_z ;
+    op.bc_upper_z     = bc_upper_z ;
+    op.bc_lower_value = bc_lower_value ;
+    op.bc_upper_value = bc_upper_value ;
+    parallel_for( solution.dimension_0() , op );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< typename ScalarCoordType , unsigned ElemNode , class DeviceType ,
+          typename ScalarType >
+struct DirichletResidual<
+  FEMesh< ScalarCoordType , ElemNode , DeviceType > , ScalarType >
+{
+  typedef DeviceType     execution_space;
+  typedef typename execution_space::size_type  size_type ;
+
+  static const unsigned ElementNodeCount = ElemNode ;
+
+  typedef Kokkos::CrsMatrix< ScalarType , execution_space >    matrix_type ;
+  typedef Kokkos::View< ScalarType[] , execution_space >  vector_type ;
+
+  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
+
+  typename mesh_type::node_coords_type node_coords ;
+  matrix_type     matrix ;
+  vector_type     rhs ;
+  ScalarCoordType bc_lower_z ;
+  ScalarCoordType bc_upper_z ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned inode ) const
+  {
+    //  Apply a Dirichlet boundary condition to row 'inode'.
+    //  To maintain the symmetry of the original global
+    //  stiffness matrix, zero out the columns that
+    //  correspond to boundary conditions and adjust the
+    //  load vector accordingly.
+
+    const size_type iBeg = matrix.graph.row_map[inode];
+    const size_type iEnd = matrix.graph.row_map[inode+1];
+
+    const ScalarCoordType z = node_coords(inode,2);
+    const bool bc_lower = z <= bc_lower_z ;
+    const bool bc_upper = bc_upper_z <= z ;
+
+    if ( bc_lower || bc_upper ) {
+      rhs(inode) = 0 ; //  zero the residual at this boundary node
+
+      //  zero each value on the row, and leave a one
+      //  on the diagonal
+
+      for( size_type i = iBeg ; i < iEnd ; i++) {
+        matrix.coefficients(i) =
+          (int) inode == matrix.graph.entries(i) ? 1 : 0 ;
+      }
+    }
+    else {
+
+      //  Find any columns that are boundary conditions.
+      //  Clear them and adjust the load vector
+
+      for( size_type i = iBeg ; i < iEnd ; i++ ) {
+        const size_type cnode = matrix.graph.entries(i) ;
+
+        const ScalarCoordType zc = node_coords(cnode,2);
+        const bool c_bc_lower = zc <= bc_lower_z ;
+        const bool c_bc_upper = bc_upper_z <= zc ;
+
+        if ( c_bc_lower || c_bc_upper ) {
+          matrix.coefficients(i) = 0 ;
+        }
+      }
+    }
+  }
+
+
+  static void apply( const matrix_type & linsys_matrix ,
+                     const vector_type & linsys_rhs ,
+                     const mesh_type   & mesh ,
+                     const ScalarCoordType  bc_lower_z ,
+                     const ScalarCoordType  bc_upper_z)
+  {
+    const size_t row_count = linsys_matrix.graph.row_map.dimension_0() - 1 ;
+
+    DirichletResidual op ;
+    op.node_coords    = mesh.node_coords ;
+    op.matrix         = linsys_matrix ;
+    op.rhs            = linsys_rhs ;
+    op.bc_lower_z     = bc_lower_z ;
+    op.bc_upper_z     = bc_upper_z ;
+    parallel_for( row_count , op );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+} /* namespace Nonlinear */
+} /* namespace HybridFEM */
+
+#endif /* #ifndef KOKKOS_NONLINEARFUNCTORS_HPP */
+
diff --git a/packages/kokkos/example/multi_fem/ParallelComm.hpp b/packages/kokkos/example/multi_fem/ParallelComm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..37fa6243c92cbf3adee6920d065b42cd9d2cfc89
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/ParallelComm.hpp
@@ -0,0 +1,167 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef PARALLELCOMM_HPP
+#define PARALLELCOMM_HPP
+
+//------------------------------------------------------------------------
+
+#include <Kokkos_Macros.hpp>
+
+//------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_MPI )
+
+#include <mpi.h>
+#include <string>
+
+namespace comm {
+
+struct Machine {
+  MPI_Comm mpi_comm ;
+
+  Machine() : mpi_comm( MPI_COMM_NULL ) {}
+
+  Machine( const Machine & rhs )
+    : mpi_comm( rhs.mpi_comm ) {}
+
+  Machine( MPI_Comm c ) : mpi_comm( c ) {}
+
+  static Machine init( int * argc , char *** argv )
+  {
+    MPI_Init( argc , argv );
+    return Machine( MPI_COMM_WORLD );
+  }
+
+  static void finalize() { MPI_Finalize(); }
+};
+
+inline
+unsigned  size( Machine machine )
+{
+  int np ; MPI_Comm_size( machine.mpi_comm , & np ); return np ;
+}
+
+inline
+unsigned  rank( Machine machine )
+{
+  int ip ; MPI_Comm_rank( machine.mpi_comm , & ip ); return ip ;
+}
+
+inline
+double max( Machine machine , double local )
+{
+  double global = 0;
+  MPI_Allreduce( & local , & global , 1 , MPI_DOUBLE , MPI_MAX , machine.mpi_comm );
+  return global ;
+}
+
+inline
+std::string command_line( Machine machine , const int argc , const char * const * const argv )
+{
+  std::string argline ;
+
+  if ( 0 == rank( machine ) ) {
+    for ( int i = 1 ; i < argc ; ++i ) {
+      argline.append(" ").append( argv[i] );
+    }
+  }
+
+  int length = argline.length();
+  MPI_Bcast( & length , 1 , MPI_INT , 0 , machine.mpi_comm );
+  argline.resize( length , ' ' );
+  MPI_Bcast( (void*) argline.data() , length , MPI_CHAR , 0 , machine.mpi_comm );
+
+  return argline ;
+}
+
+}
+
+#else /* ! defined( KOKKOS_ENABLE_MPI ) */
+
+#include <string>
+
+namespace comm {
+
+// Stub for non-parallel
+
+struct Machine {
+  static Machine init( int * , char *** )
+  { return Machine(); }
+
+  static void finalize() {}
+};
+
+inline
+unsigned  size( Machine ) { return 1 ; }
+
+inline
+unsigned  rank( Machine ) { return 0 ; }
+
+inline
+double max( Machine , double local )
+{ return local ; }
+
+inline
+std::string command_line( Machine machine , const int argc , const char * const * const argv )
+{
+  std::string argline ;
+
+  if ( 0 == rank( machine ) ) {
+    for ( int i = 1 ; i < argc ; ++i ) {
+      argline.append(" ").append( argv[i] );
+    }
+  }
+
+  return argline ;
+}
+
+}
+
+#endif /* ! defined( KOKKOS_ENABLE_MPI ) */
+
+//------------------------------------------------------------------------
+
+#endif /* #ifndef PARALLELCOMM_HPP */
+
+
diff --git a/packages/kokkos/example/multi_fem/ParallelDataMap.hpp b/packages/kokkos/example/multi_fem/ParallelDataMap.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..12d7add5a55b7e095827033a03730df376ae9bac
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/ParallelDataMap.hpp
@@ -0,0 +1,517 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_PARALLELDATAMAP_HPP
+#define KOKKOS_PARALLELDATAMAP_HPP
+
+#include <utility>
+#include <limits>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+
+#include <Kokkos_Core.hpp>
+#include <ParallelComm.hpp>
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+/** \brief  Parallel distributed data mapping
+ *
+ *  ordering { interior : { owned items not sent elsewhere }
+ *             send     : { owned items sent }
+ *             receive  : { not-owned items received } }
+ *
+ *  recv { { N ghosted items from process P : ( P , N ) } }
+ *
+ *  send { { N send items to process P : ( P , N ) } }
+ *
+ *  send_item { send item offsets within 'send' range }
+ */
+struct ParallelDataMap {
+  typedef View< unsigned*[2], HostSpace >  host_recv_type ;
+  typedef View< unsigned*[2], HostSpace >  host_send_type ;
+  typedef View< unsigned* ,   HostSpace >  host_send_item_type ;
+
+  comm::Machine        machine ;
+  host_recv_type       host_recv ;
+  host_send_type       host_send ;
+  host_send_item_type  host_send_item ;
+  unsigned             count_interior ;
+  unsigned             count_send ;
+  unsigned             count_owned ; // = count_interior + count_send
+  unsigned             count_receive ;
+
+  void assign( const unsigned arg_count_interior ,
+               const unsigned arg_count_owned ,
+               const unsigned arg_count_total ,
+               const unsigned arg_recv_msg ,
+               const unsigned arg_send_msg ,
+               const unsigned arg_send_count )
+  {
+    const std::string label("Kokkos::ParallelDataMap buffer");
+
+    count_interior = arg_count_interior ;
+    count_owned    = arg_count_owned ;
+    count_send     = arg_count_owned - arg_count_interior ;
+    count_receive  = arg_count_total - arg_count_owned ;
+
+    host_recv = host_recv_type( label , arg_recv_msg );
+    host_send = host_send_type( label , arg_send_msg );
+    host_send_item = host_send_item_type( label , arg_send_count );
+  }
+};
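+
+// A minimal sketch (with hypothetical counts, not taken from any real mesh) of
+// how the 'assign' arguments and the derived counts relate on one process that
+// owns 100 items, keeps 80 interior, sends 20, and receives 15 ghost items:
+//
+//   ParallelDataMap map ;
+//   map.assign( /* interior */ 80 , /* owned */ 100 , /* total */ 115 ,
+//               /* recv msg */ 1 , /* send msg */ 1 , /* send items */ 20 );
+//   // map.count_send    == 100 - 80  == 20
+//   // map.count_receive == 115 - 100 == 15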
+
+//----------------------------------------------------------------------------
+//PackArray
+//----------------------------------------------------------------------------
+template< class ArrayType , class Rank = void >
+struct PackArray ;
+
+template< typename DeviceType, typename ValueType >
+struct PackArray< View< ValueType* , DeviceType > , void >
+{
+  typedef DeviceType                         execution_space ;
+  typedef typename DeviceType::size_type     size_type ;
+  typedef View< ValueType* , execution_space >  array_type ;
+  typedef View< ValueType* , execution_space >  buffer_type ;
+
+private:
+
+  buffer_type  output ;
+  array_type   input ;
+  size_type    base ;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i ) const
+  { output[i] = input(base+i); }
+
+  inline
+  static
+  void pack( const buffer_type & arg_output ,
+             const size_type     arg_begin ,
+             const size_type     arg_count ,
+             const array_type  & arg_input )
+  {
+    PackArray op ;
+    op.output = arg_output ;
+    op.input  = arg_input ;
+    op.base   = arg_begin ;
+    parallel_for( arg_count , op );
+  }
+};
+
+template< typename DeviceType, typename ValueType , unsigned N1 >
+struct PackArray< View< ValueType*[N1] , DeviceType > , void >
+{
+  typedef DeviceType                                  execution_space ;
+  typedef typename DeviceType::size_type              size_type ;
+  typedef View< ValueType*[N1] , execution_space >       array_type ;
+  typedef View< ValueType* , execution_space >           buffer_type ;
+
+private:
+
+  buffer_type  output ;
+  array_type   input ;
+  size_type    base ;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i ) const
+  {
+    for ( size_type j = 0 , k = i * N1 ; j < N1 ; ++j , ++k ) {
+      output[k] = input(base+i,j);
+    }
+  }
+
+  inline static
+  void pack( const buffer_type & arg_output ,
+             const size_type     arg_begin ,
+             const size_type     arg_count ,
+             const array_type  & arg_input )
+  {
+    if ( arg_count ) {
+      PackArray op ;
+      op.output = arg_output ;
+      op.input  = arg_input ;
+      op.base   = arg_begin ;
+      parallel_for( arg_count , op );
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+//UnpackArray
+//----------------------------------------------------------------------------
+template< class ArrayType , class Rank = void > struct UnpackArray ;
+
+template< typename DeviceType, typename ValueType >
+struct UnpackArray< View< ValueType* , DeviceType > , void >
+{
+  typedef DeviceType                         execution_space ;
+  typedef typename DeviceType::size_type     size_type ;
+  typedef View< ValueType* , execution_space >  array_type ;
+  typedef View< ValueType* , execution_space >  buffer_type ;
+
+private:
+
+  array_type   output ;
+  buffer_type  input ;
+  size_type    base ;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i ) const
+  { output(base+i) = input[i]; }
+
+  inline
+  static
+  void unpack( const array_type  & arg_output ,
+               const buffer_type & arg_input ,
+               const size_type     arg_begin ,
+               const size_type     arg_count )
+  {
+    UnpackArray op ;
+    op.output = arg_output ;
+    op.input  = arg_input ;
+    op.base   = arg_begin ;
+    parallel_for( arg_count , op );
+  }
+};
+
+template< typename DeviceType, typename ValueType , unsigned N1 >
+struct UnpackArray< View< ValueType*[N1] , DeviceType > , void >
+{
+  typedef DeviceType                                  execution_space ;
+  typedef typename DeviceType::size_type              size_type ;
+  typedef View< ValueType* , execution_space >           buffer_type ;
+  typedef View< ValueType*[N1] , execution_space >       array_type ;
+
+private:
+
+  array_type   output ;
+  buffer_type  input ;
+  size_type    base ;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i ) const
+  {
+    for ( size_type j = 0 , k = i * N1 ; j < N1 ; ++j , ++k ) {
+      output(base+i,j) = input(k);
+    }
+  }
+
+  inline
+  static
+  void unpack( const array_type  & arg_output ,
+               const buffer_type & arg_input ,
+               const size_type     arg_begin ,
+               const size_type     arg_count )
+  {
+    if ( arg_count ) {
+      UnpackArray op ;
+      op.output = arg_output ;
+      op.input  = arg_input ;
+      op.base   = arg_begin ;
+      parallel_for( arg_count , op );
+    }
+  }
+};
+//----------------------------------------------------------------------------
+template< class ValueType , class Device , class DataMap >
+class AsyncExchange ;
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+// Application call procedure:
+//
+// construct: AsyncExchange object
+// * pack send buffer on device
+// initiate: copy send buffer from device to host
+// * dispatch asynchronous local work
+// complete: send/receive on host, copy receive buffer to device
+// * unpack receive buffer on device
+// destroy: AsyncExchange object
+//
+//----------------------------------------------------------------------------
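+//
+// A minimal usage sketch of that procedure (the names 'data_map', 'dev_values',
+// and the chunk size of 1 are hypothetical, not defined in this header):
+//
+//   Kokkos::AsyncExchange< double , Device , Kokkos::ParallelDataMap >
+//     exchange( data_map , 1 );
+//
+//   // pack the owned-but-shared entries into the device buffer
+//   Kokkos::PackArray< Kokkos::View<double*,Device> >::pack(
+//     exchange.buffer() , data_map.count_interior , data_map.count_send , dev_values );
+//
+//   exchange.setup();          // post receives, copy packed send buffer to host
+//   // ... dispatch asynchronous local work on the device here ...
+//   exchange.send_receive();   // blocking sends, wait for receives, copy to device
+//
+//   // unpack the received (ghost) entries from the device buffer
+//   Kokkos::UnpackArray< Kokkos::View<double*,Device> >::unpack(
+//     dev_values , exchange.buffer() , data_map.count_owned , data_map.count_receive );
+//
+//----------------------------------------------------------------------------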
+
+#ifdef KOKKOS_ENABLE_MPI
+
+namespace Kokkos {
+
+template< class ValueType , class Device >
+class AsyncExchange< ValueType, Device , Kokkos::ParallelDataMap > {
+public:
+
+  typedef Device                                    execution_space ;
+  typedef Kokkos::ParallelDataMap                   data_map_type ;
+  typedef Kokkos::View< ValueType* , execution_space >  buffer_dev_type ;
+  typedef typename buffer_dev_type::HostMirror      buffer_host_type ;
+
+private:
+
+  static const int mpi_tag = 11 ;
+
+  const data_map_type  data_map ;
+  unsigned             chunk_size ;
+  unsigned             send_count_max ;
+  buffer_host_type     host_recv_buffer ;
+  buffer_host_type     host_send_buffer ;
+  buffer_host_type     send_msg_buffer ;
+  buffer_dev_type      dev_buffer ;
+  buffer_dev_type      dev_send_buffer ; // Subview for send
+  buffer_dev_type      dev_recv_buffer ; // Subview for receive
+  std::vector< MPI_Request > recv_request ;
+
+public:
+
+  const buffer_dev_type & buffer() const { return dev_buffer ; }
+
+  AsyncExchange( const data_map_type & arg_data_map ,
+                 const size_t          arg_chunk )
+  : data_map( arg_data_map )
+  , chunk_size( arg_chunk )
+  , send_count_max( 0 )
+  , host_recv_buffer()
+  , host_send_buffer()
+  , send_msg_buffer()
+  , dev_buffer()
+  , dev_send_buffer()
+  , dev_recv_buffer()
+  , recv_request()
+  {
+    const size_t send_msg_count = arg_data_map.host_send.dimension_0();
+    const size_t recv_msg_count = arg_data_map.host_recv.dimension_0();
+
+    const size_t send_msg_length = arg_chunk * arg_data_map.count_send ;
+    const size_t recv_msg_length = arg_chunk * arg_data_map.count_receive ;
+
+    for ( size_t i = 0 ; i < send_msg_count ; ++i ) {
+      send_count_max = std::max( send_count_max ,
+                                 (unsigned) arg_data_map.host_send(i,1) );
+    }
+
+    // A single shared buffer on the device can be used for
+    // send and receive message buffers.
+    dev_buffer = buffer_dev_type(
+                     std::string("AsyncExchange dev_buffer") ,
+                     std::max( send_msg_length , recv_msg_length ) );
+
+    // Total send subview of the device buffer
+    dev_send_buffer =
+      Kokkos::subview( dev_buffer , std::pair<size_t,size_t>( 0 , send_msg_length ) );
+
+    // Total receive subview of the device buffer
+    dev_recv_buffer =
+      Kokkos::subview( dev_buffer , std::pair<size_t,size_t>( 0 , recv_msg_length ) );
+
+    // Total receive message buffer on the host:
+    host_recv_buffer = buffer_host_type(
+                           std::string("AsyncExchange host_recv_buffer") ,
+                           recv_msg_length );
+
+    // Total send message buffer on the host:
+    host_send_buffer = buffer_host_type(
+                           std::string("AsyncExchange host_send_buffer") ,
+                           send_msg_length );
+
+    // Individual send message buffer on the host:
+    send_msg_buffer = buffer_host_type(
+                          std::string("AsyncExchange send_msg_buffer") ,
+                          arg_chunk * send_count_max );
+
+    // MPI asynchronous receive request handles:
+    recv_request.assign( recv_msg_count , MPI_REQUEST_NULL );
+  }
+
+  //------------------------------------------------------------------------
+
+  void setup()
+  {
+    { // Post receives:
+      const size_t recv_msg_count = data_map.host_recv.dimension_0();
+
+      ValueType * ptr = host_recv_buffer.ptr_on_device();
+
+      for ( size_t i = 0 ; i < recv_msg_count ; ++i ) {
+        const int proc  = data_map.host_recv(i,0);
+        const int count = data_map.host_recv(i,1) * chunk_size ;
+
+        MPI_Irecv( ptr , count * sizeof(ValueType) , MPI_BYTE ,
+                   proc , mpi_tag , data_map.machine.mpi_comm ,
+                   & recv_request[i] );
+
+        ptr += count ;
+      }
+    }
+
+    // Copy send buffer from the device to host memory for sending
+
+    Kokkos::deep_copy( host_send_buffer , dev_send_buffer );
+
+    // Done with the device until communication is complete.
+    // Application can dispatch asynchronous work on the device.
+  }
+
+  // Application can dispatch local work to device ...
+  // No communication progress until main thread calls 'send_receive'
+
+  void send_receive()
+  {
+    const size_t recv_msg_count = data_map.host_recv.dimension_0();
+    const size_t send_msg_count = data_map.host_send.dimension_0();
+
+    // Pack and send:
+
+    for ( size_t i = 0 , j = 0 ; i < send_msg_count ; ++i ) {
+      const int proc  = data_map.host_send(i,0);
+      const int count = data_map.host_send(i,1);
+
+      for ( int k = 0 , km = 0 ; k < count ; ++k , ++j ) {
+        const int km_end = km + chunk_size ;
+        int ki = chunk_size * data_map.host_send_item(j);
+
+        for ( ; km < km_end ; ++km , ++ki ) {
+          send_msg_buffer[km] = host_send_buffer[ki];
+        }
+      }
+
+      // MPI_Ssend blocks until
+      // (1) a receive is matched for the message and
+      // (2) the send buffer can be re-used.
+      //
+      // It is suggested that MPI_Ssend will have the best performance:
+      // http://www.mcs.anl.gov/research/projects/mpi/sendmode.html .
+
+      MPI_Ssend( send_msg_buffer.ptr_on_device(),
+                 count * chunk_size * sizeof(ValueType) , MPI_BYTE ,
+                 proc , mpi_tag , data_map.machine.mpi_comm );
+    }
+
+    // Wait for receives and verify:
+
+    for ( size_t i = 0 ; i < recv_msg_count ; ++i ) {
+      MPI_Status recv_status ;
+      int recv_which = 0 ;
+      int recv_size  = 0 ;
+
+      MPI_Waitany( recv_msg_count , & recv_request[0] ,
+                   & recv_which , & recv_status );
+
+      const int recv_proc = recv_status.MPI_SOURCE ;
+
+      MPI_Get_count( & recv_status , MPI_BYTE , & recv_size );
+
+      // Verify message properly received:
+
+      const int  expected_proc = data_map.host_recv(recv_which,0);
+      const int  expected_size = data_map.host_recv(recv_which,1) *
+                                 chunk_size * sizeof(ValueType);
+
+      if ( ( expected_proc != recv_proc ) ||
+           ( expected_size != recv_size ) ) {
+        std::ostringstream msg ;
+        msg << "AsyncExchange error:"
+            << " P" << comm::rank( data_map.machine )
+            << " received from P" << recv_proc
+            << " size "     << recv_size
+            << " expected " << expected_size
+            << " from P"    << expected_proc ;
+        throw std::runtime_error( msg.str() );
+      }
+    }
+
+    // Copy received data to device memory.
+
+    Kokkos::deep_copy( dev_recv_buffer , host_recv_buffer );
+  }
+};
+
+} // namespace Kokkos
+
+#else /* ! #ifdef KOKKOS_ENABLE_MPI */
+
+namespace Kokkos {
+
+template< class ValueType , class Device >
+class AsyncExchange< ValueType, Device , Kokkos::ParallelDataMap > {
+public:
+
+  typedef Device                                    execution_space ;
+  typedef Kokkos::ParallelDataMap                   data_map_type ;
+  typedef Kokkos::View< ValueType* , execution_space >  buffer_dev_type ;
+  typedef typename buffer_dev_type::HostMirror      buffer_host_type ;
+
+  buffer_dev_type      dev_buffer ;
+
+public:
+
+  const buffer_dev_type & buffer() const { return dev_buffer ; }
+
+  AsyncExchange( const data_map_type & , const size_t )
+  : dev_buffer()
+  { }
+
+  //------------------------------------------------------------------------
+
+  void setup() { }
+
+  void send_receive() { }
+};
+
+} // namespace Kokkos
+
+#endif /* ! #ifdef KOKKOS_ENABLE_MPI */
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_PARALLELDATAMAP_HPP */
+
+
diff --git a/packages/kokkos/example/multi_fem/ParallelMachine.cpp b/packages/kokkos/example/multi_fem/ParallelMachine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..49545e6a57f9e7d9e687d3dd36e8516ebe2d15cb
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/ParallelMachine.cpp
@@ -0,0 +1,178 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if 0
+
+#include <cstdlib>
+#include <cstring>
+
+#include <ParallelMachine.hpp>
+
+#include <Kokkos_Core.hpp>
+
+#if ! defined( KOKKOS_ENABLE_MPI )
+#define MPI_COMM_NULL 0
+#endif
+
+//------------------------------------------------------------------------
+
+namespace Parallel {
+
+Machine::Machine( int * argc , char *** argv )
+  : m_mpi_comm( MPI_COMM_NULL )
+  , m_mpi_size(0)
+  , m_mpi_rank(0)
+  , m_mpi_gpu(0)
+{
+
+#if defined( KOKKOS_ENABLE_CUDA )
+  //------------------------------------
+  // Might be using a Cuda aware version of MPI.
+  // Must select Cuda device before initializing MPI.
+  {
+    int i = 1 ;
+    for ( ; i < *argc && strcmp((*argv)[i],"mpi_cuda") ; ++i );
+
+    if ( i < *argc ) {
+      // Determine, if possible, what will be the node-local
+      // rank of the MPI process once MPI has been initialized.
+      // This rank is needed to set the Cuda device before 'mvapich'
+      // is initialized.
+
+      const char * const mvapich_local_rank = getenv("MV2_COMM_WORLD_LOCAL_RANK");
+      const char * const slurm_local_rank   = getenv("SLURM_LOCALID");
+
+      const int pre_mpi_local_rank =
+        0 != mvapich_local_rank ? atoi( mvapich_local_rank ) : (
+        0 != slurm_local_rank   ? atoi( slurm_local_rank ) : (
+        -1 ) );
+
+      if ( 0 <= pre_mpi_local_rank ) {
+
+        const int ngpu = Kokkos::Cuda::detect_device_count();
+
+        const int cuda_device_rank = pre_mpi_local_rank % ngpu ;
+
+        Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cuda_device_rank ) );
+
+        m_mpi_gpu = 1 ;
+      }
+    }
+  }
+#endif
+
+  //------------------------------------
+
+#if defined( KOKKOS_ENABLE_MPI )
+  MPI_Init( argc , argv );
+  m_mpi_comm = MPI_COMM_WORLD ;
+  MPI_Comm_size( m_mpi_comm , & m_mpi_size );
+  MPI_Comm_rank( m_mpi_comm , & m_mpi_rank );
+#endif
+
+  // Query hwloc after MPI initialization to allow MPI binding:
+  //------------------------------------
+  // Request to use host device:
+  {
+    int i = 1 ;
+    for ( ; i < *argc && strcmp((*argv)[i],"host") ; ++i );
+
+    if ( i < *argc ) {
+
+      unsigned team_count       = Kokkos::hwloc::get_available_numa_count();
+      unsigned threads_per_team = Kokkos::hwloc::get_available_cores_per_numa() *
+                                  Kokkos::hwloc::get_available_threads_per_core();
+
+      if ( i + 2 < *argc ) {
+        team_count       = atoi( (*argv)[i+1] );
+        threads_per_team = atoi( (*argv)[i+2] );
+      }
+
+      Kokkos::Threads::initialize( team_count * threads_per_team );
+    }
+  }
+
+#if defined( KOKKOS_ENABLE_CUDA )
+  //------------------------------------
+  // Request to use Cuda device and not already initialized.
+  if ( ! m_mpi_gpu ) {
+    int i = 1 ;
+    for ( ; i < *argc && strcmp((*argv)[i],"mpi_cuda") && strcmp((*argv)[i],"cuda") ; ++i );
+
+    if ( i < *argc ) {
+
+      const int ngpu = Kokkos::Cuda::detect_device_count();
+
+      const int cuda_device_rank = m_mpi_rank % ngpu ;
+
+      Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cuda_device_rank ) );
+    }
+  }
+#endif
+
+}
+
+Machine::~Machine()
+{
+  Kokkos::Threads::finalize();
+#if defined( KOKKOS_ENABLE_CUDA )
+  Kokkos::Cuda::finalize();
+#endif
+#if defined( KOKKOS_ENABLE_MPI )
+  MPI_Finalize();
+#endif
+}
+
+void Machine::print_configuration( std::ostream & msg ) const
+{
+  msg << "MPI [ " << m_mpi_rank << " / " << m_mpi_size << " ]" << std::endl ;
+  Kokkos::Threads::print_configuration( msg );
+#if defined( KOKKOS_ENABLE_CUDA )
+  Kokkos::Cuda::print_configuration( msg );
+#endif
+}
+
+}
+
+#endif /* #if 0 */
+
diff --git a/packages/kokkos/example/multi_fem/ParallelMachine.hpp b/packages/kokkos/example/multi_fem/ParallelMachine.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..14894f32138ed58814abfe69e78a515349fdc80f
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/ParallelMachine.hpp
@@ -0,0 +1,118 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#error "ParallelMachine"
+
+#ifndef PARALLELMACHINE_HPP
+#define PARALLELMACHINE_HPP
+
+//------------------------------------------------------------------------
+
+#include <iosfwd>
+
+#include <Kokkos_Core.hpp>
+
+//------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_MPI )
+#include <mpi.h>
+#else
+  typedef int MPI_Comm ;
+#endif
+
+//------------------------------------------------------------------------
+//------------------------------------------------------------------------
+
+namespace Parallel {
+
+/** \brief  Hybrid parallel machine with MPI+Kokkos::Threads or MPI+Kokkos::Cuda.
+ *
+ *  Initialization of MPI and the Kokkos device has interdependencies that this
+ *  class manages.  The command line and environment variables are queried to
+ *  initialize the Threads or Cuda device:
+ *
+ *    1)  cuda               : initializes the Cuda device.
+ *    2)  host               : initializes the Threads device with all hwloc-detected cores.
+ *    3)  host #gang #worker : initializes the Threads device with the specified
+ *                             gang and worker counts.
+ *
+ *  Example invocations are sketched after the class definition below.
+ */
+class Machine {
+private:
+
+  MPI_Comm m_mpi_comm ;
+  int      m_mpi_size ;
+  int      m_mpi_rank ;
+  unsigned m_mpi_gpu ;
+  unsigned m_gpu_arch ;
+
+  Machine();
+  Machine( const Machine & );
+  Machine & operator = ( const Machine & );
+
+public:
+
+  /** \brief  Coordinated initialize MPI, Cuda, or Threads devices from 'main'.  */
+  Machine( int * argc , char *** argv );
+
+  ~Machine();
+
+  MPI_Comm mpi_comm() const { return m_mpi_comm ; }
+
+  int mpi_size() const { return m_mpi_size ; }
+  int mpi_rank() const { return m_mpi_rank ; }
+
+  /** \brief  If using MPI that can directly operate on GPU memory */
+  bool mpi_gpu() const { return m_mpi_gpu ; }
+
+  /** \brief  If using GPU then what architecture */
+  unsigned gpu_arch() const { return m_gpu_arch ; }
+
+  void print_configuration( std::ostream & ) const ;
+};
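+
+// Hypothetical example invocations of a program built around Parallel::Machine
+// (the executable name 'example.exe' is illustrative only):
+//
+//   mpirun -np 4 ./example.exe cuda     # one Cuda device selected per MPI rank
+//   ./example.exe host                  # Threads with all hwloc-detected cores
+//   ./example.exe host 2 8              # Threads with 2 gangs of 8 workers each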
+
+}
+
+//------------------------------------------------------------------------
+
+#endif /* #ifndef PARALLELMACHINE_HPP */
+
+
diff --git a/packages/kokkos/example/multi_fem/SparseLinearSystem.hpp b/packages/kokkos/example/multi_fem/SparseLinearSystem.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5180e57b93aded357f37d2a2df8fcca399d31a88
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/SparseLinearSystem.hpp
@@ -0,0 +1,400 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef SPARSELINEARSYSTEM_HPP
+#define SPARSELINEARSYSTEM_HPP
+
+#include <cmath>
+#include <impl/Kokkos_Timer.hpp>
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_StaticCrsGraph.hpp>
+
+#include <LinAlgBLAS.hpp>
+
+namespace Kokkos {
+
+template< typename ScalarType , class Device >
+struct CrsMatrix {
+  typedef Device      execution_space ;
+  typedef ScalarType  value_type ;
+
+  typedef StaticCrsGraph< int , execution_space , void , int >  graph_type ;
+  typedef View< value_type* , execution_space >   coefficients_type ;
+
+  graph_type         graph ;
+  coefficients_type  coefficients ;
+};
+
+//----------------------------------------------------------------------------
+
+namespace Impl {
+
+template< class Matrix , class OutputVector , class InputVector >
+struct Multiply ;
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< typename AScalarType ,
+          typename VScalarType ,
+          class DeviceType >
+struct Multiply< CrsMatrix<AScalarType,DeviceType> ,
+                 View<VScalarType*,DeviceType > ,
+                 View<VScalarType*,DeviceType > >
+{
+  typedef DeviceType                       execution_space ;
+  typedef typename execution_space::size_type  size_type ;
+
+  typedef View<       VScalarType*, execution_space, MemoryUnmanaged >  vector_type ;
+  typedef View< const VScalarType*, execution_space, MemoryUnmanaged >  vector_const_type ;
+
+  typedef CrsMatrix< AScalarType , execution_space >    matrix_type ;
+
+private:
+
+  matrix_type        m_A ;
+  vector_const_type  m_x ;
+  vector_type        m_y ;
+
+public:
+
+  //--------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type iRow ) const
+  {
+    const size_type iEntryBegin = m_A.graph.row_map[iRow];
+    const size_type iEntryEnd   = m_A.graph.row_map[iRow+1];
+
+    double sum = 0 ;
+
+#if defined( __INTEL_COMPILER )
+#pragma simd reduction(+:sum)
+#pragma ivdep
+    for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
+      sum += m_A.coefficients(iEntry) * m_x( m_A.graph.entries(iEntry) );
+    }
+#else
+    for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
+      sum += m_A.coefficients(iEntry) * m_x( m_A.graph.entries(iEntry) );
+    }
+#endif
+
+    m_y(iRow) = sum ;
+  }
+
+  Multiply( const matrix_type & A ,
+            const size_type nrow ,
+            const size_type , // ncol ,
+            const vector_type & x ,
+            const vector_type & y )
+    : m_A( A ), m_x( x ), m_y( y )
+  {
+    parallel_for( nrow , *this );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+template< typename AScalarType ,
+          typename VScalarType ,
+          class Device >
+class Operator {
+  typedef CrsMatrix<AScalarType,Device>  matrix_type ;
+  typedef View<VScalarType*,Device>     vector_type ;
+
+private:
+  const CrsMatrix<AScalarType,Device> A ;
+
+  ParallelDataMap                                         data_map ;
+  AsyncExchange< VScalarType , Device , ParallelDataMap > exchange ;
+
+public:
+
+  Operator( const ParallelDataMap                  & arg_data_map ,
+            const CrsMatrix<AScalarType,Device>    & arg_A )
+    : A( arg_A )
+    , data_map( arg_data_map )
+    , exchange( arg_data_map , 1 )
+    {}
+
+  void apply( const View<VScalarType*,Device>  & x ,
+              const View<VScalarType*,Device>  & y )
+  {
+    // Gather off-processor data for 'x'
+
+    PackArray< vector_type >::pack( exchange.buffer() ,
+                                    data_map.count_interior ,
+                                    data_map.count_send , x );
+
+    exchange.setup();
+
+    // If interior & boundary matrices then could launch interior multiply
+
+    exchange.send_receive();
+
+    UnpackArray< vector_type >::unpack( x , exchange.buffer() ,
+                                        data_map.count_owned ,
+                                        data_map.count_receive );
+
+    const typename Device::size_type nrow = data_map.count_owned ;
+    const typename Device::size_type ncol = data_map.count_owned +
+                                            data_map.count_receive ;
+
+    Impl::Multiply<matrix_type,vector_type,vector_type>( A, nrow, ncol, x, y);
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< typename AScalarType , typename VScalarType , class Device >
+void cgsolve(
+  const ParallelDataMap                 data_map ,
+  const CrsMatrix<AScalarType,Device>   A ,
+  const View<VScalarType*,Device> b ,
+  const View<VScalarType*,Device> x ,
+  size_t & iteration ,
+  double & normr ,
+  double & iter_time ,
+  const size_t maximum_iteration = 200 ,
+  const double tolerance = std::numeric_limits<VScalarType>::epsilon() )
+{
+  typedef View<VScalarType*,Device> vector_type ;
+  //typedef View<VScalarType,  Device> value_type ; // unused
+
+  const size_t count_owned = data_map.count_owned ;
+  const size_t count_total = data_map.count_owned + data_map.count_receive ;
+
+  Operator<AScalarType,VScalarType,Device> matrix_operator( data_map , A );
+
+  // Need input vector to matvec to be owned + received
+  vector_type pAll ( "cg::p" , count_total );
+
+  vector_type p = Kokkos::subview( pAll , std::pair<size_t,size_t>(0,count_owned) );
+  vector_type r ( "cg::r" , count_owned );
+  vector_type Ap( "cg::Ap", count_owned );
+
+  /* r = b - A * x ; */
+
+  /* p  = x      */ deep_copy( p , x );
+  /* Ap = A * p  */ matrix_operator.apply( pAll , Ap );
+  /* r  = b - Ap */ waxpby( count_owned , 1.0 , b , -1.0 , Ap , r );
+  /* p  = r      */ deep_copy( p , r );
+
+  double old_rdot = dot( count_owned , r , data_map.machine );
+
+  normr     = std::sqrt( old_rdot );
+  iteration = 0 ;
+
+  Kokkos::Timer wall_clock ;
+
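+  // Unpreconditioned conjugate-gradient iteration:
+  //   alpha = (r,r) / (p,Ap) ;  x += alpha * p ;  r -= alpha * Ap ;
+  //   beta  = (r',r') / (r,r) ;  p = r + beta * p ;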
+  while ( tolerance < normr && iteration < maximum_iteration ) {
+
+    /* pAp_dot = dot( p , Ap = A * p ) */
+
+    /* Ap = A * p  */ matrix_operator.apply( pAll , Ap );
+
+    const double pAp_dot = dot( count_owned , p , Ap , data_map.machine );
+    const double alpha   = old_rdot / pAp_dot ;
+
+    /* x += alpha * p ;  */ axpy( count_owned,  alpha, p , x );
+    /* r -= alpha * Ap ; */ axpy( count_owned, -alpha, Ap, r );
+
+    const double r_dot = dot( count_owned , r , data_map.machine );
+    const double beta  = r_dot / old_rdot ;
+
+    /* p = r + beta * p ; */ xpby( count_owned , r , beta , p );
+
+    normr = std::sqrt( old_rdot = r_dot );
+    ++iteration ;
+  }
+
+  iter_time = wall_clock.seconds();
+}
+
+//----------------------------------------------------------------------------
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_CUDA )
+
+#if ( CUDA_VERSION < 6000 )
+#pragma message "cusparse_v2.h"
+#include <cusparse_v2.h>
+#else
+#pragma message "cusparse.h"
+#include <cusparse.h>
+#endif
+
+namespace Kokkos {
+namespace Impl {
+
+struct CudaSparseSingleton {
+  cusparseHandle_t   handle;
+  cusparseMatDescr_t descra;
+
+  CudaSparseSingleton()
+  {
+    cusparseCreate( & handle );
+    cusparseCreateMatDescr( & descra );
+    cusparseSetMatType(       descra , CUSPARSE_MATRIX_TYPE_GENERAL );
+    cusparseSetMatIndexBase(  descra , CUSPARSE_INDEX_BASE_ZERO );
+  }
+
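+  // Access the process-wide instance; its definition lives in a
+  // CUDA-compiled translation unit (see TestCuda.cpp in this example).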
+  static CudaSparseSingleton & singleton();
+
+};
+
+template<>
+struct Multiply< CrsMatrix<double,Cuda> ,
+                 View<double*,Cuda > ,
+                 View<double*,Cuda > >
+{
+  typedef Cuda                                      execution_space ;
+  typedef execution_space::size_type                    size_type ;
+  typedef double                                    scalar_type ;
+  typedef View< scalar_type* , execution_space >        vector_type ;
+  typedef CrsMatrix< scalar_type , execution_space >    matrix_type ;
+
+public:
+
+  Multiply( const matrix_type & A ,
+            const size_type nrow ,
+            const size_type ncol ,
+            const vector_type & x ,
+            const vector_type & y )
+  {
+    CudaSparseSingleton & s = CudaSparseSingleton::singleton();
+    const scalar_type alpha = 1 , beta = 0 ;
+
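+    // cusparseDcsrmv computes y = alpha * A * x + beta * y ;
+    // alpha = 1 and beta = 0 overwrite y with A * x.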
+    cusparseStatus_t status =
+      cusparseDcsrmv( s.handle ,
+                      CUSPARSE_OPERATION_NON_TRANSPOSE ,
+                      nrow , ncol , A.coefficients.dimension_0() ,
+                      &alpha ,
+                      s.descra ,
+                      A.coefficients.ptr_on_device() ,
+                      A.graph.row_map.ptr_on_device() ,
+                      A.graph.entries.ptr_on_device() ,
+                      x.ptr_on_device() ,
+                      &beta ,
+                      y.ptr_on_device() );
+
+    if ( CUSPARSE_STATUS_SUCCESS != status ) {
+      throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
+    }
+  }
+};
+
+
+template<>
+struct Multiply< CrsMatrix<float,Cuda> ,
+                 View<float*,Cuda > ,
+                 View<float*,Cuda > >
+{
+  typedef Cuda                                      execution_space ;
+  typedef execution_space::size_type                    size_type ;
+  typedef float                                     scalar_type ;
+  typedef View< scalar_type* , execution_space >        vector_type ;
+  typedef CrsMatrix< scalar_type , execution_space >    matrix_type ;
+
+public:
+
+  Multiply( const matrix_type & A ,
+            const size_type nrow ,
+            const size_type ncol ,
+            const vector_type & x ,
+            const vector_type & y )
+  {
+    CudaSparseSingleton & s = CudaSparseSingleton::singleton();
+    const scalar_type alpha = 1 , beta = 0 ;
+
+    cusparseStatus_t status =
+      cusparseScsrmv( s.handle ,
+                      CUSPARSE_OPERATION_NON_TRANSPOSE ,
+                      nrow , ncol , A.coefficients.dimension_0() ,
+                      &alpha ,
+                      s.descra ,
+                      A.coefficients.ptr_on_device() ,
+                      A.graph.row_map.ptr_on_device() ,
+                      A.graph.entries.ptr_on_device() ,
+                      x.ptr_on_device() ,
+                      &beta ,
+                      y.ptr_on_device() );
+
+    if ( CUSPARSE_STATUS_SUCCESS != status ) {
+      throw std::runtime_error( std::string("ERROR - cusparseScsrmv " ) );
+    }
+  }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef SPARSELINEARSYSTEM_HPP */
+
diff --git a/packages/kokkos/example/multi_fem/SparseLinearSystemFill.hpp b/packages/kokkos/example/multi_fem/SparseLinearSystemFill.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..980f14ac1b1c31fa72ec1e45e18d7c9f190255fa
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/SparseLinearSystemFill.hpp
@@ -0,0 +1,276 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef SPARSELINEARSYSTEMFILL_HPP
+#define SPARSELINEARSYSTEMFILL_HPP
+
+#include <vector>
+#include <algorithm>
+#include <limits>
+
+#include <FEMesh.hpp>
+#include <SparseLinearSystem.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace HybridFEM {
+
+template< class MatrixType , class MeshType ,
+          class elem_matrices_type ,
+          class elem_vectors_type > struct GatherFill ;
+
+
+template< typename ScalarType ,
+          class    DeviceType ,
+          unsigned ElemNode ,
+          typename CoordScalarType ,
+          class elem_matrices_type ,
+          class elem_vectors_type >
+struct GatherFill< 
+  Kokkos::CrsMatrix< ScalarType , DeviceType > ,
+  FEMesh< CoordScalarType , ElemNode , DeviceType > ,
+  elem_matrices_type , elem_vectors_type >
+{
+  typedef DeviceType     execution_space ;
+  typedef typename execution_space::size_type  size_type ;
+
+  static const size_type ElemNodeCount = ElemNode ;
+
+  typedef Kokkos::CrsMatrix< ScalarType , execution_space >    matrix_type ;
+  typedef typename matrix_type::coefficients_type   coefficients_type ;
+  typedef Kokkos::View< ScalarType[] , execution_space >  vector_type ;
+  typedef Kokkos::View< size_type[][ElemNodeCount][ElemNodeCount] , execution_space >       elem_graph_type ;
+
+  typedef FEMesh< CoordScalarType , ElemNodeCount , execution_space > mesh_type ;
+  typedef typename mesh_type::node_elem_ids_type node_elem_ids_type ;
+
+private:
+
+  node_elem_ids_type  node_elem_ids ;
+  elem_graph_type     elem_graph ;
+  elem_matrices_type  elem_matrices ;
+  elem_vectors_type   elem_vectors ;
+  coefficients_type   system_coeff ;
+  vector_type         system_rhs ;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type irow ) const
+  {
+    const size_type node_elem_begin = node_elem_ids.row_map[irow];
+    const size_type node_elem_end   = node_elem_ids.row_map[irow+1];
+
+    //  for each element that a node belongs to 
+
+    for ( size_type i = node_elem_begin ; i < node_elem_end ; i++ ) {
+
+      const size_type elem_id   = node_elem_ids.entries( i, 0);
+      const size_type row_index = node_elem_ids.entries( i, 1);
+
+      system_rhs(irow) += elem_vectors(elem_id, row_index);
+
+      //  for each node of this related element, gather the element
+      //  stiffness matrix entries that belong in row irow
+
+      for ( size_type j = 0 ; j < ElemNodeCount ; ++j ){
+        const size_type A_index = elem_graph( elem_id , row_index , j );
+
+        system_coeff( A_index ) += elem_matrices( elem_id, row_index, j );
+      }
+    }
+  }
+
+
+  static void apply( const matrix_type & matrix ,
+                     const vector_type & rhs ,
+                     const mesh_type   & mesh ,
+                     const elem_graph_type    & elem_graph ,
+                     const elem_matrices_type & elem_matrices ,
+                     const elem_vectors_type  & elem_vectors )
+  {
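+    // One work item per matrix row: each row gathers its right-hand-side
+    // and stiffness-matrix contributions from every element touching it.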
+    const size_t row_count = matrix.graph.row_map.dimension_0() - 1 ;
+    GatherFill op ;
+    op.node_elem_ids = mesh.node_elem_ids ;
+    op.elem_graph    = elem_graph ;
+    op.elem_matrices = elem_matrices ;
+    op.elem_vectors  = elem_vectors ;
+    op.system_coeff  = matrix.coefficients ;
+    op.system_rhs    = rhs ;
+
+    parallel_for( row_count , op );
+  }
+};
+
+} /* namespace HybridFEM */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace HybridFEM {
+
+template< class GraphType , class MeshType >
+struct GraphFactory {
+  typedef GraphType                         graph_type ;
+  typedef MeshType                          mesh_type ;
+  typedef typename graph_type::execution_space  execution_space ;
+  typedef typename execution_space::size_type   size_type  ;
+
+  static const unsigned ElemNodeCount = mesh_type::element_node_count ;
+
+  typedef Kokkos::View< size_type[][ElemNodeCount][ElemNodeCount] , execution_space >         element_map_type ;
+
+  static
+  void
+  create( const mesh_type & mesh ,
+          graph_type & graph ,
+          element_map_type & elem_map )
+  {
+    typename mesh_type::node_elem_ids_type::HostMirror
+      node_elem_ids = create_mirror( mesh.node_elem_ids );
+
+    typename mesh_type::elem_node_ids_type::HostMirror
+      elem_node_ids = create_mirror( mesh.elem_node_ids );
+
+    typedef typename element_map_type::HostMirror element_map_host_type ;
+
+    deep_copy( elem_node_ids , mesh.elem_node_ids );
+    deep_copy( node_elem_ids.entries , mesh.node_elem_ids.entries );
+
+    const size_t owned_node = mesh.parallel_data_map.count_owned ;
+    const size_t total_elem = mesh.elem_node_ids.dimension_0();
+
+    if ( total_elem ) {
+      elem_map = element_map_type( std::string("element_map"), total_elem );
+    }
+
+    element_map_host_type elem_map_host = create_mirror( elem_map );
+
+    //------------------------------------
+    //  Node->node mapping for the CrsMatrix graph
+
+    std::vector< std::vector< unsigned > > node_node_ids( owned_node );
+    std::vector< unsigned > node_node_begin( owned_node );
+
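+    // For each owned node, collect the nodes of every element containing it,
+    // then sort and remove duplicates to form that row's column list.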
+    size_t offset = 0 ;
+    for ( size_t i = 0 ; i < owned_node ; ++i ) {
+      const size_t j_end = node_elem_ids.row_map[i+1];
+            size_t j     = node_elem_ids.row_map[i];
+
+      node_node_begin[i] = offset ;
+
+      std::vector< unsigned > & work = node_node_ids[i] ;
+
+      for ( ; j < j_end ; ++j ) {
+        const size_t elem_id = node_elem_ids.entries(j,0);
+        for ( size_t k = 0 ; k < ElemNodeCount ; ++k ) {
+          work.push_back( elem_node_ids( elem_id , k ) );
+        }
+      }
+
+      std::sort( work.begin() , work.end() );
+
+      work.erase( std::unique( work.begin() , work.end() ) , work.end() );
+
+      offset += work.size();
+    }
+
+    graph = Kokkos::create_staticcrsgraph< graph_type >( "node_node_ids" , node_node_ids );
+
+    //------------------------------------
+    // ( element , node_row , node_column ) -> matrix_crs_column
+
+    for ( size_t elem_id = 0 ; elem_id < total_elem ; ++elem_id ) {
+      for ( size_t i = 0 ; i < ElemNodeCount ; ++i ) {
+
+        const size_t node_row = elem_node_ids( elem_id , i );
+        const size_t node_row_begin = node_node_begin[ node_row ];
+        const std::vector< unsigned > & column = node_node_ids[ node_row ] ;
+
+        if ( owned_node <= node_row ) {
+          for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) {
+            elem_map_host( elem_id , i , j ) = std::numeric_limits<size_type>::max();
+          }
+        }
+        else {
+
+          for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) {
+            const size_type node_col = elem_node_ids( elem_id , j );
+
+            int col_search = 0 ;
+
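+            // Lower-bound binary search for node_col in the sorted,
+            // de-duplicated column list of this row.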
+            for ( int len = column.size() ; 0 < len ; ) {
+
+              const int half = len >> 1;
+              const int middle = col_search + half ;
+
+              if ( column[middle] < node_col ){
+                col_search = middle + 1 ;
+                len -= half + 1 ;
+              }
+              else {
+                len = half ;
+              }
+            }
+            if ( node_col != column[col_search] ) {
+              throw std::runtime_error(std::string("Failed"));
+            }
+            elem_map_host( elem_id , i , j ) = col_search + node_row_begin ;
+          }
+        }
+      }
+    }
+
+    deep_copy( elem_map , elem_map_host );
+  }
+};
+
+} // namespace HybridFEM
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef SPARSELINEARSYSTEMFILL_HPP */
+
diff --git a/packages/kokkos/example/multi_fem/SparseLinearSystem_Cuda.hpp b/packages/kokkos/example/multi_fem/SparseLinearSystem_Cuda.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8dfae4758d0dcf9e081dded8f80ff09e37f25363
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/SparseLinearSystem_Cuda.hpp
@@ -0,0 +1,164 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef SPARSELINEARSYSTEM_CUDA_HPP
+#define SPARSELINEARSYSTEM_CUDA_HPP
+
+#if defined( BUILD_FROM_CU_FILE )
+
+#include <cusparse_v2.h>
+#include <Kokkos_Core.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+
+struct CudaSparseSingleton {
+  cusparseHandle_t   handle;
+  cusparseMatDescr_t descra;
+
+  CudaSparseSingleton()
+  {
+    cusparseCreate( & handle );
+    cusparseCreateMatDescr( & descra );
+    cusparseSetMatType(       descra , CUSPARSE_MATRIX_TYPE_GENERAL );
+    cusparseSetMatIndexBase(  descra , CUSPARSE_INDEX_BASE_ZERO );
+  }
+
+  static CudaSparseSingleton & singleton();
+
+};
+
+CudaSparseSingleton & CudaSparseSingleton::singleton()
+{ static CudaSparseSingleton s ; return s ; }
+
+
+template<>
+struct Multiply< CrsMatrix<double,Cuda> ,
+                 View<double*,Cuda > ,
+                 View<double*,Cuda > >
+{
+  typedef Cuda                                      execution_space ;
+  typedef execution_space::size_type                    size_type ;
+  typedef double                                    scalar_type ;
+  typedef View< scalar_type* , execution_space >        vector_type ;
+  typedef CrsMatrix< scalar_type , execution_space >    matrix_type ;
+
+public:
+
+  Multiply( const matrix_type & A ,
+            const size_type nrow ,
+            const size_type ncol ,
+            const vector_type & x ,
+            const vector_type & y )
+  {
+    CudaSparseSingleton & s = CudaSparseSingleton::singleton();
+    const scalar_type alpha = 1 , beta = 0 ;
+
+    cusparseStatus_t status =
+      cusparseDcsrmv( s.handle ,
+                      CUSPARSE_OPERATION_NON_TRANSPOSE ,
+                      nrow , ncol , A.coefficients.dimension_0() ,
+                      &alpha ,
+                      s.descra ,
+                      A.coefficients.ptr_on_device() ,
+                      A.graph.row_map.ptr_on_device() ,
+                      A.graph.entries.ptr_on_device() ,
+                      x.ptr_on_device() ,
+                      &beta ,
+                      y.ptr_on_device() );
+
+    if ( CUSPARSE_STATUS_SUCCESS != status ) {
+      throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
+    }
+  }
+};
+
+
+template<>
+struct Multiply< CrsMatrix<float,Cuda> ,
+                 View<float*,Cuda > ,
+                 View<float*,Cuda > >
+{
+  typedef Cuda                                      execution_space ;
+  typedef execution_space::size_type                    size_type ;
+  typedef float                                     scalar_type ;
+  typedef View< scalar_type* , execution_space >        vector_type ;
+  typedef CrsMatrix< scalar_type , execution_space >    matrix_type ;
+
+public:
+
+  Multiply( const matrix_type & A ,
+            const size_type nrow ,
+            const size_type ncol ,
+            const vector_type & x ,
+            const vector_type & y )
+  {
+    CudaSparseSingleton & s = CudaSparseSingleton::singleton();
+    const scalar_type alpha = 1 , beta = 0 ;
+
+    cusparseStatus_t status =
+      cusparseScsrmv( s.handle ,
+                      CUSPARSE_OPERATION_NON_TRANSPOSE ,
+                      nrow , ncol , A.coefficients.dimension_0() ,
+                      &alpha ,
+                      s.descra ,
+                      A.coefficients.ptr_on_device() ,
+                      A.graph.row_map.ptr_on_device() ,
+                      A.graph.entries.ptr_on_device() ,
+                      x.ptr_on_device() ,
+                      &beta ,
+                      y.ptr_on_device() );
+
+    if ( CUSPARSE_STATUS_SUCCESS != status ) {
+      throw std::runtime_error( std::string("ERROR - cusparseScsrmv " ) );
+    }
+  }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#endif /* #if defined( BUILD_FROM_CU_FILE ) */
+#endif /* #ifndef SPARSELINEARSYSTEM_CUDA_HPP */
+
diff --git a/packages/kokkos/example/multi_fem/TestBoxMeshFixture.hpp b/packages/kokkos/example/multi_fem/TestBoxMeshFixture.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9903c998fe63aece380cb34f5cb23d0b7440a4a0
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/TestBoxMeshFixture.hpp
@@ -0,0 +1,242 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef TESTFEMESHBOXFIXTURE_HPP
+#define TESTFEMESHBOXFIXTURE_HPP
+
+#include <cstdio>
+#include <iostream>
+#include <stdexcept>
+#include <limits>
+#include <utility>
+#include <BoxMeshFixture.hpp>
+
+#include <ParallelComm.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace TestFEMesh {
+
+template< class ViewType >
+struct VerifyUnpack  ;
+
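+// Reduction functor: compare unpacked ghost-node coordinates against the
+// locally stored node coordinates and count any mismatches.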
+template< typename DeviceType, typename T >
+struct VerifyUnpack< Kokkos::View< T*[3] , DeviceType > >
+{
+  typedef DeviceType     execution_space ;
+  typedef typename execution_space::size_type  size_type ;
+  typedef size_type               value_type ;
+
+  typedef Kokkos::View< T* ,    execution_space > buffer_type ;
+  typedef Kokkos::View< T*[3] , execution_space > array_type ;
+
+private:
+
+  array_type  node_coords ;
+  buffer_type buffer ;
+  size_type   node_begin ;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  { update = 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & source )
+  { update += source ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i , value_type & update ) const
+  {
+    const size_type node_id = i + node_begin ;
+    const size_type k = i * 3 ;
+
+    const long xb = buffer[k];
+    const long yb = buffer[k+1];
+    const long zb = buffer[k+2];
+    const long xn = node_coords(node_id,0);
+    const long yn = node_coords(node_id,1);
+    const long zn = node_coords(node_id,2);
+
+    if ( xb != xn || yb != yn || zb != zn ) {
+      printf("TestFEMesh::VerifyUnpack failed at %d : node %d : { %ld %ld %ld } != { %ld %ld %ld }\n",
+             (int)i,(int)node_id, xb,yb,zb, xn, yn, zn );
+      ++update ;
+    }
+  }
+
+  static inline
+  size_type unpack( const array_type  & arg_node_coords ,
+                    const size_type     arg_node_begin ,
+                    const size_type     arg_node_count ,
+                    const buffer_type & arg_buffer )
+  {
+    VerifyUnpack op ;
+    op.node_coords = arg_node_coords ;
+    op.buffer      = arg_buffer ;
+    op.node_begin  = arg_node_begin ;
+    size_type count = 0 ;
+    Kokkos::parallel_reduce( arg_node_count , op , count );
+    return count ;
+  }
+};
+
+}
+
+//----------------------------------------------------------------------------
+
+#ifdef KOKKOS_ENABLE_MPI
+
+namespace TestFEMesh {
+
+template< typename coordinate_scalar_type ,
+          unsigned ElemNodeCount ,
+          class Device >
+void verify_parallel(
+  const HybridFEM::FEMesh< coordinate_scalar_type ,
+                           ElemNodeCount ,
+                           Device > & mesh )
+{
+  typedef HybridFEM::FEMesh< coordinate_scalar_type, ElemNodeCount, Device > femesh_type ;
+  typedef typename femesh_type::node_coords_type node_coords_type ;
+
+  comm::Machine machine = mesh.parallel_data_map.machine ;
+
+  // Communicate node coordinates to verify communication and setup.
+
+  const size_t chunk_size = 3 ;
+
+  Kokkos::AsyncExchange< coordinate_scalar_type, Device, Kokkos::ParallelDataMap >
+    exchange( mesh.parallel_data_map , chunk_size );
+
+  const size_t send_begin = mesh.parallel_data_map.count_interior ;
+  const size_t send_count = mesh.parallel_data_map.count_send ;
+
+  const size_t recv_begin = mesh.parallel_data_map.count_owned ;
+  const size_t recv_count = mesh.parallel_data_map.count_receive ;
+
+  typedef Kokkos::PackArray< node_coords_type > pack_type ;
+
+  pack_type::pack( exchange.buffer(), send_begin, send_count, mesh.node_coords );
+
+  exchange.setup();
+
+  // Launch local-action device kernels
+
+  exchange.send_receive();
+
+  unsigned long local[3] ;
+  local[0] = mesh.parallel_data_map.count_owned ;
+  local[1] = mesh.parallel_data_map.count_receive ;
+  local[2] = TestFEMesh::VerifyUnpack< node_coords_type >::unpack( mesh.node_coords, recv_begin, recv_count, exchange.buffer() );
+
+  unsigned long global[3] = { 0 , 0 , 0 };
+
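+  // Sum the owned-node, received-node, and mismatch counts over all ranks.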
+  MPI_Allreduce( local , global ,
+                 3 , MPI_UNSIGNED_LONG , MPI_SUM , machine.mpi_comm );
+
+  if ( 0 == comm::rank( machine ) ) {
+    std::cout << ( global[2] ? "FAILED" : "PASSED" )
+              << ": TestFEMesh::verify_parallel "
+              << "NP(" << comm::size( machine )
+              << ") total_node(" << global[0]
+              << ") verified_nodes(" << global[1]
+              << ") failed_nodes(" << global[2]
+              << ")" << std::endl ;
+  }
+}
+
+} // namespace TestFEMesh
+
+#else /* ! #ifdef KOKKOS_ENABLE_MPI */
+
+namespace TestFEMesh {
+
+template< typename coordinate_scalar_type ,
+          unsigned ElemNodeCount ,
+          class Device >
+void verify_parallel(
+  const HybridFEM::FEMesh< coordinate_scalar_type ,
+                           ElemNodeCount ,
+                           Device > & )
+{}
+
+} // namespace TestFEMesh
+
+#endif /* ! #ifdef KOKKOS_ENABLE_MPI */
+
+//----------------------------------------------------------------------------
+
+template< class Device >
+void test_box_fixture( comm::Machine machine ,
+                       const size_t gang_count ,
+                       const size_t nodes_nx ,
+                       const size_t nodes_ny ,
+                       const size_t nodes_nz )
+{
+  typedef long                coordinate_scalar_type ;
+  typedef FixtureElementHex8  fixture_element_type ;
+
+  typedef BoxMeshFixture< coordinate_scalar_type ,
+                          Device ,
+                          fixture_element_type > fixture_type ;
+
+  typedef typename fixture_type::FEMeshType  mesh_type ;
+
+  const size_t proc_count = comm::size( machine );
+  const size_t proc_local = comm::rank( machine ) ;
+
+  mesh_type mesh =
+    fixture_type::create( proc_count , proc_local , gang_count ,
+                          nodes_nx - 1 , nodes_ny - 1 , nodes_nz - 1 );
+
+  mesh.parallel_data_map.machine = machine ;
+
+  TestFEMesh::verify_parallel( mesh );
+}
+
+#endif /* #ifndef TESTFEMESHBOXFIXTURE_HPP */
+
+
diff --git a/packages/kokkos/example/multi_fem/TestBoxMeshPartition.cpp b/packages/kokkos/example/multi_fem/TestBoxMeshPartition.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..93778c85841dfa81a62354b9ff6598e86b235e3d
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/TestBoxMeshPartition.cpp
@@ -0,0 +1,172 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#include <iostream>
+#include <stdexcept>
+#include <limits>
+#include <utility>
+#include <BoxMeshPartition.hpp>
+
+//----------------------------------------------------------------------------
+
+void test_box_partition( bool print )
+{
+  const size_t np_max = 10000 ;
+
+  const BoxBoundsLinear use_box ;
+
+  BoxType root_box ;
+
+  root_box[0][0] = 0 ; root_box[0][1] = 100 ;
+  root_box[1][0] = 0 ; root_box[1][1] = 200 ;
+  root_box[2][0] = 0 ; root_box[2][1] = 300 ;
+
+  const size_t cell_total =
+    ( root_box[0][1] - root_box[0][0] ) *
+    ( root_box[1][1] - root_box[1][0] ) *
+    ( root_box[2][1] - root_box[2][0] );
+
+  for ( size_t np = 2 ; np < np_max ; np = 2 * ( np + 1 ) ) {
+
+    std::vector<BoxType> part_boxes( np );
+
+    box_partition_rcb( root_box , part_boxes );
+
+    size_t cell_goal = ( cell_total + np - 1 ) / np ;
+    size_t cell_max = 0 ;
+
+    for ( size_t i = 0 ; i < np ; ++i ) {
+      cell_max = std::max( cell_max , count( part_boxes[i] ) );
+    }
+
+    if ( print ) {
+      std::cout << std::endl
+                << "box_part( " << np 
+                << " ) max( " << cell_max
+                << " ) goal( " << cell_goal
+                << " ) ratio( " << double(cell_max) / double(cell_goal)
+                << " )" << std::endl ;
+    }
+
+    const size_t nsample = std::min(np,(size_t)4);
+    const size_t stride = ( np + nsample - 1 ) / nsample ;
+
+    for ( size_t my_part = 0 ; my_part < np ; my_part += stride ) {
+      BoxType             my_use_box ;
+      std::vector<size_t> my_use_id_map ;
+      size_t              my_count_interior ;
+      size_t              my_count_owned ;
+      size_t              my_count_uses ;
+      std::vector<size_t> my_recv_counts ;
+      std::vector<std::vector<size_t> > my_send_map ;
+
+      size_t count_verify = 0 ;
+
+      box_partition_maps( root_box , part_boxes ,
+                          use_box , my_part ,
+                          my_use_box , my_use_id_map ,
+                          my_count_interior ,
+                          my_count_owned ,
+                          my_count_uses ,
+                          my_recv_counts ,
+                          my_send_map );
+
+      count_verify = my_count_owned ;
+
+      if ( print ) {
+        std::cout << "  my_part(" << my_part << ") layout { "
+                  << "P" << my_part
+                  << "(" << my_count_interior
+                  << "," << ( my_count_owned - my_count_interior )
+                  << ")" ;
+      }
+
+      for ( size_t i = 1 ; i < np ; ++i ) {
+        if ( my_recv_counts[i] ) {
+          count_verify += my_recv_counts[i] ;
+          const size_t ip = ( my_part + i ) % np ;
+
+          if ( print ) {
+            std::cout << " P" << ip << "(" << my_recv_counts[i] << ")" ;
+          }
+
+          // Compare recv & send lists
+
+          BoxType             ip_use_box ;
+          std::vector<size_t> ip_use_id_map ;
+          size_t              ip_count_interior ;
+          size_t              ip_count_owned ;
+          size_t              ip_count_uses ;
+          std::vector<size_t> ip_recv_counts ;
+          std::vector<std::vector<size_t> > ip_send_map ;
+
+          box_partition_maps( root_box , part_boxes ,
+                              use_box , ip ,
+                              ip_use_box , ip_use_id_map ,
+                              ip_count_interior ,
+                              ip_count_owned ,
+                              ip_count_uses ,
+                              ip_recv_counts ,
+                              ip_send_map );
+
+          // Sent by ip, received by my_part:
+
+          const BoxType recv_send = intersect( part_boxes[ip] , my_use_box );
+          const size_t recv_send_count = count( recv_send );
+
+          const size_t j = ( my_part + np - ip ) % np ;
+
+          if ( recv_send_count != my_recv_counts[i] ||
+               recv_send_count != ip_send_map[j].size() ) {
+            throw std::runtime_error( std::string("bad recv/send map") );
+          }
+        }
+      }
+      if ( print ) { std::cout << " }" << std::endl ; }
+
+      if ( count_verify != my_count_uses ) {
+        throw std::runtime_error( std::string("bad partition map") );
+      }
+    }
+  }
+}
+
+
diff --git a/packages/kokkos/example/multi_fem/TestCuda.cpp b/packages/kokkos/example/multi_fem/TestCuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4b6b739050a2394c84577f52967413c69595b7ca
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/TestCuda.cpp
@@ -0,0 +1,192 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#include <TestBoxMeshFixture.hpp>
+#include <Implicit.hpp>
+#include <Nonlinear.hpp>
+#include <Explicit.hpp>
+
+#include <SparseLinearSystem.hpp>
+
+#if defined( KOKKOS_ENABLE_CUDA )
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+CudaSparseSingleton & CudaSparseSingleton::singleton()
+{ static CudaSparseSingleton s ; return s ; }
+
+}
+}
+
+//----------------------------------------------------------------------------
+
+void test_cuda_query( comm::Machine machine )
+{
+  const size_t comm_rank = comm::rank( machine );
+  std::cout << "P" << comm_rank
+            << ": Cuda device_count = "
+            << Kokkos::Cuda::detect_device_count()
+            << std::endl ;
+}
+
+//----------------------------------------------------------------------------
+
+void test_cuda_fixture( comm::Machine machine ,
+                        size_t nx , size_t ny , size_t nz )
+{
+  const size_t comm_rank = comm::rank( machine );
+  const size_t comm_size = comm::size( machine );
+  const size_t dev_count = Kokkos::Cuda::detect_device_count();
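+  // Round-robin device selection: rank r uses device r % dev_count when
+  // there are at least as many ranks as devices, otherwise device 0.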
+  const size_t dev_rank =
+    dev_count && dev_count <= comm_size ? comm_rank % dev_count : 0 ;
+  const size_t gang_count = 0 ;
+
+  Kokkos::HostSpace::execution_space::initialize();
+  Kokkos::Cuda::SelectDevice select_device( dev_rank );
+  Kokkos::Cuda::initialize( select_device );
+  test_box_fixture<Kokkos::Cuda>( machine , gang_count , nx , ny , nz );
+  Kokkos::Cuda::finalize();
+  Kokkos::HostSpace::execution_space::finalize();
+}
+
+//----------------------------------------------------------------------------
+
+void test_cuda_implicit( comm::Machine machine , 
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run )
+{
+  const size_t comm_rank = comm::rank( machine );
+  const size_t comm_size = comm::size( machine );
+  const size_t dev_count = Kokkos::Cuda::detect_device_count();
+  const size_t dev_rank =
+    dev_count && dev_count <= comm_size ? comm_rank % dev_count : 0 ;
+  const size_t gang_count = 0 ;
+
+  Kokkos::HostSpace::execution_space::initialize();
+  Kokkos::Cuda::SelectDevice select_device( dev_rank );
+  Kokkos::Cuda::initialize( select_device );
+  HybridFEM::Implicit::driver<double,Kokkos::Cuda>( "Cuda" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+  Kokkos::Cuda::finalize();
+  Kokkos::HostSpace::execution_space::finalize();
+}
+
+//----------------------------------------------------------------------------
+
+void test_cuda_explicit( comm::Machine machine , 
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run )
+{
+  const size_t comm_rank = comm::rank( machine );
+  const size_t comm_size = comm::size( machine );
+  const size_t dev_count = Kokkos::Cuda::detect_device_count();
+  const size_t dev_rank =
+    dev_count && dev_count <= comm_size ? comm_rank % dev_count : 0 ;
+  const size_t gang_count = 0 ;
+
+  Kokkos::HostSpace::execution_space::initialize();
+  Kokkos::Cuda::SelectDevice select_device( dev_rank );
+  Kokkos::Cuda::initialize( select_device );
+  Explicit::driver<double,Kokkos::Cuda>( "Cuda" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+  Kokkos::Cuda::finalize();
+  Kokkos::HostSpace::execution_space::finalize();
+}
+
+//----------------------------------------------------------------------------
+
+void test_cuda_nonlinear( comm::Machine machine , 
+                          size_t elem_count_begin ,
+                          size_t elem_count_end ,
+                          size_t count_run )
+{
+  const size_t comm_rank = comm::rank( machine );
+  const size_t comm_size = comm::size( machine );
+  const size_t dev_count = Kokkos::Cuda::detect_device_count();
+  const size_t dev_rank =
+    dev_count && dev_count <= comm_size ? comm_rank % dev_count : 0 ;
+  const size_t gang_count = 0 ;
+
+  Kokkos::HostSpace::execution_space::initialize();
+  Kokkos::Cuda::SelectDevice select_device( dev_rank );
+  Kokkos::Cuda::initialize( select_device );
+
+  typedef Kokkos::Cuda device ;
+  typedef FixtureElementHex8 hex8 ;
+  HybridFEM::Nonlinear::driver<double,device,hex8>( "Cuda" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+  Kokkos::Cuda::finalize();
+  Kokkos::HostSpace::execution_space::finalize();
+}
+
+void test_cuda_nonlinear_quadratic( comm::Machine machine , 
+                                    size_t elem_count_begin ,
+                                    size_t elem_count_end ,
+                                    size_t count_run )
+{
+  const size_t comm_rank = comm::rank( machine );
+  const size_t comm_size = comm::size( machine );
+  const size_t dev_count = Kokkos::Cuda::detect_device_count();
+  const size_t dev_rank =
+    dev_count && dev_count <= comm_size ? comm_rank % dev_count : 0 ;
+  const size_t gang_count = 0 ;
+
+  Kokkos::HostSpace::execution_space::initialize();
+  Kokkos::Cuda::SelectDevice select_device( dev_rank );
+  Kokkos::Cuda::initialize( select_device );
+
+  typedef Kokkos::Cuda device ;
+  typedef FixtureElementHex27 hex27 ;
+  HybridFEM::Nonlinear::driver<double,device,hex27>( "Cuda" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+  Kokkos::Cuda::finalize();
+  Kokkos::HostSpace::execution_space::finalize();
+}
+
+//----------------------------------------------------------------------------
+
+#endif  /* #if defined( KOKKOS_ENABLE_CUDA ) */
+
diff --git a/packages/kokkos/example/multi_fem/TestHost.cpp b/packages/kokkos/example/multi_fem/TestHost.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d12ac837ca79b29e09f7fbb4d2ff3b1c70de9a80
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/TestHost.cpp
@@ -0,0 +1,137 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+// Must be included first on Intel-Phi systems due to
+// redefinition of SEEK_SET in <mpi.h>.
+
+#include <ParallelComm.hpp>
+
+#include <iostream>
+#include <stdexcept>
+#include <limits>
+#include <utility>
+
+//----------------------------------------------------------------------------
+
+#include <Kokkos_Core.hpp>
+
+#include <BoxMeshFixture.hpp>
+#include <TestBoxMeshFixture.hpp>
+#include <Implicit.hpp>
+#include <Nonlinear.hpp>
+#include <Explicit.hpp>
+#include <SparseLinearSystem.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+void test_host_fixture( comm::Machine machine ,
+                        size_t gang_count ,
+                        size_t gang_worker_count ,
+                        size_t nx , size_t ny , size_t nz )
+{
+  Kokkos::HostSpace::execution_space::initialize( gang_count * gang_worker_count );
+  test_box_fixture<Kokkos::HostSpace::execution_space>( machine , gang_count , nx , ny , nz );
+  Kokkos::HostSpace::execution_space::finalize();
+}
+
+//----------------------------------------------------------------------------
+
+void test_host_implicit( comm::Machine machine ,
+                         size_t gang_count ,
+                         size_t gang_worker_count ,
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run )
+{
+  Kokkos::HostSpace::execution_space::initialize( gang_count * gang_worker_count );
+  HybridFEM::Implicit::driver<double,Kokkos::HostSpace::execution_space>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+  Kokkos::HostSpace::execution_space::finalize();
+}
+
+//----------------------------------------------------------------------------
+
+void test_host_explicit( comm::Machine machine ,
+                         size_t gang_count ,
+                         size_t gang_worker_count ,
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run )
+{
+  Kokkos::HostSpace::execution_space::initialize( gang_count * gang_worker_count );
+  Explicit::driver<double,Kokkos::HostSpace::execution_space>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+  Kokkos::HostSpace::execution_space::finalize();
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+void test_host_nonlinear( comm::Machine machine ,
+                          size_t gang_count ,
+                          size_t gang_worker_count ,
+                          size_t elem_count_begin ,
+                          size_t elem_count_end ,
+                          size_t count_run )
+{
+  Kokkos::HostSpace::execution_space::initialize( gang_count * gang_worker_count );
+  typedef FixtureElementHex8 hex8 ;
+  typedef Kokkos::HostSpace::execution_space             device ;
+  HybridFEM::Nonlinear::driver<double,device,hex8>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+  Kokkos::HostSpace::execution_space::finalize();
+}
+
+void test_host_nonlinear_quadratic( comm::Machine machine ,
+                                    size_t gang_count ,
+                                    size_t gang_worker_count ,
+                                    size_t elem_count_begin ,
+                                    size_t elem_count_end ,
+                                    size_t count_run )
+{
+  Kokkos::HostSpace::execution_space::initialize( gang_count * gang_worker_count );
+  typedef FixtureElementHex27 hex27 ;
+  typedef Kokkos::HostSpace::execution_space              device ;
+  HybridFEM::Nonlinear::driver<double,device,hex27>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+  Kokkos::HostSpace::execution_space::finalize();
+}
+
+//----------------------------------------------------------------------------
+
+
diff --git a/packages/kokkos/example/multi_fem/TestHybridFEM.cpp b/packages/kokkos/example/multi_fem/TestHybridFEM.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0be4e69973a6daed75cc5a110483390ecc227889
--- /dev/null
+++ b/packages/kokkos/example/multi_fem/TestHybridFEM.cpp
@@ -0,0 +1,348 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+// Must be included first on Intel-Phi systems due to
+// redefinition of SEEK_SET in <mpi.h>.
+
+#include <ParallelComm.hpp>
+
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <Kokkos_hwloc.hpp>
+
+//----------------------------------------------------------------------------
+
+void test_box_partition( bool print );
+
+//----------------------------------------------------------------------------
+
+void test_host_fixture( comm::Machine machine ,
+                        size_t gang_count ,
+                        size_t gang_worker_count ,
+                        size_t nx , size_t ny , size_t nz );
+
+void test_host_implicit( comm::Machine machine ,
+                         size_t gang_count ,
+                         size_t gang_worker_count ,
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run );
+
+void test_host_explicit( comm::Machine machine ,
+                         size_t gang_count ,
+                         size_t gang_worker_count ,
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run );
+
+void test_host_nonlinear( comm::Machine machine ,
+                          size_t gang_count ,
+                          size_t gang_worker_count ,
+                          size_t elem_count_begin ,
+                          size_t elem_count_end ,
+                          size_t count_run );
+
+void test_host_nonlinear_quadratic( comm::Machine machine ,
+                                    size_t gang_count ,
+                                    size_t gang_worker_count ,
+                                    size_t elem_count_begin ,
+                                    size_t elem_count_end ,
+                                    size_t count_run );
+
+
+//----------------------------------------------------------------------------
+
+void test_cuda_query( comm::Machine );
+
+void test_cuda_fixture( comm::Machine machine ,
+                        size_t nx , size_t ny , size_t nz );
+
+void test_cuda_implicit( comm::Machine machine ,
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run );
+
+void test_cuda_explicit( comm::Machine machine ,
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run );
+
+void test_cuda_nonlinear( comm::Machine machine ,
+                          size_t elem_count_begin ,
+                          size_t elem_count_end ,
+                          size_t count_run );
+
+void test_cuda_nonlinear_quadratic( comm::Machine machine ,
+                                    size_t elem_count_begin ,
+                                    size_t elem_count_end ,
+                                    size_t count_run );
+
+
+//----------------------------------------------------------------------------
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace {
+
+bool run_host( std::istream & input ,
+               comm::Machine machine ,
+               const size_t host_gang_count ,
+               const size_t host_gang_worker_count )
+{
+  bool cmd_error = false ;
+
+  std::string which ; input >> which ;
+
+  if ( which == std::string("fixture") ) {
+
+    size_t nx = 0 , ny = 0 , nz = 0 ;
+    input >> nx >> ny >> nz ;
+    test_host_fixture( machine , host_gang_count , host_gang_worker_count , nx , ny , nz );
+
+  }
+  else if ( which == std::string("explicit") ) {
+
+    size_t mesh_node_begin = 100 ;
+    size_t mesh_node_end   = 300 ;
+    size_t run             =   1 ;
+    input >> mesh_node_begin >> mesh_node_end >> run ;
+    test_host_explicit( machine , host_gang_count , host_gang_worker_count , mesh_node_begin , mesh_node_end , run );
+
+  }
+  else if ( which == std::string("implicit") ) {
+
+    size_t mesh_node_begin = 100 ;
+    size_t mesh_node_end   = 300 ;
+    size_t run             =   1 ;
+    input >> mesh_node_begin >> mesh_node_end >> run ;
+    test_host_implicit( machine , host_gang_count , host_gang_worker_count , mesh_node_begin , mesh_node_end , run );
+
+  }
+  else if ( which == std::string("nonlinear") ) {
+
+    size_t mesh_node_begin = 100 ;
+    size_t mesh_node_end   = 300 ;
+    size_t run             =   1 ;
+    input >> mesh_node_begin >> mesh_node_end >> run ;
+    test_host_nonlinear( machine , host_gang_count , host_gang_worker_count , mesh_node_begin , mesh_node_end , run );
+
+  }
+  else if ( which == std::string("nonlinear_quadratic") ) {
+
+    size_t mesh_node_begin = 100 ;
+    size_t mesh_node_end   = 300 ;
+    size_t run             =   1 ;
+    input >> mesh_node_begin >> mesh_node_end >> run ;
+    test_host_nonlinear_quadratic( machine , host_gang_count , host_gang_worker_count , mesh_node_begin , mesh_node_end , run );
+
+  }
+  else {
+    cmd_error = true ;
+  }
+
+  return cmd_error ;
+}
+
+#if defined( KOKKOS_ENABLE_CUDA )
+bool run_cuda( std::istream & input , comm::Machine machine )
+{
+  bool cmd_error = false ;
+
+  std::string which ; input >> which ;
+
+  if ( which == std::string("fixture") ) {
+
+    size_t nx = 0 , ny = 0 , nz = 0 ;
+    input >> nx >> ny >> nz ;
+    test_cuda_fixture( machine , nx , ny , nz );
+
+  }
+  else if ( which == std::string("explicit") ) {
+
+    size_t mesh_node_begin = 100 ;
+    size_t mesh_node_end   = 300 ;
+    size_t run             =   1 ;
+    input >> mesh_node_begin >> mesh_node_end >> run ;
+    test_cuda_explicit( machine , mesh_node_begin , mesh_node_end , run );
+
+  }
+  else if ( which == std::string("implicit") ) {
+
+    size_t mesh_node_begin = 100 ;
+    size_t mesh_node_end   = 300 ;
+    size_t run             =   1 ;
+    input >> mesh_node_begin >> mesh_node_end >> run ;
+    test_cuda_implicit( machine , mesh_node_begin , mesh_node_end , run );
+
+  }
+  else if ( which == std::string("nonlinear") ) {
+
+    size_t mesh_node_begin = 100 ;
+    size_t mesh_node_end   = 300 ;
+    size_t run             =   1 ;
+    input >> mesh_node_begin >> mesh_node_end >> run ;
+    test_cuda_nonlinear( machine , mesh_node_begin , mesh_node_end , run );
+
+  }
+  else if ( which == std::string("nonlinear_quadratic") ) {
+
+    size_t mesh_node_begin = 100 ;
+    size_t mesh_node_end   = 300 ;
+    size_t run             =   1 ;
+    input >> mesh_node_begin >> mesh_node_end >> run ;
+    test_cuda_nonlinear_quadratic( machine , mesh_node_begin , mesh_node_end , run );
+
+  }
+  else {
+    cmd_error = true ;
+  }
+
+  return cmd_error ;
+}
+#endif
+
+void run( const std::string & argline , comm::Machine machine )
+{
+  const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+  const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+  const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+  std::istringstream input( argline );
+
+  bool cmd_error = false ;
+
+  std::string which ; input >> which ;
+
+  if ( which == std::string("query") ) {
+    std::cout << "P" << comm::rank( machine )
+              << ": hwloc { NUMA[" << numa_count << "]"
+              << " CORE[" << cores_per_numa << "]"
+              << " PU[" << threads_per_core << "] }"
+              << std::endl ;
+#if defined( KOKKOS_ENABLE_CUDA )
+    test_cuda_query( machine );
+#endif
+  }
+  else if ( which == std::string("partition") ) {
+    if ( 0 == comm::rank( machine ) ) {
+      test_box_partition( false /* print flag */ );
+    }
+  }
+  else {
+    if ( which == std::string("host") ) {
+      size_t host_gang_count = 0 ;
+      size_t host_gang_worker_count = 1 ;
+
+      input >> host_gang_count ;
+      input >> host_gang_worker_count ;
+
+      cmd_error = run_host( input , machine , host_gang_count , host_gang_worker_count );
+    }
+    else if ( which == std::string("host-all") ) {
+      size_t host_gang_count        = numa_count ;
+      size_t host_gang_worker_count = cores_per_numa * threads_per_core ;
+
+      cmd_error = run_host( input , machine , host_gang_count , host_gang_worker_count );
+    }
+    else if ( which == std::string("host-most") ) {
+      size_t host_gang_count        = numa_count ;
+      size_t host_gang_worker_count = ( cores_per_numa - 1 ) * threads_per_core ;
+
+      cmd_error = run_host( input , machine , host_gang_count , host_gang_worker_count );
+    }
+#if defined( KOKKOS_ENABLE_CUDA )
+    else if ( which == std::string("cuda") ) {
+      cmd_error = run_cuda( input , machine );
+    }
+#endif
+    else {
+      cmd_error = true ;
+    }
+  }
+
+  if ( cmd_error && 0 == comm::rank( machine ) ) {
+    std::cout << "Expecting command line with" << std::endl
+              << "    query" << std::endl
+              << "    partition" << std::endl
+              << "    host NumNumaNode NumThreadPerNode <test>" << std::endl
+              << "    host-all <test>" << std::endl
+              << "    host-most <test>" << std::endl
+              << "    cuda <test>" << std::endl
+              << "where <test> is" << std::endl
+              << "    fixture   NumElemX NumElemY NumElemZ" << std::endl
+              << "    implicit  NumElemBegin NumElemEnd NumRun" << std::endl
+              << "    explicit  NumElemBegin NumElemEnd NumRun" << std::endl
+              << "    nonlinear NumElemBegin NumElemEnd NumRun" << std::endl
+              << "    nonlinear_quadratic NumElemBegin NumElemEnd NumRun" << std::endl ;
+
+  }
+}
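+
+// Illustrative examples of command lines accepted by run() above, derived
+// from its usage message:
+//
+//   query
+//   host 2 8 fixture 64 64 64    (2 gangs of 8 workers, 64 x 64 x 64 mesh)
+//   cuda nonlinear 100 300 5     (element counts 100 to 300, 5 runs)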
+
+} // namespace
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+int main( int argc , char ** argv )
+{
+  comm::Machine machine = comm::Machine::init( & argc , & argv );
+
+  const unsigned comm_rank = comm::rank( machine );
+
+  const std::string argline = comm::command_line( machine , argc , argv );
+
+  try {
+    run( argline , machine );
+  }
+  catch( const std::exception & x ) {
+    std::cerr << "P" << comm_rank << " throw: " << x.what() << std::endl ;
+  }
+  catch( ... ) {
+    std::cerr << "P" << comm_rank << " throw: unknown exception" << std::endl ;
+  }
+
+  comm::Machine::finalize();
+
+  return 0 ;
+}
+
diff --git a/packages/kokkos/example/query_device/CMakeLists.txt b/packages/kokkos/example/query_device/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dade7f01fef5c935ab3e11bcffc5722ed4b9d1d5
--- /dev/null
+++ b/packages/kokkos/example/query_device/CMakeLists.txt
@@ -0,0 +1,14 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+SET(SOURCES "")
+
+FILE(GLOB SOURCES *.cpp)
+
+TRIBITS_ADD_EXECUTABLE(
+  query_device
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  )
+
diff --git a/packages/kokkos/example/query_device/Makefile b/packages/kokkos/example/query_device/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..42b376ec7c5cf73537bf2d49340ce1ca963e3ad1
--- /dev/null
+++ b/packages/kokkos/example/query_device/Makefile
@@ -0,0 +1,46 @@
+KOKKOS_PATH ?= ../..
+
+MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+SRC_DIR := $(dir $(MAKEFILE_PATH))
+
+SRC = $(wildcard $(SRC_DIR)/*.cpp)
+OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
+
+#SRC = $(wildcard *.cpp)
+#OBJ = $(SRC:%.cpp=%.o)
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
+  EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
+else
+  CXX = g++
+  EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
+endif
+
+CXXFLAGS = -O3 -I$(SRC_DIR)
+LINK ?= $(CXX)
+LDFLAGS ?=
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+DEPFLAGS = -M
+
+LIB =
+
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: 
+	rm -f *.a *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
diff --git a/packages/kokkos/example/query_device/query_device.cpp b/packages/kokkos/example/query_device/query_device.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7dd2e0c7ebdc5bb7d5c2db8856213a792a5a526b
--- /dev/null
+++ b/packages/kokkos/example/query_device/query_device.cpp
@@ -0,0 +1,100 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <iostream>
+#include <sstream>
+
+#include <Kokkos_Macros.hpp>
+
+#if defined( KOKKOS_ENABLE_MPI )
+#include <mpi.h>
+#endif
+
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+int main( int argc , char ** argv )
+{
+  std::ostringstream msg ;
+
+#if defined( KOKKOS_ENABLE_MPI )
+
+  MPI_Init( & argc , & argv );
+
+  int mpi_rank = 0 ;
+
+  MPI_Comm_rank( MPI_COMM_WORLD , & mpi_rank );
+
+  msg << "MPI rank(" << mpi_rank << ") " ;
+
+#endif
+
+  msg << "{" << std::endl ;
+
+  if ( Kokkos::hwloc::available() ) {
+    msg << "hwloc( NUMA[" << Kokkos::hwloc::get_available_numa_count()
+        << "] x CORE["    << Kokkos::hwloc::get_available_cores_per_numa()
+        << "] x HT["      << Kokkos::hwloc::get_available_threads_per_core()
+        << "] )"
+        << std::endl ;
+  }
+
+#if defined( KOKKOS_ENABLE_CUDA )
+  Kokkos::Cuda::print_configuration( msg );
+#endif
+
+  msg << "}" << std::endl ;
+
+  std::cout << msg.str();
+
+#if defined( KOKKOS_ENABLE_MPI )
+
+  MPI_Finalize();
+
+#endif
+
+  return 0 ;
+}
+
diff --git a/packages/kokkos/example/sort_array/CMakeLists.txt b/packages/kokkos/example/sort_array/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0c7da74f4a9b94dbcdb2a2dc5d192203a319b048
--- /dev/null
+++ b/packages/kokkos/example/sort_array/CMakeLists.txt
@@ -0,0 +1,14 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+SET(SOURCES "")
+
+FILE(GLOB SOURCES *.cpp)
+
+TRIBITS_ADD_EXECUTABLE(
+  sort_array
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  )
+
diff --git a/packages/kokkos/example/sort_array/Makefile b/packages/kokkos/example/sort_array/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..42b376ec7c5cf73537bf2d49340ce1ca963e3ad1
--- /dev/null
+++ b/packages/kokkos/example/sort_array/Makefile
@@ -0,0 +1,46 @@
+KOKKOS_PATH ?= ../..
+
+MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+SRC_DIR := $(dir $(MAKEFILE_PATH))
+
+SRC = $(wildcard $(SRC_DIR)/*.cpp)
+OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
+
+#SRC = $(wildcard *.cpp)
+#OBJ = $(SRC:%.cpp=%.o)
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+  CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
+  EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
+else
+  CXX = g++
+  EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
+endif
+
+CXXFLAGS = -O3 -I$(SRC_DIR)
+LINK ?= $(CXX)
+LDFLAGS ?=
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+DEPFLAGS = -M
+
+LIB =
+
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: 
+	rm -f *.a *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
diff --git a/packages/kokkos/example/sort_array/main.cpp b/packages/kokkos/example/sort_array/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..331b2ef62d9619799308699d6a1c2c36477a8773
--- /dev/null
+++ b/packages/kokkos/example/sort_array/main.cpp
@@ -0,0 +1,95 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstring>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+
+#include <Kokkos_Core.hpp>
+
+#include <sort_array.hpp>
+
+
+int main( int argc , char ** argv )
+{
+#if defined( KOKKOS_ENABLE_CUDA ) || defined( KOKKOS_ENABLE_THREADS ) || defined( KOKKOS_ENABLE_OPENMP )
+  Kokkos::initialize( argc , argv );
+
+  int length_array = 100000 ;
+
+  for ( int i = 0 ; i < argc ; ++i ) {
+    if ( 0 == strcmp( argv[i] , "length_array" ) ) {
+      length_array = atoi( argv[i+1] );
+    }
+  }
+
+  int length_total_array  = length_array * 100;
+
+#if defined( KOKKOS_ENABLE_CUDA )
+  if ( Kokkos::Cuda::is_initialized() ) {
+    std::cout << "Kokkos::Cuda" << std::endl ;
+    Example::sort_array< Kokkos::Cuda >( length_array , length_total_array );
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_THREADS )
+  if ( Kokkos::Threads::is_initialized() ) {
+    std::cout << "Kokkos::Threads" << std::endl ;
+    Example::sort_array< Kokkos::Threads >( length_array , length_total_array );
+  }
+#endif
+
+#if defined( KOKKOS_ENABLE_OPENMP )
+  if ( Kokkos::OpenMP::is_initialized() ) {
+    std::cout << "Kokkos::OpenMP" << std::endl ;
+    Example::sort_array< Kokkos::OpenMP >( length_array , length_total_array );
+  }
+#endif
+
+  Kokkos::finalize();
+#endif
+
+  return 0 ;
+}
+
diff --git a/packages/kokkos/example/sort_array/sort_array.hpp b/packages/kokkos/example/sort_array/sort_array.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ae045b61af434d6f5cb9e64a71a10948b54fbd20
--- /dev/null
+++ b/packages/kokkos/example/sort_array/sort_array.hpp
@@ -0,0 +1,190 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef EXAMPLE_SORT_ARRAY
+#define EXAMPLE_SORT_ARRAY
+
+#include <cstdlib>
+#include <algorithm>
+
+#include <Kokkos_Core.hpp>
+
+#include <impl/Kokkos_Timer.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Example {
+
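+// SortView sorts the span [ begin , end ) of a one-dimensional View in place.
+// This generic version calls std::sort on host-accessible data; the Cuda
+// specialization below uses Thrust to sort on the device.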
+template< class Device >
+struct SortView {
+
+  template< typename ValueType >
+  SortView( const Kokkos::View<ValueType*,Device> v , int begin , int end )
+    {
+      std::sort( v.ptr_on_device() + begin , v.ptr_on_device() + end );
+    }
+};
+
+}
+
+#if defined(KOKKOS_ENABLE_CUDA)
+
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+
+namespace Example {
+
+template<>
+struct SortView< Kokkos::Cuda > {
+  template< typename ValueType >
+  SortView( const Kokkos::View<ValueType*,Kokkos::Cuda> v , int begin , int end )
+    {
+      thrust::sort( thrust::device_ptr<ValueType>( v.ptr_on_device() + begin )
+                  , thrust::device_ptr<ValueType>( v.ptr_on_device() + end ) );
+    }
+};
+
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Example {
+
+template< class Device >
+void sort_array( const size_t array_length /* length of spans of array to sort */
+               , const size_t total_length /* total length of array */
+               , const int print = 1 )
+{
+  typedef Device execution_space ;
+  typedef Kokkos::View<int*,Device>  device_array_type ;
+
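+  // Note on the host-side staging type (comment added for clarity): when
+  // Device is Kokkos::Cuda, the host copy is placed in CudaHostPinnedSpace so
+  // the deep_copy calls below move data through pinned host memory; otherwise
+  // the device View's ordinary HostMirror type is used.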
+#if defined( KOKKOS_ENABLE_CUDA )
+
+  typedef typename
+    Kokkos::Impl::if_c< std::is_same< Device , Kokkos::Cuda >::value
+                      , Kokkos::View<int*,Kokkos::Cuda::array_layout,Kokkos::CudaHostPinnedSpace>
+                      , typename device_array_type::HostMirror
+                      >::type  host_array_type ;
+
+#else
+
+  typedef typename device_array_type::HostMirror  host_array_type ;
+
+#endif
+
+  Kokkos::Timer timer;
+
+  const device_array_type  work_array("work_array" , array_length );
+  const host_array_type    host_array("host_array" , total_length );
+
+  std::cout << "sort_array length( " << total_length << " )"
+            << " in chunks( " << array_length << " )"
+            << std::endl ;
+
+  double sec = timer.seconds();
+  std::cout << "declaring Views took "
+            << sec << " seconds" << std::endl;
+  timer.reset();
+
+  for ( size_t i = 0 ; i < total_length ; ++i ) {
+    host_array(i) = ( lrand48() * total_length ) >> 31 ;
+  }
+
+  sec = timer.seconds();
+  std::cout << "initializing " << total_length << " elements on host took "
+            << sec << " seconds" << std::endl;
+  timer.reset();
+
+  double sec_copy_in  = 0 ;
+  double sec_sort     = 0 ;
+  double sec_copy_out = 0 ;
+  double sec_error    = 0 ;
+  size_t error_count  = 0 ;
+
+  for ( size_t begin = 0 ; begin < total_length ; begin += array_length ) {
+
+    const size_t end = begin + array_length < total_length
+                     ? begin + array_length : total_length ;
+
+    const std::pair<size_t,size_t> host_range(begin,end);
+
+    const host_array_type host_subarray = Kokkos::subview( host_array , host_range );
+
+    timer.reset();
+
+    Kokkos::deep_copy( work_array , host_subarray );
+
+    sec_copy_in += timer.seconds(); timer.reset();
+
+    SortView< execution_space >( work_array , 0 , end - begin );
+
+    sec_sort += timer.seconds(); timer.reset();
+
+    Kokkos::deep_copy( host_subarray , work_array );
+
+    sec_copy_out += timer.seconds(); timer.reset();
+
+    for ( size_t i = begin + 1 ; i < end ; ++i ) {
+      if ( host_array(i) < host_array(i-1) ) ++error_count ;
+    }
+
+    sec_error += timer.seconds(); timer.reset();
+  }
+
+  std::cout << "copy to   device " << sec_copy_in  << " seconds" << std::endl
+            << "sort on   device " << sec_sort     << " seconds" << std::endl
+            << "copy from device " << sec_copy_out << " seconds" << std::endl
+            << "errors " << error_count << " took " << sec_error << " seconds" << std::endl
+            ;
+}
+
+} // namespace Example
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef EXAMPLE_SORT_ARRAY */
+
diff --git a/packages/kokkos/example/tutorial/01_hello_world/CMakeLists.txt b/packages/kokkos/example/tutorial/01_hello_world/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5e5b1fcb46ffbdcb7dacf3bcb6627fa90c7a1157
--- /dev/null
+++ b/packages/kokkos/example/tutorial/01_hello_world/CMakeLists.txt
@@ -0,0 +1,11 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_01_hello_world
+  SOURCES hello_world.cpp
+  COMM serial mpi
+  )
+
diff --git a/packages/kokkos/example/tutorial/01_hello_world/Makefile b/packages/kokkos/example/tutorial/01_hello_world/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..02a0fb10a05d2317fb057d33f5096ce4c5b69131
--- /dev/null
+++ b/packages/kokkos/example/tutorial/01_hello_world/Makefile
@@ -0,0 +1,48 @@
+KOKKOS_PATH = ../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/01_hello_world/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 01_hello_world.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 01_hello_world.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp b/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3092b5a5746f08429ffa28304569552322754c66
--- /dev/null
+++ b/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp
@@ -0,0 +1,130 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+#include <typeinfo>
+
+//
+// "Hello world" parallel_for example:
+//   1. Start up Kokkos
+//   2. Execute a parallel for loop in the default execution space,
+//      using a functor to define the loop body
+//   3. Shut down Kokkos
+//
+// If Kokkos was built with C++11 enabled, try comparing this example
+// to 01_hello_world_lambda.  The latter uses C++11 lambdas (anonymous
+// functions) to define the loop body of the parallel_for.  That makes
+// the code much more concise and readable.  On the other hand,
+// breaking out the loop body into an explicit functor makes it easier
+// to test the loop independently of the parallel pattern.
+//
+
+// Functor that defines the parallel_for's loop body.
+//
+// A "functor" is just a class or struct with a public operator()
+// instance method.
+struct hello_world {
+  // If a functor has an "execution_space" public typedef, parallel_*
+  // will only run the functor in that execution space.  That's a good
+  // way to mark a functor as specific to an execution space.  If the
+  // functor lacks this typedef, parallel_for will run it in the default
+  // execution space, unless you tell it otherwise (that's an advanced
+  // topic; see "execution policies").  A short sketch of such a typedef
+  // follows this struct.
+
+  // The functor's operator() defines the loop body.  It takes an
+  // integer argument which is the parallel for loop index.  Other
+  // arguments are possible; see the "hierarchical parallelism" part
+  // of the tutorial.
+  //
+  // The operator() method must be const, and must be marked with the
+  // KOKKOS_INLINE_FUNCTION macro.  If building with CUDA, this macro
+  // will mark your method as suitable for running on the CUDA device
+  // (as well as on the host).  If not building with CUDA, the macro
+  // is unnecessary but harmless.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int i) const {
+    printf ("Hello from i = %i\n", i);
+  }
+};
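+
+// A hedged, illustrative sketch (added; not part of the original example):
+// a functor can pin itself to one execution space by declaring the typedef
+// described above.  Kokkos::DefaultHostExecutionSpace is used here only as
+// an example of a space that is always available.
+struct hello_world_host_only {
+  typedef Kokkos::DefaultHostExecutionSpace execution_space;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int i) const {
+    printf ("Hello from the host-only functor, i = %i\n", i);
+  }
+};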
+
+int main (int argc, char* argv[]) {
+  // You must call initialize() before you may use Kokkos.
+  //
+  // With no arguments, this initializes the default execution space
+  // (and potentially its host execution space) with default
+  // parameters.  You may also pass in argc and argv, analogously to
+  // MPI_Init().  It reads and removes command-line arguments that
+  // start with "--kokkos-".
+  Kokkos::initialize (argc, argv);
+
+  // Print the name of Kokkos' default execution space.  We're using
+  // typeid here, so the name might get a bit mangled by the compiler,
+  // but you should still be able to figure out what it is.
+  printf ("Hello World on Kokkos execution space %s\n",
+          typeid (Kokkos::DefaultExecutionSpace).name ());
+
+  // Run the above functor on the default Kokkos execution space in
+  // parallel, with a parallel for loop count of 15.
+  //
+  // The Kokkos::DefaultExecutionSpace typedef gives the default
+  // execution space.  Depending on how Kokkos was configured, this
+  // could be OpenMP, Threads, Cuda, Serial, or even some other
+  // execution space.
+  //
+  // The following line of code would look like this in OpenMP:
+  //
+  // #pragma omp parallel for
+  // for (int i = 0; i < 15; ++i) {
+  //   printf ("Hello from i = %i\n", i);
+  // }
+  //
+  // You may notice that the printed numbers do not print out in
+  // order.  Parallel for loops may execute in any order.
+  Kokkos::parallel_for ("HelloWorld",15, hello_world ());
+
+  // You must call finalize() after you are done using Kokkos.
+  Kokkos::finalize ();
+}
+
diff --git a/packages/kokkos/example/tutorial/01_hello_world_lambda/CMakeLists.txt b/packages/kokkos/example/tutorial/01_hello_world_lambda/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3fcca4bceba577bf644f1929e1c62c1893b5d5a5
--- /dev/null
+++ b/packages/kokkos/example/tutorial/01_hello_world_lambda/CMakeLists.txt
@@ -0,0 +1,13 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+IF (Kokkos_ENABLE_CXX11)
+  # This is a tutorial, not a test, so we don't ask CTest to run it.
+  TRIBITS_ADD_EXECUTABLE(
+    tutorial_01_hello_world_lambda
+    SOURCES hello_world_lambda.cpp
+    COMM serial mpi
+    )
+ENDIF ()
+
diff --git a/packages/kokkos/example/tutorial/01_hello_world_lambda/Makefile b/packages/kokkos/example/tutorial/01_hello_world_lambda/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..4fe3765c521978c565923f7eac746e08a0042b14
--- /dev/null
+++ b/packages/kokkos/example/tutorial/01_hello_world_lambda/Makefile
@@ -0,0 +1,49 @@
+KOKKOS_PATH = ../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/01_hello_world_lambda/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 01_hello_world_lambda.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+KOKKOS_CUDA_OPTIONS += "enable_lambda"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 01_hello_world_lambda.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp b/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a304a0f3ab24059df0b1ced8c4ae4a2053a28bf5
--- /dev/null
+++ b/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp
@@ -0,0 +1,112 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+#include <typeinfo>
+
+//
+// "Hello world" parallel_for example:
+//   1. Start up Kokkos
+//   2. Execute a parallel for loop in the default execution space,
+//      using a C++11 lambda to define the loop body
+//   3. Shut down Kokkos
+//
+// This example only builds if C++11 is enabled.  Compare this example
+// to 01_hello_world, which uses functors (explicitly defined classes)
+// to define the loop body of the parallel_for.  Both functors and
+// lambdas have their places.
+//
+
+int main (int argc, char* argv[]) {
+  // You must call initialize() before you may use Kokkos.
+  //
+  // With no arguments, this initializes the default execution space
+  // (and potentially its host execution space) with default
+  // parameters.  You may also pass in argc and argv, analogously to
+  // MPI_Init().  It reads and removes command-line arguments that
+  // start with "--kokkos-".
+  Kokkos::initialize (argc, argv);
+
+  // Print the name of Kokkos' default execution space.  We're using
+  // typeid here, so the name might get a bit mangled by the compiler,
+  // but you should still be able to figure out what it is.
+  printf ("Hello World on Kokkos execution space %s\n",
+          typeid (Kokkos::DefaultExecutionSpace).name ());
+
+  // Run lambda on the default Kokkos execution space in parallel,
+  // with a parallel for loop count of 15.  The lambda's argument is
+  // an integer which is the parallel for's loop index.  As you learn
+  // about different kinds of parallelism, you will find out that
+  // there are other valid argument types as well.
+  //
+  // For a single level of parallelism, we prefer that you use the
+  // KOKKOS_LAMBDA macro.  If CUDA is disabled, this just turns into
+  // [=].  That captures variables from the surrounding scope by
+  // value.  Do NOT capture them by reference!  If CUDA is enabled,
+  // this macro may have a special definition that makes the lambda
+  // work correctly with CUDA.  Compare to the KOKKOS_INLINE_FUNCTION
+  // macro, which has a special meaning if CUDA is enabled.
+  //
+  // The following parallel_for would look like this if we were using
+  // OpenMP by itself, instead of Kokkos:
+  //
+  // #pragma omp parallel for
+  // for (int i = 0; i < 15; ++i) {
+  //   printf ("Hello from i = %i\n", i);
+  // }
+  //
+  // You may notice that the printed numbers do not print out in
+  // order.  Parallel for loops may execute in any order.
+  // We also need to guard the use of a lambda against compilation with
+  // a backend that doesn't support it (e.g. Cuda 6.5/7.0).
+#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+  Kokkos::parallel_for (15, KOKKOS_LAMBDA (const int i) {
+      // printf works in a CUDA parallel kernel; std::ostream does not.
+      printf ("Hello from i = %i\n", i);
+    });
+#endif
+  // You must call finalize() after you are done using Kokkos.
+  Kokkos::finalize ();
+}
+
diff --git a/packages/kokkos/example/tutorial/02_simple_reduce/CMakeLists.txt b/packages/kokkos/example/tutorial/02_simple_reduce/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7c78db840f849fd9625676c6a73e8aa037b52b4d
--- /dev/null
+++ b/packages/kokkos/example/tutorial/02_simple_reduce/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_02_simple_reduce
+  SOURCES simple_reduce.cpp
+  COMM serial mpi
+  )
diff --git a/packages/kokkos/example/tutorial/02_simple_reduce/Makefile b/packages/kokkos/example/tutorial/02_simple_reduce/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..bda28fbac04c7aaa752d1b43791cca4b5b92f1f1
--- /dev/null
+++ b/packages/kokkos/example/tutorial/02_simple_reduce/Makefile
@@ -0,0 +1,57 @@
+KOKKOS_PATH = ../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/02_simple_reduce/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 02_simple_reduce.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 02_simple_reduce.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 02_simple_reduce.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/02_simple_reduce/simple_reduce.cpp b/packages/kokkos/example/tutorial/02_simple_reduce/simple_reduce.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..04a3cfb9a02097a483086ca0a9d9e3de90a58e52
--- /dev/null
+++ b/packages/kokkos/example/tutorial/02_simple_reduce/simple_reduce.cpp
@@ -0,0 +1,101 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+//
+// First reduction (parallel_reduce) example:
+//   1. Start up Kokkos
+//   2. Execute a parallel_reduce loop in the default execution space,
+//      using a functor to define the loop body
+//   3. Shut down Kokkos
+//
+// Compare this example to 02_simple_reduce_lambda, which uses a C++11
+// lambda to define the loop body of the parallel_reduce.
+//
+
+// Reduction functor for computing the sum of squares.
+//
+// More advanced reduction examples will show how to control the
+// reduction's "join" operator.  If the join operator is not provided,
+// it defaults to binary operator+ (adding numbers together).
+struct squaresum {
+  // Specify the type of the reduction value with a "value_type"
+  // typedef.  In this case, the reduction value has type int.
+  typedef int value_type;
+
+  // The reduction functor's operator() looks a little different than
+  // the parallel_for functor's operator().  For the reduction, we
+  // pass in both the loop index i, and the intermediate reduction
+  // value lsum.  The latter MUST be passed in by nonconst reference.
+  // (If the reduction type is an array like int[], indicating an
+  // array reduction result, then the second argument is just int[].)
+  KOKKOS_INLINE_FUNCTION
+  void operator () (const int i, int& lsum) const {
+    lsum += i*i; // compute the sum of squares
+  }
+};
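+
+// A hedged, illustrative sketch (added; not part of the original tutorial):
+// the comment above squaresum mentions controlling the reduction's "join"
+// operator.  This hypothetical functor computes a maximum instead of a sum,
+// so operator+ would be the wrong combiner; it supplies "join" and "init"
+// instead, the hooks the functor-based reduction interface looks for.
+struct squaremax {
+  typedef int value_type;
+
+  // Contribute one term: fold i*i into the running maximum.
+  KOKKOS_INLINE_FUNCTION
+  void operator () (const int i, int& lmax) const {
+    if (i * i > lmax) lmax = i * i;
+  }
+
+  // Combine two intermediate results (the "join" operator).
+  KOKKOS_INLINE_FUNCTION
+  void join (volatile int& dst, const volatile int& src) const {
+    if (src > dst) dst = src;
+  }
+
+  // Identity value with which each intermediate result starts.
+  KOKKOS_INLINE_FUNCTION
+  void init (int& val) const {
+    val = 0;
+  }
+};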
+
+int main (int argc, char* argv[]) {
+  Kokkos::initialize (argc, argv);
+  const int n = 10;
+
+  // Compute the sum of squares of integers from 0 to n-1, in
+  // parallel, using Kokkos.
+  int sum = 0;
+  Kokkos::parallel_reduce (n, squaresum (), sum);
+  printf ("Sum of squares of integers from 0 to %i, "
+          "computed in parallel, is %i\n", n - 1, sum);
+
+  // Compare to a sequential loop.
+  int seqSum = 0;
+  for (int i = 0; i < n; ++i) {
+    seqSum += i*i;
+  }
+  printf ("Sum of squares of integers from 0 to %i, "
+          "computed sequentially, is %i\n", n - 1, seqSum);
+  Kokkos::finalize ();
+  return (sum == seqSum) ? 0 : -1;
+}
+
diff --git a/packages/kokkos/example/tutorial/02_simple_reduce_lambda/CMakeLists.txt b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e2e3a929f1ade97ce639670a3f28c43bb9ce084f
--- /dev/null
+++ b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/CMakeLists.txt
@@ -0,0 +1,12 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+IF (Kokkos_ENABLE_CXX11)
+  # This is a tutorial, not a test, so we don't ask CTest to run it.
+  TRIBITS_ADD_EXECUTABLE(
+    tutorial_02_simple_reduce_lambda
+    SOURCES simple_reduce_lambda.cpp
+    COMM serial mpi
+    )
+ENDIF ()
diff --git a/packages/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..a9542c6a432459f14b8b29c003817583fd4c5662
--- /dev/null
+++ b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile
@@ -0,0 +1,59 @@
+KOKKOS_PATH = ../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/02_simple_reduce_lambda/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 02_simple_reduce_lambda.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+KOKKOS_CUDA_OPTIONS += "enable_lambda"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 02_simple_reduce_lambda.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 02_simple_reduce_lambda.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8ed5d8f62dfb49400cf28d529c8907c7f653b298
--- /dev/null
+++ b/packages/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp
@@ -0,0 +1,94 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+//
+// First reduction (parallel_reduce) example:
+//   1. Start up Kokkos
+//   2. Execute a parallel_reduce loop in the default execution space,
+//      using a C++11 lambda to define the loop body
+//   3. Shut down Kokkos
+//
+// This example only builds if C++11 is enabled.  Compare this example
+// to 02_simple_reduce, which uses a functor to define the loop body
+// of the parallel_reduce.
+//
+
+int main (int argc, char* argv[]) {
+  Kokkos::initialize (argc, argv);
+  const int n = 10;
+
+  // Compute the sum of squares of integers from 0 to n-1, in
+  // parallel, using Kokkos.  This time, use a lambda instead of a
+  // functor.  The lambda takes the same arguments as the functor's
+  // operator().
+  int sum = 0;
+  // The KOKKOS_LAMBDA macro replaces the capture-by-value clause [=].
+  // It also handles any other syntax needed for CUDA.
+  // We also need to guard the use of a lambda against compilation with
+  // a backend that doesn't support it (e.g. Cuda 6.5/7.0).
+  #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+  Kokkos::parallel_reduce (n, KOKKOS_LAMBDA (const int i, int& lsum) {
+      lsum += i*i;
+    }, sum);
+  #endif
+  printf ("Sum of squares of integers from 0 to %i, "
+          "computed in parallel, is %i\n", n - 1, sum);
+
+  // Compare to a sequential loop.
+  int seqSum = 0;
+  for (int i = 0; i < n; ++i) {
+    seqSum += i*i;
+  }
+  printf ("Sum of squares of integers from 0 to %i, "
+          "computed sequentially, is %i\n", n - 1, seqSum);
+  Kokkos::finalize ();
+#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+  return (sum == seqSum) ? 0 : -1;
+#else
+  return 0;
+#endif
+}
+
diff --git a/packages/kokkos/example/tutorial/03_simple_view/CMakeLists.txt b/packages/kokkos/example/tutorial/03_simple_view/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7475a99e492bcf88c6a3ca9b98cc698fa9a38b3d
--- /dev/null
+++ b/packages/kokkos/example/tutorial/03_simple_view/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_03_simple_view
+  SOURCES simple_view.cpp
+  COMM serial mpi
+  )
diff --git a/packages/kokkos/example/tutorial/03_simple_view/Makefile b/packages/kokkos/example/tutorial/03_simple_view/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..de994a8df923d61e9bc0e4821590d427ee985bf4
--- /dev/null
+++ b/packages/kokkos/example/tutorial/03_simple_view/Makefile
@@ -0,0 +1,59 @@
+KOKKOS_PATH = ../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/03_simple_view/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 03_simple_view.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 03_simple_view.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 03_simple_view.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+# For unit testing only; for best performance, use OpenMP 4.0 or better.
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/03_simple_view/simple_view.cpp b/packages/kokkos/example/tutorial/03_simple_view/simple_view.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b6930dc62476d073d72623dd73ced16210084610
--- /dev/null
+++ b/packages/kokkos/example/tutorial/03_simple_view/simple_view.cpp
@@ -0,0 +1,142 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+//
+// First Kokkos::View (multidimensional array) example:
+//   1. Start up Kokkos
+//   2. Allocate a Kokkos::View
+//   3. Execute a parallel_for and a parallel_reduce over that View's data
+//   4. Shut down Kokkos
+//
+// Compare this example to 03_simple_view_lambda, which uses C++11
+// lambdas to define the loop bodies of the parallel_for and
+// parallel_reduce.
+//
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+// A Kokkos::View is an array of zero or more dimensions.  The number
+// of dimensions is specified at compile time, as part of the type of
+// the View.  This array has two dimensions.  The first one
+// (represented by the asterisk) is a run-time dimension, and the
+// second (represented by [3]) is a compile-time dimension.  Thus,
+// this View type is an N x 3 array of type double, where N is
+// specified at run time in the View's constructor.
+//
+// The first dimension of the View is the dimension over which it is
+// efficient for Kokkos to parallelize.
+typedef Kokkos::View<double*[3]> view_type;
+
+// parallel_for functor that fills the View given to its constructor.
+// The View must already have been allocated.
+struct InitView {
+  view_type a;
+
+  // Views have "view semantics."  This means that they behave like
+  // pointers, not like std::vector.  Their copy constructor and
+  // operator= only do shallow copies.  Thus, you can pass View
+  // objects around by "value"; they won't do a deep copy unless you
+  // explicitly ask for a deep copy.
+  InitView (view_type a_) :
+    a (a_)
+  {}
+
+  // Fill the View with some data.  The parallel_for loop will iterate
+  // over the View's first dimension N.
+  KOKKOS_INLINE_FUNCTION
+  void operator () (const int i) const {
+    // Access the View just like a Fortran array.  The layout depends
+    // on the View's memory space, so don't rely on the View's
+    // physical memory layout unless you know what you're doing.
+    a(i,0) = 1.0*i;
+    a(i,1) = 1.0*i*i;
+    a(i,2) = 1.0*i*i*i;
+  }
+};
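+
+// A minimal sketch of "view semantics" (illustrative only, not part of this
+// tutorial's executable code, and assuming a host-accessible memory space):
+// copying a View aliases the same allocation, so writes through one handle
+// are visible through the other.
+//
+//   view_type x ("X", 10);
+//   view_type y = x;   // shallow copy: y and x refer to the same data
+//   y(0,0) = 42.0;     // x(0,0) is now 42.0 as well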
+
+// Reduction functor that reads the View given to its constructor.
+struct ReduceFunctor {
+  view_type a;
+
+  // Constructor takes View by "value"; this does a shallow copy.
+  ReduceFunctor (view_type a_) : a (a_) {}
+
+  // If you write a functor to do a reduction, you must specify the
+  // type of the reduction result via a public 'value_type' typedef.
+  typedef double value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, double &lsum) const {
+    lsum += a(i,0)*a(i,1)/(a(i,2)+0.1);
+  }
+};
+
+int main (int argc, char* argv[]) {
+  Kokkos::initialize (argc, argv);
+  const int N = 10;
+
+  // Allocate the View.  The first dimension is a run-time parameter
+  // N.  We set N = 10 here.  The second dimension is a compile-time
+  // parameter, 3.  We don't specify it here because we already set it
+  // by declaring the type of the View.
+  //
+  // Views get initialized to zero by default.  This happens in
+  // parallel, using the View's memory space's default execution
+  // space.  Parallel initialization ensures first-touch allocation.
+  // There is a way to shut off default initialization.
+  //
+  // You may NOT allocate a View inside of a parallel_{for, reduce,
+  // scan}.  Treat View allocation as a "thread collective."
+  //
+  // The string "A" is just the label; it only matters for debugging.
+  // Different Views may have the same label.
+  view_type a ("A", N);
+
+  Kokkos::parallel_for (N, InitView (a));
+  double sum = 0;
+  Kokkos::parallel_reduce (N, ReduceFunctor (a), sum);
+  printf ("Result: %f\n", sum);
+  Kokkos::finalize ();
+}
+
diff --git a/packages/kokkos/example/tutorial/03_simple_view_lambda/CMakeLists.txt b/packages/kokkos/example/tutorial/03_simple_view_lambda/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..601fe452a4c90a1506aa012a6a99a617fbc1d9af
--- /dev/null
+++ b/packages/kokkos/example/tutorial/03_simple_view_lambda/CMakeLists.txt
@@ -0,0 +1,12 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+IF (Kokkos_ENABLE_CXX11)
+  # This is a tutorial, not a test, so we don't ask CTest to run it.
+  TRIBITS_ADD_EXECUTABLE(
+    tutorial_03_simple_view_lambda
+    SOURCES simple_view_lambda.cpp
+    COMM serial mpi
+    )
+ENDIF ()
diff --git a/packages/kokkos/example/tutorial/03_simple_view_lambda/Makefile b/packages/kokkos/example/tutorial/03_simple_view_lambda/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..81910a457144b412f44a0849cee5e0f4593c41f2
--- /dev/null
+++ b/packages/kokkos/example/tutorial/03_simple_view_lambda/Makefile
@@ -0,0 +1,59 @@
+KOKKOS_PATH = ../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/03_simple_view_lambda/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 03_simple_view_lambda.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+KOKKOS_CUDA_OPTIONS += "enable_lambda"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 03_simple_view_lambda.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 03_simple_view_lambda.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp b/packages/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c9d76cedfb5f7977a4388bee7c023aafd0fd6f65
--- /dev/null
+++ b/packages/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp
@@ -0,0 +1,120 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+//
+// First Kokkos::View (multidimensional array) example:
+//   1. Start up Kokkos
+//   2. Allocate a Kokkos::View
+//   3. Execute a parallel_for and a parallel_reduce over that View's data
+//   4. Shut down Kokkos
+//
+// Compare this example to 03_simple_view, which uses functors to
+// define the loop bodies of the parallel_for and parallel_reduce.
+//
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+// A Kokkos::View is an array of zero or more dimensions.  The number
+// of dimensions is specified at compile time, as part of the type of
+// the View.  This array has two dimensions.  The first one
+// (represented by the asterisk) is a run-time dimension, and the
+// second (represented by [3]) is a compile-time dimension.  Thus,
+// this View type is an N x 3 array of type double, where N is
+// specified at run time in the View's constructor.
+//
+// The first dimension of the View is the dimension over which it is
+// efficient for Kokkos to parallelize.
+typedef Kokkos::View<double*[3]> view_type;
+
+int main (int argc, char* argv[]) {
+  Kokkos::initialize (argc, argv);
+
+  // Allocate the View.  The first dimension is a run-time parameter
+  // N.  We set N = 10 here.  The second dimension is a compile-time
+  // parameter, 3.  We don't specify it here because we already set it
+  // by declaring the type of the View.
+  //
+  // Views get initialized to zero by default.  This happens in
+  // parallel, using the View's memory space's default execution
+  // space.  Parallel initialization ensures first-touch allocation.
+  // There is a way to shut off default initialization.
+  //
+  // You may NOT allocate a View inside of a parallel_{for, reduce,
+  // scan}.  Treat View allocation as a "thread collective."
+  //
+  // The string "A" is just the label; it only matters for debugging.
+  // Different Views may have the same label.
+  view_type a ("A", 10);
+
+  // Fill the View with some data.  The parallel_for loop will iterate
+  // over the View's first dimension N.
+  //
+  // Note that the View is passed by value into the lambda.  The macro
+  // KOKKOS_LAMBDA includes the "capture by value" clause [=].  This
+  // tells the lambda to "capture all variables in the enclosing scope
+  // by value."  Views have "view semantics"; they behave like
+  // pointers, not like std::vector.  Passing them by value does a
+  // shallow copy.  A deep copy never happens unless you explicitly
+  // ask for one.
+  // We also need to protect the use of a lambda against compiling
+  // with a backend that doesn't support it (e.g. CUDA 6.5/7.0).
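+  // As a rough sketch (the exact definition depends on the backend and
+  // build options), KOKKOS_LAMBDA is essentially a capture-by-value lambda
+  // introducer, e.g. something like
+  //
+  //   #define KOKKOS_LAMBDA [=] __host__ __device__   // CUDA lambda builds
+  //   #define KOKKOS_LAMBDA [=]                        // host-only builds
+  //
+  // so the loop bodies below are ordinary C++11 lambdas.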
+  #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+  Kokkos::parallel_for (10, KOKKOS_LAMBDA (const int i) {
+    // Access the View just like a Fortran array.  The layout depends
+    // on the View's memory space, so don't rely on the View's
+    // physical memory layout unless you know what you're doing.
+    a(i,0) = 1.0*i;
+    a(i,1) = 1.0*i*i;
+    a(i,2) = 1.0*i*i*i;
+  });
+  // Compute a reduction over the View's data, again using a lambda.
+  double sum = 0;
+  Kokkos::parallel_reduce (10, KOKKOS_LAMBDA (const int i, double& lsum) {
+    lsum += a(i,0)*a(i,1)/(a(i,2)+0.1);
+  }, sum);
+  printf ("Result: %f\n", sum);
+  #endif
+  Kokkos::finalize ();
+}
+
diff --git a/packages/kokkos/example/tutorial/04_simple_memoryspaces/CMakeLists.txt b/packages/kokkos/example/tutorial/04_simple_memoryspaces/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..09f209077a08d64c86454a59875ecda8d329e2f7
--- /dev/null
+++ b/packages/kokkos/example/tutorial/04_simple_memoryspaces/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_04_simple_memoryspaces
+  SOURCES simple_memoryspaces.cpp
+  COMM serial mpi
+  )
diff --git a/packages/kokkos/example/tutorial/04_simple_memoryspaces/Makefile b/packages/kokkos/example/tutorial/04_simple_memoryspaces/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..0e84ac9c6819fd501a5fde8d872b63ceb3c382ae
--- /dev/null
+++ b/packages/kokkos/example/tutorial/04_simple_memoryspaces/Makefile
@@ -0,0 +1,58 @@
+KOKKOS_PATH = ../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/04_simple_memoryspaces/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 04_simple_memoryspaces.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 04_simple_memoryspaces.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 04_simple_memoryspaces.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/04_simple_memoryspaces/simple_memoryspaces.cpp b/packages/kokkos/example/tutorial/04_simple_memoryspaces/simple_memoryspaces.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..13df7cf450e104be75b21e3b72aadea27131dda0
--- /dev/null
+++ b/packages/kokkos/example/tutorial/04_simple_memoryspaces/simple_memoryspaces.cpp
@@ -0,0 +1,101 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+// The type of a two-dimensional N x 3 array of double.
+// It lives in Kokkos' default memory space.
+typedef Kokkos::View<double*[3]> view_type;
+
+// The "HostMirror" type corresponding to view_type above is also a
+// two-dimensional N x 3 array of double.  However, it lives in the
+// host memory space corresponding to view_type's memory space.  For
+// example, if view_type lives in CUDA device memory, host_view_type
+// lives in host (CPU) memory.  Furthermore, declaring host_view_type
+// as the host mirror of view_type means that host_view_type has the
+// same layout as view_type.  This makes it easier to copy between the
+// two Views.
+// Advanced issues: If a memory space is accessible from the host without
+// performance penalties then it is its own host_mirror_space. This is
+// the case for HostSpace, CudaUVMSpace and CudaHostPinnedSpace.
+
+typedef view_type::HostMirror host_view_type;
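+
+// A hypothetical illustration (not used below): when a memory space is its
+// own host_mirror_space, create_mirror_view returns a handle to the same
+// allocation, which can be checked by comparing data pointers.
+//
+//   view_type d ("D", 10);
+//   host_view_type h_d = Kokkos::create_mirror_view (d);
+//   bool aliased = (h_d.data () == d.data ());  // true e.g. for HostSpace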
+
+struct ReduceFunctor {
+  view_type a;
+  ReduceFunctor (view_type a_) : a (a_) {}
+  typedef int value_type; //Specify type for reduction value, lsum
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, int &lsum) const {
+    lsum += a(i,0)-a(i,1)+a(i,2);
+  }
+};
+
+int main() {
+  Kokkos::initialize();
+
+  view_type a ("A", 10);
+  // If view_type and host_view_type live in the same memory space,
+  // a "mirror view" is just an alias, and deep_copy does nothing.
+  // Otherwise, a mirror view of a device View lives in host memory,
+  // and deep_copy does a deep copy.
+  host_view_type h_a = Kokkos::create_mirror_view (a);
+
+  // The View h_a lives in host (CPU) memory, so it's legal to fill
+  // the view sequentially using ordinary code, like this.
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 3; j++) {
+      h_a(i,j) = i*10 + j;
+    }
+  }
+  Kokkos::deep_copy (a, h_a); // Copy from host to device.
+
+  int sum = 0;
+  Kokkos::parallel_reduce (10, ReduceFunctor (a), sum);
+  printf ("Result is %i\n",sum);
+
+  Kokkos::finalize ();
+}
+
diff --git a/packages/kokkos/example/tutorial/05_simple_atomics/CMakeLists.txt b/packages/kokkos/example/tutorial/05_simple_atomics/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5a5790fb0488be791112c3ef0c38655e6da78724
--- /dev/null
+++ b/packages/kokkos/example/tutorial/05_simple_atomics/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_05_simple_atomics
+  SOURCES simple_atomics.cpp
+  COMM serial mpi
+  )
diff --git a/packages/kokkos/example/tutorial/05_simple_atomics/Makefile b/packages/kokkos/example/tutorial/05_simple_atomics/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..67fbd90c55fd05ef1af060d1812f50af591a3822
--- /dev/null
+++ b/packages/kokkos/example/tutorial/05_simple_atomics/Makefile
@@ -0,0 +1,58 @@
+KOKKOS_PATH = ../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/05_simple_atomics/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 05_simple_atomics.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 05_simple_atomics.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 05_simple_atomics.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/05_simple_atomics/simple_atomics.cpp b/packages/kokkos/example/tutorial/05_simple_atomics/simple_atomics.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..585ba29fa7633bcd280bbbeb3a87768e56bc46a5
--- /dev/null
+++ b/packages/kokkos/example/tutorial/05_simple_atomics/simple_atomics.cpp
@@ -0,0 +1,137 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+
+// Type of a one-dimensional length-N array of int.
+typedef Kokkos::View<int*> view_type;
+typedef view_type::HostMirror host_view_type;
+// This is a "zero-dimensional" View, that is, a View of a single
+// value (an int, in this case).  Access the value using operator()
+// with no arguments: e.g., 'count()'.
+//
+// Zero-dimensional Views are useful for reduction results that stay
+// resident in device memory, as well as for irregularly updated
+// shared state.  We use it for the latter in this example.
+typedef Kokkos::View<int> count_type;
+typedef count_type::HostMirror host_count_type;
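+
+// A small sketch of zero-dimensional View access (illustrative only,
+// assuming a host-accessible memory space):
+//
+//   count_type c ("C");
+//   c() = 0;              // write the single value
+//   const int v = c();    // read it back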
+
+
+// Functor for finding a list of primes in a given set of numbers.  If
+// run in parallel, the order of results is nondeterministic, because
+// hardware atomic updates do not guarantee an order of execution.
+struct findprimes {
+  view_type data;
+  view_type result;
+  count_type count;
+
+  findprimes (view_type data_, view_type result_, count_type count_) :
+    data (data_), result (result_), count (count_)
+  {}
+
+  // Test if data(i) is prime.  If it is, increment the count of
+  // primes (stored in the zero-dimensional View 'count') and add the
+  // value to the current list of primes 'result'.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int i) const {
+    const int number = data(i); // the current number
+
+    // Test all numbers from 3 to ceiling(sqrt(data(i))), to see if
+    // they are factors of data(i).  It's not the most efficient prime
+    // test, but it works.
+    const int upper_bound = std::sqrt(1.0*number)+1;
+    bool is_prime = !(number%2 == 0);
+    int k = 3;
+    while (k < upper_bound && is_prime) {
+      is_prime = !(number%k == 0);
+      k += 2; // don't have to test even numbers
+    }
+
+    if (is_prime) {
+      // Use an atomic update both to update the current count of
+      // primes, and to find a place in the current list of primes for
+      // the new result.
+      //
+      // atomic_fetch_add returns the _current_ count and then increments
+      // it (by 1 in this case).  The current count of primes indexes
+      // into the first unoccupied position of the 'result' array.
+      const int idx = Kokkos::atomic_fetch_add (&count(), 1);
+      result(idx) = number;
+    }
+  }
+
+};
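+
+// For illustration (a sketch): atomic_fetch_add returns the value held
+// *before* the addition.  If count() currently holds 4, then
+//
+//   const int idx = Kokkos::atomic_fetch_add (&count(), 1);
+//
+// leaves count() == 5 and sets idx == 4, so concurrent callers each
+// obtain a distinct slot in 'result'.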
+
+int main () {
+  Kokkos::initialize ();
+
+  srand (61391); // Set the random seed
+
+  int nnumbers = 100000;
+  view_type data ("RND", nnumbers);
+  view_type result ("Prime", nnumbers);
+  count_type count ("Count");
+
+  host_view_type h_data = Kokkos::create_mirror_view (data);
+  host_view_type h_result = Kokkos::create_mirror_view (result);
+  host_count_type h_count = Kokkos::create_mirror_view (count);
+
+  typedef view_type::size_type size_type;
+  // Fill the 'data' array on the host with random numbers.  We assume
+  // that they come from some process which is only implemented on the
+  // host, via some library.  (That's true in this case.)
+  for (size_type i = 0; i < data.extent(0); ++i) {
+    h_data(i) = rand () % nnumbers;
+  }
+  Kokkos::deep_copy (data, h_data); // copy from host to device
+
+  Kokkos::parallel_for (data.extent(0), findprimes (data, result, count));
+  Kokkos::deep_copy (h_count, count); // copy from device to host
+
+  printf ("Found %i prime numbers in %i random numbers\n", h_count(), nnumbers);
+  Kokkos::finalize ();
+}
+
diff --git a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d18938a61f8b75db1c7cd4b5e9c42c4406429e89
--- /dev/null
+++ b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_06_simple_mdrangepolicy
+  SOURCES simple_mdrangepolicy.cpp
+  COMM serial mpi
+  )
diff --git a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/Makefile b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..7d3498ed1780281d3f0cbfa827e13fc2c09f1e57
--- /dev/null
+++ b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/Makefile
@@ -0,0 +1,48 @@
+KOKKOS_PATH = ../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/06_simple_mdrangepolicy/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 06_simple_mdrangepolicy.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 06_simple_mdrangepolicy.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..38cf00dd19409eb576e41481f95ffd11f5599fcb
--- /dev/null
+++ b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp
@@ -0,0 +1,201 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+//
+// MDRangePolicy example with parallel_for and parallel_reduce:
+//   1. Start up Kokkos
+//   2. Execute a parallel_for loop in the default execution space,
+//      using a functor to define the loop body
+//   3. Shut down Kokkos
+//
+// Two examples are provided:
+// Example 1: Rank 2 case with minimal default parameters and arguments used 
+//            in the MDRangePolicy
+//
+// Example 2: Rank 3 case with additional outer/inner iterate pattern parameters
+//            and tile dims passed to the ctor
+
+
+// Simple functor for computing/storing the product of indices in a View v
+template < class ViewType >
+struct MDFunctor {
+
+  typedef long value_type;
+
+  ViewType v;
+  size_t   size;
+
+  MDFunctor( const ViewType & v_, const size_t size_ )
+    : v(v_), size(size_) {}
+
+  // 2D case - used by parallel_for
+  KOKKOS_INLINE_FUNCTION
+  void operator () (const int i, const int j) const {
+    v(i,j) = i*j; // compute the product of indices
+  }
+
+  // 3D case - used by parallel_for
+  KOKKOS_INLINE_FUNCTION
+  void operator () (const int i, const int j, const int k) const {
+    v(i,j,k) = i*j*k; // compute the product of indices
+  }
+
+  // 2D case - reduction
+  KOKKOS_INLINE_FUNCTION
+  void operator () (const int i, const int j, value_type & incorrect_count) const {
+    if ( v(i,j) != i*j ) {
+      incorrect_count += 1;
+    }
+  }
+
+  // 3D case - reduction
+  KOKKOS_INLINE_FUNCTION
+  void operator () (const int i, const int j, const int k, value_type & incorrect_count) const {
+    if ( v(i,j,k) != i*j*k ) {
+      incorrect_count += 1;
+    }
+  }
+
+};
+
+int main (int argc, char* argv[]) {
+  Kokkos::initialize (argc, argv);
+
+  // Bound(s) for MDRangePolicy 
+  const int n = 100;
+
+  // ViewType typedefs for Rank<2>, Rank<3> for example usage
+  typedef double ScalarType;
+  typedef typename Kokkos::View<ScalarType**>   ViewType_2D;
+  typedef typename Kokkos::View<ScalarType***>  ViewType_3D;
+
+  /////////////////////////////////////////////////////////////////////////////
+  // Explanation of MDRangePolicy usage, template parameters, constructor arguments
+  //
+  // MDRangePolicy typedefs for Rank<2>, Rank<3> cases
+  // Required template parameters: 
+  //   Kokkos::Rank<N>: where N=rank
+  //
+  // Optional template parameters to Rank<...>:
+  //   Kokkos::Iterate::{Default,Left,Right}: Outer iteration pattern across tiles; 
+  //     defaults based on the execution space similar to Kokkos::Layout
+  //   Kokkos::Iterate::{Default,Left,Right}: Inner iteration pattern within tiles; 
+  //     defaults based on the execution space similar to Kokkos::Layout
+  //
+  //   e.g. typedef Rank<2, Iterate::Left, Iterate::Left> rank2ll;
+  //
+  //
+  // Optional template parameters to MDRangePolicy:
+  //   ExecutionSpace: Kokkos::Serial, Kokkos::OpenMP, Kokkos::Cuda, etc. 
+  //
+  //   Kokkos::IndexType< T >: where T = int, long, unsigned int, etc.
+  //
+  //   struct Tag{}: A user-provided tag for tagging functor operators
+  //
+  //   e.g. 1:  MDRangePolicy< Kokkos::Serial, Rank<2, Iterate::Left, Iterate::Left>, IndexType<int>, Tag > mdpolicy;
+  //   e.g. 2:  MDRangePolicy< Kokkos::Serial, rank2ll, IndexType<int>, Tag > mdpolicy;
+  //
+  //
+  // Required arguments to ctor:
+  //   {{ l0, l1, ... }}: Lower bounds, provided as Kokkos::Array or std::initializer_list
+  //   {{ u0, u1, ... }}: Upper bounds, provided as Kokkos::Array or std::initializer_list
+  //
+  // Optional arguments to ctor:
+  //   {{ t0, t1, ... }}: Tile dimensions, provided as Kokkos::Array or std::initializer_list
+  //                      defaults based on the execution space
+  //
+  //  e.g. mdpolicy( {{0,0}}, {{u0,u1}}, {{t0,t1}} );
+  //   
+  /////////////////////////////////////////////////////////////////////////////
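+
+  // A hypothetical sketch of the tag parameter mentioned above (not used
+  // in the examples below): with
+  //
+  //   struct InitTag {};
+  //   typedef Kokkos::Experimental::MDRangePolicy<
+  //     Kokkos::Experimental::Rank<2>, InitTag > TaggedPolicy_2D;
+  //
+  // the functor provides a tagged operator,
+  //
+  //   KOKKOS_INLINE_FUNCTION
+  //   void operator() (InitTag, const int i, const int j) const;
+  //
+  // and parallel_for( TaggedPolicy_2D( {{0,0}}, {{n,n}} ), functor )
+  // dispatches to that overload.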
+
+  // Example 1: 
+  long incorrect_count_2d = 0;
+  {
+    // Rank<2> Case: Rank is provided, all other parameters are default
+    typedef typename Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<2> > MDPolicyType_2D;
+
+    // Construct 2D MDRangePolicy: lower and upper bounds provided, tile dims defaulted
+    MDPolicyType_2D mdpolicy_2d( {{0,0}}, {{n,n}} );
+
+    // Construct a 2D view to store result of product of indices
+    ViewType_2D v2("v2", n, n);
+
+    // Execute parallel_for with rank 2 MDRangePolicy
+    Kokkos::parallel_for( "md2d", mdpolicy_2d, MDFunctor<ViewType_2D>(v2, n) );
+
+    // Check results with a parallel_reduce using the MDRangePolicy
+    Kokkos::parallel_reduce( "md2dredux", mdpolicy_2d, MDFunctor<ViewType_2D>(v2, n), incorrect_count_2d );
+
+    printf("Rank 2 MDRangePolicy incorrect count: %ld\n", incorrect_count_2d); // should be 0
+  }
+
+
+  // Example 2: 
+  long incorrect_count_3d = 0;
+  {
+    // Rank<3> Case: Rank, inner iterate pattern, outer iterate pattern provided
+    typedef typename Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<3, Kokkos::Experimental::Iterate::Left, Kokkos::Experimental::Iterate::Left> > MDPolicyType_3D;
+
+    // Construct 3D MDRangePolicy: lower, upper bounds, tile dims provided
+    MDPolicyType_3D mdpolicy_3d( {{0,0,0}}, {{n,n,n}}, {{4,4,4}} );
+
+    // Construct a 3D view to store result of product of indices
+    ViewType_3D v3("v3", n, n, n);
+
+    // Execute parallel_for with rank 3 MDRangePolicy
+    Kokkos::parallel_for( "md3d", mdpolicy_3d, MDFunctor<ViewType_3D>(v3, n) );
+
+    // Check results with a parallel_reduce using the MDRangePolicy
+    Kokkos::parallel_reduce( "md3dredux", mdpolicy_3d, MDFunctor<ViewType_3D>(v3, n), incorrect_count_3d );
+
+    printf("Rank 3 MDRangePolicy incorrect count: %ld\n", incorrect_count_3d); // should be 0
+  }
+
+  Kokkos::finalize ();
+
+  return (incorrect_count_2d == long(0) && incorrect_count_3d == long(0)) ? 0 : -1;
+}
+
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2eb3a8f6c98d69c394f83591e59aa7073f1e59e2
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_advancedviews_01_data_layouts
+  SOURCES data_layouts.cpp
+  COMM serial mpi
+  )
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..94ace811f307d3414fe9ea7dfa8a6d0d34e1a618
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile
@@ -0,0 +1,58 @@
+KOKKOS_PATH = ../../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Advanced_Views/01_data_layouts/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 01_data_layouts.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 01_data_layouts.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 01_data_layouts.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..247bb44ba197ac5d22855d08b91a837f39937e52
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
@@ -0,0 +1,171 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Timer.hpp>
+#include <cstdio>
+
+// These two View types are both 2-D arrays of double.  However, they
+// have different layouts in memory.  left_type has "layout left,"
+// which means "column major," the same as in Fortran, the BLAS, or
+// LAPACK.  right_type has "layout right," which means "row major,"
+// the same as in C, C++, or Java.
+typedef Kokkos::View<double**, Kokkos::LayoutLeft> left_type;
+typedef Kokkos::View<double**, Kokkos::LayoutRight> right_type;
+// This is a one-dimensional View, so the layout matters less.
+// However, it still has a layout!  Since its layout is not specified
+// explicitly in the type, its layout is a function of the memory
+// space.  For example, the default Cuda layout is LayoutLeft, and the
+// default Host layout is LayoutRight.
+typedef Kokkos::View<double*> view_type;
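+
+// As an illustration (a sketch, assuming contiguous, non-padded Views):
+// for a two-dimensional View a with extents (N0, N1),
+//   LayoutLeft  stores a(i,j) at offset i + j*N0  (column major),
+//   LayoutRight stores a(i,j) at offset j + i*N1  (row major).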
+
+// parallel_for functor that fills the given View with some data.  It
+// expects to access the View by rows in parallel: each call i of
+// operator() accesses a row.
+template<class ViewType>
+struct init_view {
+  ViewType a;
+  init_view (ViewType a_) : a (a_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const typename ViewType::size_type i) const {
+    // On CPUs this loop could be vectorized, so for optimal performance
+    // j should be the stride-1 index into a, i.e. a should be LayoutRight.
+    // On GPUs threads should do coalesced loads and stores, which means
+    // i should be the stride-1 index for optimal performance.
+    for (typename ViewType::size_type j = 0; j < a.extent(1); ++j) {
+      a(i,j) = 1.0*a.extent(0)*i + 1.0*j;
+    }
+  }
+};
+
+// Exercise the access pattern of a contraction of v1 and v2 into a,
+//
+//   a(i) := sum_j (v1(i,j) * v2(j,i)),
+//
+// (note that the loop body below overwrites a(i) each iteration rather than
+// accumulating, so it is the memory access pattern and its timing that
+// matter here, not the numerical result).
+// Since the functor is templated on the ViewTypes themselves it doesn't
+// matter what their layouts are. That means you can use different layouts
+// on different architectures.
+template<class ViewType1, class ViewType2>
+struct contraction {
+  view_type a;
+  typename ViewType1::const_type v1;
+  typename ViewType2::const_type v2;
+  contraction (view_type a_, ViewType1 v1_, ViewType2 v2_) :
+    a (a_), v1 (v1_), v2 (v2_)
+  {}
+
+  // As with the initialization functor, the performance of this operator
+  // depends on the architecture and the chosen data layouts.
+  // On CPUs it is optimal to vectorize the inner loop, so j should be the
+  // stride-1 access. That means v1 should be LayoutRight and v2 LayoutLeft.
+  // In order to get coalesced access on GPUs, where i corresponds closely to
+  // the thread index, i must be the stride-1 dimension. That means v1 should
+  // be LayoutLeft and v2 LayoutRight.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const view_type::size_type i) const {
+    for (view_type::size_type j = 0; j < v1.extent(1); ++j) {
+      a(i) = v1(i,j)*v2(j,i);
+    }
+  }
+};
+
+// Compute a dot product. This is used for result verification.
+struct dot {
+  view_type a;
+  dot (view_type a_) : a (a_) {}
+  typedef double value_type; //Specify type for reduction target, lsum
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const view_type::size_type i, double &lsum) const {
+    lsum += a(i)*a(i);
+  }
+};
+
+int main (int narg, char* arg[]) {
+  // When initializing Kokkos, you may pass in command-line arguments,
+  // just like with MPI_Init().  Kokkos reserves the right to remove
+  // arguments from the list that start with '--kokkos-'.
+  Kokkos::initialize (narg, arg);
+
+  int size = 10000;
+  view_type a("A",size);
+
+  // Define two views with LayoutLeft and LayoutRight.
+  left_type l("L",size,10000);
+  right_type r("R",size,10000);
+
+  // Initialize the data in the views.
+  Kokkos::parallel_for(size,init_view<left_type>(l));
+  Kokkos::parallel_for(size,init_view<right_type>(r));
+  Kokkos::fence();
+
+  // Measure time to execute the contraction kernel when giving it a
+  // LayoutLeft view for v1 and a LayoutRight view for v2. This should be
+  // fast on GPUs and slow on CPUs.
+  Kokkos::Timer time1;
+  Kokkos::parallel_for(size,contraction<left_type,right_type>(a,l,r));
+  Kokkos::fence();
+  double sec1 = time1.seconds();
+
+  double sum1 = 0;
+  Kokkos::parallel_reduce(size,dot(a),sum1);
+  Kokkos::fence();
+
+  // Measure time to execute the contraction kernel when giving it a
+  // LayoutRight view for v1 and a LayoutLeft view for v2. This should be
+  // fast on CPUs and slow on GPUs.
+  Kokkos::Timer time2;
+  Kokkos::parallel_for(size,contraction<right_type,left_type>(a,r,l));
+  Kokkos::fence();
+  double sec2 = time2.seconds();
+
+  double sum2 = 0;
+  Kokkos::parallel_reduce(size,dot(a),sum2);
+
+  // Kokkos' reductions are deterministic.
+  // The results should always be equal.
+  printf("Result Left/Right %f Right/Left %f (equal result: %i)\n",sec1,sec2,sum2==sum1);
+
+  Kokkos::finalize();
+}
+
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1963e544d7a113e8b50cf3fa2444df2f95d983e2
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_advancedviews_02_memory_traits
+  SOURCES memory_traits.cpp
+  COMM serial mpi
+  )
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..f64ee3540e8f72dc8312ec1a176af55f4a740643
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile
@@ -0,0 +1,58 @@
+KOKKOS_PATH = ../../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Advanced_Views/02_memory_traits/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 02_memory_traits.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 02_memory_traits.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 02_memory_traits.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..75bfdd9cf5952606dd3143dc97899a0f040b8c67
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
@@ -0,0 +1,141 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Timer.hpp>
+#include <cstdio>
+#include <cstdlib>
+
+typedef Kokkos::View<double*> view_type;
+// Kokkos::View has a MemoryTraits template parameter which
+// allows users to specify usage scenarios of a View.
+// Some of those act simply as hints, which can be used to insert
+// optimal load and store paths, while others change the semantics of the
+// access. The trait Kokkos::Atomic is one of the latter. A view with
+// that MemoryTrait will perform every access atomically (read, write, update).
+//
+// In this example we use a view with a usage hint for RandomAccess.
+// Kokkos::RandomAccess means that we expect to use this view
+// with indirect indexing.
+//
+// In CUDA, RandomAccess allows accesses through the texture
+// cache.  This only works if the View is read-only, which we enforce
+// through the first template parameter.
+//
+// Note that we are still talking about views of the data; it's not a new allocation.
+// For example, you can have an atomic view of a default view. While you could even
+// use both in the same kernel, this could lead to undefined behavior because
+// one of your access paths is not atomic. Think of it in the same way as you think of
+// pointers to const data and pointers to non-const data (i.e. const double* and double*).
+// While these pointers can point to the same data, you should not use them together if that
+// breaks the const guarantee of the first pointer.
+typedef Kokkos::View<const double*, Kokkos::MemoryTraits<Kokkos::RandomAccess> > view_type_rnd;
+typedef Kokkos::View<int**> idx_type;
+typedef idx_type::HostMirror idx_type_host;
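+
+// A hypothetical illustration of the Atomic trait mentioned above (not used
+// in this example): an atomic view can alias an existing allocation, and
+// every access through it is performed atomically.
+//
+//   typedef Kokkos::View<double*, Kokkos::MemoryTraits<Kokkos::Atomic> >
+//     atomic_view_type;
+//   atomic_view_type a_dest = dest;  // same allocation as 'dest'
+//   a_dest(i) += 1.0;                // atomic read-modify-write (some index i)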
+
+// We template this functor on the ViewTypes to show the effect of the RandomAccess trait.
+template<class DestType, class SrcType>
+struct localsum {
+  idx_type::const_type idx;
+  DestType dest;
+  SrcType src;
+  localsum (idx_type idx_, DestType dest_, SrcType src_) :
+    idx (idx_), dest (dest_), src (src_)
+  {}
+
+  // Calculate a local sum of values
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int i) const {
+    double tmp = 0.0;
+    for (int j = 0; j < (int) idx.extent(1); ++j) {
+      // This is an indirect access on src
+      const double val = src(idx(i,j));
+      tmp += val*val + 0.5*(idx.extent(0)*val -idx.extent(1)*val);
+    }
+    dest(i) = tmp;
+  }
+};
+
+int main(int narg, char* arg[]) {
+  Kokkos::initialize (narg, arg);
+
+  int size = 1000000;
+
+  idx_type idx("Idx",size,64);
+  idx_type_host h_idx = Kokkos::create_mirror_view (idx);
+
+  view_type dest ("Dest", size);
+  view_type src ("Src", size);
+
+  srand(134231);
+
+  for (int i = 0; i < size; i++) {
+    for (view_type::size_type j = 0; j < h_idx.extent(1); ++j) {
+      h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size;
+    }
+  }
+
+  // Deep copy the initial data to the device
+  Kokkos::deep_copy(idx,h_idx);
+  // Run the first kernel to warm up the caches
+  Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
+  Kokkos::fence();
+
+  // Run the localsum functor using the RandomAccess trait. On CPUs there should
+  // not be any difference in performance compared to not using the RandomAccess
+  // trait. On GPUs there can be a dramatic difference.
+  Kokkos::Timer time1;
+  Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
+  Kokkos::fence();
+  double sec1 = time1.seconds();
+
+  Kokkos::Timer time2;
+  Kokkos::parallel_for(size,localsum<view_type,view_type>(idx,dest,src));
+  Kokkos::fence();
+  double sec2 = time2.seconds();
+
+  printf("Time with Trait RandomAccess: %f with Plain: %f \n",sec1,sec2);
+
+  Kokkos::finalize();
+}
+
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cbe394c78b832f7bee3bb659b2776d5b246adbd1
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_advancedviews_03_subviews
+  SOURCES subviews.cpp
+  COMM serial mpi
+  )
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..ad70ee02d16cf26fa428462385242dfcc7ba4b05
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile
@@ -0,0 +1,58 @@
+KOKKOS_PATH = ../../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Advanced_Views/03_subviews/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 03_subviews.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 03_subviews.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 03_subviews.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5e833434903f7d89828449d041514ef0a66cc37d
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
@@ -0,0 +1,190 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// This example simulates one timestep of an explicit
+// finite-difference discretization of a time-dependent partial
+// differential equation (PDE).  It shows how to take subviews of the
+// mesh in order to represent particular boundaries or the interior of
+// the mesh.
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Timer.hpp>
+#include <cstdio>
+
+typedef Kokkos::View<double***, Kokkos::LayoutRight> mesh_type;
+
+// These View types represent subviews of the mesh.  Some of the Views
+// have layout LayoutStride, meaning that they have run-time "strides"
+// in each dimension which may differ from that dimension's extent.  For
+// example, inner_mesh_type (which represents the interior of the
+// mesh) has to skip over the boundaries when computing its strides;
+// the dimensions of the interior mesh differ from these strides.  You
+// may always safely use a LayoutStride layout when taking a subview
+// of a LayoutRight or LayoutLeft View, but strided accesses may
+// cost a bit more, especially for 1-D Views.
+typedef Kokkos::View<double**, Kokkos::LayoutStride> xz_plane_type;
+typedef Kokkos::View<double**, Kokkos::LayoutRight> yz_plane_type;
+typedef Kokkos::View<double**, Kokkos::LayoutStride> xy_plane_type;
+typedef Kokkos::View<double***, Kokkos::LayoutStride> inner_mesh_type;
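+
+// A small sketch of how a strided subview arises (illustrative only; this
+// helper is not called below). Slicing one dimension of a LayoutRight View
+// with an index pair while keeping ALL() in the others yields a View whose
+// natural layout is LayoutStride.
+inline void subview_layout_sketch () {
+  mesh_type M ("M", 10, 10, 10);
+  // The pair denotes the half-open range [1, 9): index 1 is included, 9 is not.
+  inner_mesh_type Mi = Kokkos::subview (M, Kokkos::ALL (), Kokkos::ALL (),
+                                        Kokkos::pair<int, int> (1, 9));
+  (void) Mi;
+}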
+
+// Functor to set all entries of a boundary of the mesh to a constant
+// value.  The functor is templated on ViewType because different
+// boundaries may have different layouts.
+template<class ViewType>
+struct set_boundary {
+  ViewType a;
+  double value;
+
+  set_boundary (ViewType a_, double value_) :
+    a (a_), value (value_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const typename ViewType::size_type i) const {
+    for (typename ViewType::size_type j = 0; j < a.extent(1); ++j) {
+      a(i,j) = value;
+    }
+  }
+};
+
+// Functor to set all entries of the interior of the mesh to a constant
+// value.  The functor is templated on ViewType because the interior
+// View has a different (strided) layout than the full mesh.
+template<class ViewType>
+struct set_inner {
+  ViewType a;
+  double value;
+
+  set_inner (ViewType a_, double value_) :
+    a (a_), value (value_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator () (const typename ViewType::size_type i) const {
+    typedef typename ViewType::size_type size_type;
+    for (size_type j = 0; j < a.extent(1); ++j) {
+      for (size_type k = 0; k < a.extent(2); ++k) {
+        a(i,j,k) = value;
+      }
+    }
+  }
+};
+
+// Update the interior of the mesh.  This simulates one timestep of a
+// finite-difference method.
+template<class ViewType>
+struct update {
+  ViewType a;
+  const double dt;
+
+  update (ViewType a_, const double dt_) :
+    a (a_), dt (dt_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename ViewType::size_type i) const {
+    typedef typename ViewType::size_type size_type;
+    // Shift the index by one so the loop skips the boundary layer at index 0;
+    // the parallel_for only iterates over the interior extent.
+    i++;
+    for (size_type j = 1; j < a.extent(1)-1; j++) {
+      for (size_type k = 1; k < a.extent(2)-1; k++) {
+        a(i,j,k) += dt* (a(i,j,k+1) - a(i,j,k-1) +
+                         a(i,j+1,k) - a(i,j-1,k) +
+                         a(i+1,j,k) - a(i-1,j,k));
+      }
+    }
+  }
+};
+
+
+int main (int narg, char* arg[]) {
+  using Kokkos::ALL;
+  using Kokkos::pair;
+  using Kokkos::parallel_for;
+  using Kokkos::subview;
+  typedef mesh_type::size_type size_type;
+
+  Kokkos::initialize (narg, arg);
+
+  // The number of mesh points along each dimension of the mesh, not
+  // including boundaries.
+  const size_type size = 100;
+
+  // A is the full cubic 3-D mesh, including the boundaries.
+  mesh_type A ("A", size+2, size+2, size+2);
+  // Ai is the "inner" part of A, _not_ including the boundaries.
+  //
+  // A pair of indices in a particular dimension means the contiguous
+  // zero-based index range in that dimension, including the first
+  // entry of the pair but _not_ including the second entry.
+  inner_mesh_type Ai = subview(A, pair<size_type, size_type> (1, size+1),
+                                  pair<size_type, size_type> (1, size+1),
+                                  pair<size_type, size_type> (1, size+1));
+  // A has six boundaries, one for each face of the cube.
+  // Create a View of each of these boundaries.
+  // ALL() means "select all indices in that dimension."
+  xy_plane_type Zneg_halo = subview(A, ALL (), ALL (), 0);
+  xy_plane_type Zpos_halo = subview(A, ALL (), ALL (), 101);
+  xz_plane_type Yneg_halo = subview(A, ALL (), 0, ALL ());
+  xz_plane_type Ypos_halo = subview(A, ALL (), 101, ALL ());
+  yz_plane_type Xneg_halo = subview(A, 0, ALL (), ALL ());
+  yz_plane_type Xpos_halo = subview(A, 101, ALL (), ALL ());
+
+  // Set the boundaries to their initial conditions.
+  parallel_for (Zneg_halo.extent(0), set_boundary<xy_plane_type> (Zneg_halo,  1));
+  parallel_for (Zpos_halo.extent(0), set_boundary<xy_plane_type> (Zpos_halo, -1));
+  parallel_for (Yneg_halo.extent(0), set_boundary<xz_plane_type> (Yneg_halo,  2));
+  parallel_for (Ypos_halo.extent(0), set_boundary<xz_plane_type> (Ypos_halo, -2));
+  parallel_for (Xneg_halo.extent(0), set_boundary<yz_plane_type> (Xneg_halo,  3));
+  parallel_for (Xpos_halo.extent(0), set_boundary<yz_plane_type> (Xpos_halo, -3));
+
+  // Set the interior of the mesh to its initial condition.
+  parallel_for (Ai.extent(0), set_inner<inner_mesh_type> (Ai, 0));
+
+  // Update the interior of the mesh.
+  // This simulates one timestep with dt = 0.1.
+  parallel_for (Ai.extent(0), update<mesh_type> (A, 0.1));
+
+  printf ("Done\n");
+  Kokkos::finalize ();
+}
+
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..300dab128e45779002cf123d7e7238777abab4d5
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_advancedviews_04_dualviews
+  SOURCES dual_view.cpp
+  COMM serial mpi
+  )
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..e08be5c1dfb219d0737e17d39153f6b1dc05410a
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile
@@ -0,0 +1,58 @@
+KOKKOS_PATH = ../../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Advanced_Views/04_dualviews/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 04_dualviews.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 04_dualviews.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 04_dualviews.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e0fa559587063337f70525e062d5d98c116c0310
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
@@ -0,0 +1,218 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DualView.hpp>
+#include <impl/Kokkos_Timer.hpp>
+#include <cstdio>
+#include <cstdlib>
+
+// DualView helps you manage data and computations that take place on
+// two different memory spaces.  Examples include CUDA device memory
+// and (CPU) host memory (currently implemented), or Intel Knights
+// Landing MCDRAM and DRAM (not yet implemented).  For example, if you
+// have ported only some parts of your application to run in CUDA,
+// DualView can help manage moving data between the parts of your
+// application that work best with CUDA, and the parts that work
+// better on the CPU.
+//
+// A DualView takes the same template parameters as a View, but
+// contains two Views: One that lives in the DualView's memory space,
+// and one that lives in that memory space's host mirror space.  If
+// both memory spaces are the same, then the two Views just alias one
+// another.  This means that you can use DualView all the time, even
+// when not running in a memory space like CUDA.  DualView's
+// operations to help you manage memory take almost no time in that
+// case.  This makes your code even more performance portable.
+
+typedef Kokkos::DualView<double*> view_type;
+typedef Kokkos::DualView<int**> idx_type;
+
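+// A minimal sketch of the canonical DualView workflow (illustrative only; not
+// part of the benchmark below): fill the host side, mark it modified, sync to
+// the device, and then use the device-side View.
+inline void dualview_workflow_sketch () {
+  view_type dv ("Sketch", 10);
+  dv.h_view(0) = 42.0;                          // write on the host
+  dv.modify<view_type::host_mirror_space> ();   // mark the host side as modified
+  dv.sync<view_type::memory_space> ();          // copy host -> device (if needed)
+  view_type::t_dev d = dv.d_view;               // the device-side View
+  (void) d;
+}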
+
+template<class ExecutionSpace>
+struct localsum {
+  // If the functor has a public 'execution_space' typedef, that defines
+  // the functor's execution space (where it runs in parallel).  This
+  // overrides Kokkos' default execution space.
+  typedef ExecutionSpace execution_space;
+
+  typedef typename Kokkos::Impl::if_c<std::is_same<ExecutionSpace,Kokkos::DefaultExecutionSpace>::value ,
+     idx_type::memory_space, idx_type::host_mirror_space>::type memory_space;
+
+  // Get the view types on the particular device for which the functor
+  // is instantiated.
+  //
+  // "const_data_type" is a typedef in View (and DualView) which is
+  // the const version of the first template parameter of the View.
+  // For example, the const_data_type version of double** is const
+  // double**.
+  Kokkos::View<idx_type::const_data_type, idx_type::array_layout, memory_space> idx;
+  // "scalar_array_type" is a typedef in ViewTraits (and DualView) which is the
+  // array version of the value(s) stored in the View.
+  Kokkos::View<view_type::scalar_array_type, view_type::array_layout, memory_space> dest;
+  Kokkos::View<view_type::const_data_type, view_type::array_layout,
+               memory_space, Kokkos::MemoryRandomAccess> src;
+
+  // Constructor takes DualViews, synchronizes them to the functor's memory
+  // space, then marks the destination as modified on that space.
+  localsum (idx_type dv_idx, view_type dv_dest, view_type dv_src)
+  {
+    // Extract the view on the correct Device (i.e., the correct
+    // memory space) from the DualView.  DualView has a template
+    // method, view(), which is templated on the memory space.  If the
+    // DualView has a View from that memory space, view() returns the
+    // View in that space.
+    idx = dv_idx.view<memory_space> ();
+    dest = dv_dest.template view<memory_space> ();
+    src = dv_src.template view<memory_space> ();
+
+    // Synchronize the DualView to the correct Device.
+    //
+    // DualView's sync() method is templated on a memory space, and
+    // synchronizes the DualView in a one-way fashion to that memory
+    // space.  "Synchronizing" means copying, from the other memory
+    // space to the Device memory space.  sync() does _nothing_ if the
+    // Views on the two memory spaces are in sync.  DualView
+    // determines this by the user manually marking one side or the
+    // other as modified; see the modify() call below.
+
+    dv_idx.sync<memory_space> ();
+    dv_dest.template sync<memory_space> ();
+    dv_src.template sync<memory_space> ();
+
+    // Mark dest as modified on Device.
+    dv_dest.template modify<memory_space> ();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int i) const {
+    double tmp = 0.0;
+    for (int j = 0; j < (int) idx.extent(1); ++j) {
+      const double val = src(idx(i,j));
+      tmp += val*val + 0.5*(idx.extent(0)*val -idx.extent(1)*val);
+    }
+    dest(i) += tmp;
+  }
+};
+
+class ParticleType {
+  public:
+    double q;
+    double m;
+    double q_over_m;
+    KOKKOS_INLINE_FUNCTION
+    ParticleType(double q_ = -1, double m_ = 1):
+     q(q_), m(m_), q_over_m(q/m) {}
+protected:
+};
+
+typedef Kokkos::DualView<ParticleType[10]> ParticleTypes;
+int main (int narg, char* arg[]) {
+  Kokkos::initialize (narg, arg);
+
+// If the View's element type is not trivially constructible, add braces so
+// that the Views go out of scope before the Kokkos::finalize() call.
+{
+  ParticleTypes test("Test");
+  Kokkos::fence();
+  test.h_view(0) = ParticleType(-1e4,1);
+  Kokkos::fence();
+
+  int size = 1000000;
+
+  // Create DualViews. This will allocate on both the device and its
+  // host_mirror_space.
+  idx_type idx ("Idx",size,64);
+  view_type dest ("Dest",size);
+  view_type src ("Src",size);
+
+
+  srand (134231);
+
+  // Get a reference to the host view of idx directly (equivalent to
+  // idx.view<idx_type::host_mirror_space>() )
+  idx_type::t_host h_idx = idx.h_view;
+  for (int i = 0; i < size; ++i) {
+    for (view_type::size_type j = 0; j < h_idx.extent(1); ++j) {
+      h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size;
+    }
+  }
+
+  // Mark idx as modified on the host_mirror_space so that a
+  // sync to the device will actually move data.  The sync happens in
+  // the functor's constructor.
+  idx.modify<idx_type::host_mirror_space> ();
+
+  // Run on the device.  This will cause a sync of idx to the device,
+  // since it was marked as modified on the host.
+  Kokkos::Timer timer;
+  Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
+  Kokkos::fence();
+  double sec1_dev = timer.seconds();
+
+  timer.reset();
+  Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
+  Kokkos::fence();
+  double sec2_dev = timer.seconds();
+
+  // Run on the host's default execution space (could be the same as device).
+  // This will cause a sync back to the host of dest.  Note that if the Device is CUDA,
+  // the data layout will not be optimal on host, so performance is
+  // lower than what it would be for a pure host compilation.
+  timer.reset();
+  Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
+  Kokkos::fence();
+  double sec1_host = timer.seconds();
+
+  timer.reset();
+  Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
+  Kokkos::fence();
+  double sec2_host = timer.seconds();
+
+  printf("Device Time with Sync: %f without Sync: %f \n",sec1_dev,sec2_dev);
+  printf("Host   Time with Sync: %f without Sync: %f \n",sec1_host,sec2_host);
+}
+
+  Kokkos::finalize();
+}
+
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f0ed569f9f48a02ebcca091adced52a8c3a1f2ad
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/CMakeLists.txt
@@ -0,0 +1,13 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+IF (Kokkos_ENABLE_Cuda_UVM)
+  # This is a tutorial, not a test, so we don't ask CTest to run it.
+  TRIBITS_ADD_EXECUTABLE(
+    tutorial_advancedviews_05_nvidia_uvm
+    SOURCES uvm_example.cpp
+    COMM serial mpi
+    DEPLIBS kokkoscontainers kokkoscore
+    )
+ENDIF ()
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..ffd81843041e169cdea6f719190ab42ad0f261bc
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile
@@ -0,0 +1,48 @@
+KOKKOS_PATH = ../../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 05_NVIDIA_UVM.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 05_NVIDIA_UVM.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b322dcad358f3c3a987f41d64f4eee65682a6c01
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
@@ -0,0 +1,140 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DualView.hpp>
+#include <impl/Kokkos_Timer.hpp>
+#include <cstdio>
+#include <cstdlib>
+
+#ifdef KOKKOS_ENABLE_CUDA
+typedef Kokkos::View<double*, Kokkos::CudaUVMSpace> view_type;
+typedef Kokkos::View<int**, Kokkos::CudaUVMSpace> idx_type;
+#else
+typedef Kokkos::View<double*,Kokkos::HostSpace> view_type;
+typedef Kokkos::View<int**,Kokkos::HostSpace> idx_type;
+#endif
+
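+// For contrast, a minimal sketch (illustrative only; this helper is not called
+// below) of the non-UVM pattern: an explicit host mirror plus deep_copy, which
+// the UVM memory space selected above makes unnecessary.
+inline void mirror_copy_sketch () {
+  Kokkos::View<double*> d_data ("DData", 10);  // default (device) memory space
+  // create_mirror always allocates a separate host copy, unlike create_mirror_view.
+  Kokkos::View<double*>::HostMirror h_data = Kokkos::create_mirror (d_data);
+  h_data(0) = 1.0;                     // fill on the host
+  Kokkos::deep_copy (d_data, h_data);  // host -> device
+  Kokkos::deep_copy (h_data, d_data);  // device -> host
+}
+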
+template<class Device>
+struct localsum {
+  // Define the execution space for the functor (overrides the DefaultExecutionSpace)
+  typedef Device execution_space;
+
+  // Get the view types on the particular device the functor is instantiated for
+  idx_type::const_type idx;
+  view_type dest;
+  Kokkos::View<view_type::const_data_type, view_type::array_layout, view_type::device_type, Kokkos::MemoryRandomAccess > src;
+
+  localsum(idx_type idx_, view_type dest_,
+      view_type src_):idx(idx_),dest(dest_),src(src_) {
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i) const {
+    double tmp = 0.0;
+    for(int j = 0; j < int(idx.extent(1)); j++) {
+      const double val = src(idx(i,j));
+      tmp += val*val + 0.5*(idx.extent(0)*val -idx.extent(1)*val);
+    }
+    dest(i) += tmp;
+  }
+};
+
+int main(int narg, char* arg[]) {
+  Kokkos::initialize(narg,arg);
+
+  int size = 1000000;
+
+  // Create Views
+  idx_type idx("Idx",size,64);
+  view_type dest("Dest",size);
+  view_type src("Src",size);
+
+  srand(134231);
+
+  Kokkos::fence();
+
+  // When using UVM, Cuda Views can be accessed directly on the host.
+  for(int i=0; i<size; i++) {
+    for(int j=0; j<int(idx.extent(1)); j++)
+      idx(i,j) = (size + i + (rand()%500 - 250))%size;
+  }
+
+  Kokkos::fence();
+  // Run on the device.
+  // This will cause the pages of idx to be migrated to the device, since idx was modified on the host.
+  Kokkos::Timer timer;
+  Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
+  Kokkos::fence();
+  double sec1_dev = timer.seconds();
+
+  // No data transfer will happen now, since nothing is accessed on the host
+  timer.reset();
+  Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
+  Kokkos::fence();
+  double sec2_dev = timer.seconds();
+
+  // Run on the host.
+  // This will cause dest, which was changed on the device, to be migrated back to the host.
+  // Compare the runtime here with the dual_view example: dest is copied back in 4k pages
+  // as they are accessed for the first time during the parallel_for. Due to the latency of each
+  // small transfer, this gives lower effective bandwidth than the bulk copy done via DualView.
+  timer.reset();
+  Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
+  Kokkos::fence();
+  double sec1_host = timer.seconds();
+
+  // No data transfers will happen now
+  timer.reset();
+  Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
+  Kokkos::fence();
+  double sec2_host = timer.seconds();
+
+
+
+  printf("Device Time with Sync: %e without Sync: %e \n",sec1_dev,sec2_dev);
+  printf("Host   Time with Sync: %e without Sync: %e \n",sec1_host,sec2_host);
+
+  Kokkos::finalize();
+}
+
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..725d0de0e29be0bf89ebcca51828c65eeecaa1a1
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile
@@ -0,0 +1,58 @@
+KOKKOS_PATH = ../../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 06_AtomicViews.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 06_AtomicViews.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 06_AtomicViews.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..8983b46d600eb62b19e6f6b6e661212106f1c509
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile
@@ -0,0 +1,48 @@
+KOKKOS_PATH = ../../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3 --default-stream per-thread 
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 07_Overlapping_DeepCopy.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 07_Overlapping_DeepCopy.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp b/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ad1503642bf348a8447dd6320e56ea26b11ed016
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
@@ -0,0 +1,148 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <typeinfo>
+#include <cmath>
+#include <impl/Kokkos_Timer.hpp>
+
+struct FillDevice {
+  double value;
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> a;
+  FillDevice(const double& val, const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace>& d_a):
+     value(val),a(d_a){}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    a(i) = value;
+  }
+};
+
+struct ComputeADevice {
+  int iter;
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> a;
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> b;
+  ComputeADevice(const int& iter_,
+                 const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace>& d_a,
+                 const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace>& d_b):
+     iter(iter_),a(d_a),b(d_b){}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    for(int j=1;j<iter;j++) {
+      a(i) += std::pow(b(i),1.0+1.0/iter);
+    }
+  }
+};
+
+struct ComputeAHost {
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace> a;
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace> b;
+  ComputeAHost(  const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace>& d_a,
+                 const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace>& d_b):
+     a(d_a),b(d_b){}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    a(i) += b(i);
+  }
+};
+
+struct MergeDevice {
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> a;
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> b;
+  MergeDevice(
+                 const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace>& d_a,
+                 const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace>& d_b):
+     a(d_a),b(d_b){}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    a(i) += b(i);
+  }
+};
+
+int main(int argc, char * argv[]) {
+  int size = 100000000;
+  Kokkos::initialize();
+  // Select the synchronization mode from the command line (default: 0 = no copy).
+  const int synch = (argc > 1) ? atoi(argv[1]) : 0;
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> d_a("Device A",size);
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> d_b("Device B",size);
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> d_tmp("Device tmp",size);
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace> h_a("Host A",size);
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace> h_b("Host B",size);
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),FillDevice(0.0,d_a));
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),FillDevice(1.3513,d_b));
+  Kokkos::fence();
+  Kokkos::Timer timer;
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),ComputeADevice(20,d_a,d_b));
+
+  // synch==1: deep_copy on the OpenMP execution space instance; it does not fence
+  //           the device, so the copy can overlap with the Cuda kernel launched above.
+  // synch==2: standard blocking deep_copy, which fences before copying.
+  if(synch==1)
+    Kokkos::deep_copy(Kokkos::OpenMP(),h_b,d_b);
+  if(synch==2)
+    Kokkos::deep_copy(h_b,d_b);
+
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::OpenMP>(0,size),[=] (const int& i) {
+    h_a(i) = 0.0;
+  });
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::OpenMP>(0,size),ComputeAHost(h_a,h_b));
+  Kokkos::OpenMP::fence();
+  if(synch==1)
+    Kokkos::deep_copy(Kokkos::OpenMP(), d_tmp,h_a);
+  if(synch==2)
+    Kokkos::deep_copy(d_tmp,h_a);
+  Kokkos::fence();
+
+  std::cout << "Time " << timer.seconds() << std::endl;
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),MergeDevice(d_a,d_tmp));
+
+  Kokkos::deep_copy(h_a,d_a);
+  std::cout << "h_a(0): " << h_a(0) << " ( Correct: 27.4154 )" << std::endl;
+  Kokkos::finalize();
+}
+
+
+
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/CMakeLists.txt b/packages/kokkos/example/tutorial/Advanced_Views/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f4f1addc5553d9ce7131456f02af664554757daa
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/CMakeLists.txt
@@ -0,0 +1,9 @@
+
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_data_layouts)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(02_memory_traits)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(03_subviews)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(04_dualviews)
+
+IF (Kokkos_ENABLE_Cuda_UVM)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(05_NVIDIA_UVM)
+ENDIF ()
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/Makefile b/packages/kokkos/example/tutorial/Advanced_Views/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..12ac5652e5798c11f2285e4294fcc88ce771093e
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Advanced_Views/Makefile
@@ -0,0 +1,123 @@
+ifndef KOKKOS_PATH
+  MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+  KOKKOS_PATH = $(subst Makefile,,$(MAKEFILE_PATH))../../..
+endif
+
+ifndef KOKKOS_SETTINGS
+  KOKKOS_SETTINGS = "KOKKOS_PATH=${KOKKOS_PATH}"
+  ifdef KOKKOS_ARCH
+    KOKKOS_SETTINGS += "KOKKOS_ARCH=${KOKKOS_ARCH}"
+  endif
+  ifdef KOKKOS_DEVICES
+    KOKKOS_SETTINGS += "KOKKOS_DEVICES=${KOKKOS_DEVICES}"
+  endif
+  ifdef KOKKOS_OPTIONS
+    KOKKOS_SETTINGS += "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}"
+  endif
+  ifdef KOKKOS_CUDA_OPTIONS
+    KOKKOS_SETTINGS += "KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPTIONS}"
+  endif
+endif
+
+build:
+	mkdir -p 01_data_layouts
+	cd ./01_data_layouts; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p 02_memory_traits
+	cd ./02_memory_traits; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p 03_subviews
+	cd ./03_subviews; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p 04_dualviews
+	cd ./04_dualviews; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p 05_NVIDIA_UVM
+	cd ./05_NVIDIA_UVM; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS}
+	#mkdir -p 06_AtomicViews
+	#cd ./06_AtomicViews; \
+	#$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS}
+	#mkdir -p 07_Overlapping_DeepCopy
+	#cd ./07_Overlapping_DeepCopy; \
+	#$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS}
+
+build-insource:
+	cd ./01_data_layouts; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./02_memory_traits; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./03_subviews; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./04_dualviews; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./05_NVIDIA_UVM; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	#cd ./06_AtomicViews; \
+	#$(MAKE) build ${KOKKOS_SETTINGS}
+	#cd ./07_Overlapping_DeepCopy; \
+	#$(MAKE) build ${KOKKOS_SETTINGS}
+
+test:
+	cd ./01_data_layouts; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS}
+	cd ./02_memory_traits; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS}
+	cd ./03_subviews; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS}
+	cd ./04_dualviews; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS}
+	cd ./05_NVIDIA_UVM; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS}
+	#cd ./06_AtomicViews; \
+	#$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS}
+	#cd ./07_Overlapping_DeepCopy; \
+	#$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS}
+
+test-insource:
+	cd ./01_data_layouts; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./02_memory_traits; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./03_subviews; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./04_dualviews; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./05_NVIDIA_UVM; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	#cd ./06_AtomicViews; \
+	#$(MAKE) test ${KOKKOS_SETTINGS}
+	#cd ./07_Overlapping_DeepCopy; \
+	#$(MAKE) test ${KOKKOS_SETTINGS}
+
+clean:
+	cd ./01_data_layouts; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS}
+	cd ./02_memory_traits; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS}
+	cd ./03_subviews; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS}
+	cd ./04_dualviews; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS}
+	cd ./05_NVIDIA_UVM; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS}
+	#cd ./06_AtomicViews; \
+	#$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS}
+	#cd ./07_Overlapping_DeepCopy; \
+	#$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS}
+
+clean-insource:
+	cd ./01_data_layouts; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./02_memory_traits; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./03_subviews; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./04_dualviews; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./05_NVIDIA_UVM; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	#cd ./06_AtomicViews; \
+	#$(MAKE) clean ${KOKKOS_SETTINGS}
+	#cd ./07_Overlapping_DeepCopy; \
+	#$(MAKE) clean ${KOKKOS_SETTINGS}
diff --git a/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile b/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..386a87474d38affd7498d70b14713ffe746db252
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile
@@ -0,0 +1,58 @@
+KOKKOS_PATH = ../../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Algorithms/01_random_numbers/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 01_random_numbers.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 01_random_numbers.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 01_random_numbers.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp b/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..48b0d86cb84d9f32d262efb7488fc01f43e52e6e
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
@@ -0,0 +1,154 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Random.hpp>
+#include <Kokkos_DualView.hpp>
+#include <impl/Kokkos_Timer.hpp>
+#include <cstdio>
+#include <cstdlib>
+
+typedef Kokkos::HostSpace::execution_space DefaultHostType;
+
+// Kokkos provides two different random number generators, with a 64-bit and a 1024-bit state.
+// These generators are based on Vigna, Sebastiano (2014). "An experimental exploration of Marsaglia's xorshift generators, scrambled"
+// See: http://arxiv.org/abs/1402.6246
+// The generators can be used fully independently on each thread and have been tested to
+// produce good statistics for both inter- and intra-thread numbers.
+// Note that within a kernel NO random number operations are (team) collective operations.
+// Everything can be called within branches. This differs from the cuRAND library, where
+// certain operations are required to be called by all threads in a block.
+//
+// In Kokkos you are required to create a pool of generator states, so that threads can
+// grab their own. On CPU architectures the pool size is equal to the number of threads;
+// on CUDA about 128k states are generated (enough to give every potentially simultaneously
+// running thread its own state). Within a kernel a thread is required to acquire a state from
+// the pool and later return it.
+// On CPUs the random number generator is deterministic if using the same number of threads.
+// On GPUs (i.e. using the CUDA backend) it is not deterministic, because threads acquire states
+// via atomics.
+
+// A Functor for generating uint64_t random numbers templated on the GeneratorPool type
+template<class GeneratorPool>
+struct generate_random {
+
+
+  // Output View for the random numbers
+  Kokkos::View<uint64_t*> vals;
+  
+  // The GeneratorPool
+  GeneratorPool rand_pool;
+  
+  int samples;
+
+  // Initialize all members
+  generate_random(Kokkos::View<uint64_t*> vals_,
+                       GeneratorPool rand_pool_,
+                       int samples_):
+                       vals(vals_),rand_pool(rand_pool_),samples(samples_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i) const {
+    // Get a random number state from the pool for the active thread
+    typename GeneratorPool::generator_type rand_gen = rand_pool.get_state();
+
+    // Draw `samples` random numbers from the pool as urand64, between 0 and rand_pool.MAX_URAND64.
+    // Note that there are function calls to get other scalar types, to specify
+    // ranges, or to get a normally distributed float.
+    for(int k = 0;k<samples;k++)
+      vals(i*samples+k) = rand_gen.urand64();
+
+    // Give the state back, which will allow another thread to acquire it
+    rand_pool.free_state(rand_gen);
+  }
+};
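+
+// A brief sketch of the other draw functions mentioned above (illustrative
+// only; not used by the benchmark below). It uses a host-side pool so that it
+// can run in ordinary host code.
+inline void other_draws_sketch () {
+  Kokkos::Random_XorShift64_Pool<DefaultHostType> pool (12345);
+  Kokkos::Random_XorShift64_Pool<DefaultHostType>::generator_type gen = pool.get_state ();
+  const double u = gen.drand ();    // uniform double in [0,1)
+  const int    r = gen.rand (100);  // uniform int in [0,100)
+  const double n = gen.normal ();   // normally distributed double
+  pool.free_state (gen);
+  (void) u; (void) r; (void) n;
+}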
+
+
+
+
+int main(int argc, char* args[]) {
+  if (argc != 3){
+	printf("Please pass two integers on the command line\n");
+  }
+  else {
+
+  // Initialize Kokkos
+  Kokkos::initialize(argc,args);
+  int size = atoi(args[1]);
+  int samples = atoi(args[2]);
+
+  // Create two random number generator pools, one for 64-bit states and one for 1024-bit states.
+  // Both take a 64-bit unsigned integer seed to initialize a Random_XorShift64 generator, which
+  // is used to fill the generators of the pool.
+  Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857);
+  Kokkos::Random_XorShift1024_Pool<> rand_pool1024(5374857);
+  Kokkos::DualView<uint64_t*> vals("Vals",size*samples);
+
+  // Run some performance comparisons
+  Kokkos::Timer timer;
+  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples));
+  Kokkos::fence();
+
+  timer.reset();
+  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples));
+  Kokkos::fence();
+  double time_64 = timer.seconds();
+
+  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift1024_Pool<> >(vals.d_view,rand_pool1024,samples));
+  Kokkos::fence();
+
+  timer.reset();
+  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift1024_Pool<> >(vals.d_view,rand_pool1024,samples));
+  Kokkos::fence();
+  double time_1024 = timer.seconds();
+
+  printf("#Time XorShift64*:   %e %e\n",time_64,1.0e-9*samples*size/time_64 );
+  printf("#Time XorShift1024*: %e %e\n",time_1024,1.0e-9*samples*size/time_1024 );
+
+  Kokkos::deep_copy(vals.h_view,vals.d_view);
+
+  Kokkos::finalize();
+  }
+  return 0;
+}
+
+
diff --git a/packages/kokkos/example/tutorial/Algorithms/Makefile b/packages/kokkos/example/tutorial/Algorithms/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..4e70ba7d976fe5e364049bd46eae3b7f2c9b1153
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Algorithms/Makefile
@@ -0,0 +1,43 @@
+ifndef KOKKOS_PATH
+  MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+  KOKKOS_PATH = $(subst Makefile,,$(MAKEFILE_PATH))../../..
+endif
+
+ifndef KOKKOS_SETTINGS
+  KOKKOS_SETTINGS = "KOKKOS_PATH=${KOKKOS_PATH}"
+  ifdef KOKKOS_ARCH
+    KOKKOS_SETTINGS += "KOKKOS_ARCH=${KOKKOS_ARCH}"
+  endif
+  ifdef KOKKOS_DEVICES
+    KOKKOS_SETTINGS += "KOKKOS_DEVICES=${KOKKOS_DEVICES}"
+  endif
+  ifdef KOKKOS_OPTIONS
+    KOKKOS_SETTINGS += "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}"
+  endif
+  ifdef KOKKOS_CUDA_OPTIONS
+    KOKKOS_SETTINGS += "KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPTIONS}"
+  endif
+endif
+
+build:
+	mkdir -p 01_random_numbers
+	cd ./01_random_numbers; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS}
+
+build-insource:
+	cd ./01_random_numbers; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+test:
+	cd ./01_random_numbers; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS}
+
+test-insource:
+	cd ./01_random_numbers; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+clean:
+	cd ./01_random_numbers; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS}
+
+clean-insource:
+	cd ./01_random_numbers; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
diff --git a/packages/kokkos/example/tutorial/CMakeLists.txt b/packages/kokkos/example/tutorial/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..613e460cad3c9762f1fb754693d4cd287c7b7a1e
--- /dev/null
+++ b/packages/kokkos/example/tutorial/CMakeLists.txt
@@ -0,0 +1,18 @@
+
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_hello_world)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(02_simple_reduce)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(03_simple_view)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(04_simple_memoryspaces)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(05_simple_atomics)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(06_simple_mdrangepolicy)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(Advanced_Views)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(Hierarchical_Parallelism)
+
+IF (Kokkos_ENABLE_CXX11)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_hello_world_lambda)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(02_simple_reduce_lambda)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(03_simple_view_lambda)
+ENDIF ()
+
+
+
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2d8a514a4549aad63f735721b41e47516a570070
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_hierarchicalparallelism_01_thread_teams
+  SOURCES thread_teams.cpp
+  COMM serial mpi
+  )
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..7282abc30cd4632d2aaa4e218286c6e65f1c4fa9
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile
@@ -0,0 +1,58 @@
+KOKKOS_PATH = ../../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 01_thread_teams.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 01_thread_teams.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 01_thread_teams.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ff3002e645d265765fa38b437fbe0711d5408d78
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp
@@ -0,0 +1,94 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+// Using the default execution space, define a TeamPolicy and its member_type.
+// The member_type is what the operator of a functor or lambda receives. For
+// a simple RangePolicy the member_type is simply an integer; for a TeamPolicy
+// it is a much richer object, since it provides all information needed to
+// identify a thread uniquely and offers team-related function calls such as a
+// barrier (which will be used in a subsequent example).
+// A thread team consists of 1 to n threads, where the maximum value of n is
+// determined by the hardware. On a dual-socket CPU machine with 8 cores per
+// socket the maximum size of a team is 8. The number of teams (i.e. the
+// league_size) is not limited by physical constraints; it is a purely logical number.
+
+typedef Kokkos::TeamPolicy<>              team_policy ;
+typedef team_policy::member_type team_member ;
+
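+// For comparison, an illustrative sketch (not used below) of how the reduction
+// operators for the two policies differ only in their first argument:
+//
+//   // RangePolicy: the "member" is just the loop index
+//   KOKKOS_INLINE_FUNCTION void operator()(const int i, int& sum) const;
+//
+//   // TeamPolicy: the member describes the calling thread and its team
+//   KOKKOS_INLINE_FUNCTION void operator()(const team_member& thread, int& sum) const;
+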
+// Define a functor which can be launched using the TeamPolicy
+struct hello_world {
+  typedef int value_type; //Specify value type for reduction target, sum
+
+  // This is a reduction operator which now takes as first argument the
+  // TeamPolicy member_type. Every member of the team contributes to the
+  // total sum.
+  // It is helpful to think of this operator as a parallel region for a team
+  // (i.e. every team member is active and will execute the code).
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const team_member & thread, int& sum) const {
+    sum+=1;
+    // The TeamPolicy<>::member_type provides functions to query the
+    // multidimensional index of a thread as well as the number of thread
+    // teams and the size of each team.
+    printf("Hello World: %i %i // %i %i\n",thread.league_rank(),thread.team_rank(),thread.league_size(),thread.team_size());
+  }
+};
+
+int main(int narg, char* args[]) {
+  Kokkos::initialize(narg,args);
+
+  // Launch 12 teams of the maximum number of threads per team
+  const team_policy policy( 12 , team_policy::team_size_max( hello_world() ) );
+  
+  int sum = 0;
+  Kokkos::parallel_reduce( policy , hello_world() , sum );
+
+  // The result will be 12*team_policy::team_size_max( hello_world())
+  printf("Result %i\n",sum);
+
+  Kokkos::finalize();
+}
+
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ec7f1e1159fcf7f12209defea154c494fb48540e
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/CMakeLists.txt
@@ -0,0 +1,13 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+IF (Kokkos_ENABLE_CXX11)
+  # This is a tutorial, not a test, so we don't ask CTest to run it.
+  TRIBITS_ADD_EXECUTABLE(
+    tutorial_hierarchical_01_thread_teams_lambda
+    SOURCES thread_teams_lambda.cpp
+    COMM serial mpi
+    )
+ENDIF ()
+
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..4049dbde34729191e59940e2a5ddaba92ff4d99b
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile
@@ -0,0 +1,59 @@
+KOKKOS_PATH = ../../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 01_thread_teams_lambda.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+KOKKOS_CUDA_OPTIONS += "enable_lambda"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 01_thread_teams_lambda.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 01_thread_teams_lambda.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d936699a5f9b165600283a193129894f4d6e8ba4
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
@@ -0,0 +1,97 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+// Demonstrate a parallel reduction using thread teams (TeamPolicy).
+//
+// A thread team consists of 1 to n threads.  The hardware determines
+// the maximum value of n. On a dual-socket CPU machine with 8 cores
+// per socket, the maximum size of a team is 8. The number of teams
+// (the league_size) is not limited by physical constraints (up to
+// some reasonable bound, which eventually depends upon the hardware
+// and programming model implementation).
+
+int main (int narg, char* args[]) {
+  using Kokkos::parallel_reduce;
+  typedef Kokkos::TeamPolicy<>               team_policy;
+  typedef typename team_policy::member_type  team_member;
+
+  Kokkos::initialize (narg, args);
+
+  // Set up a policy that launches 12 teams, with the maximum number
+  // of threads per team.
+
+  const team_policy policy (12, Kokkos::AUTO);
+
+  // This is a reduction with a team policy.  The team policy changes
+  // the first argument of the lambda.  Rather than an integer index
+  // (as with RangePolicy), it's now TeamPolicy::member_type.  This
+  // object provides all information to identify a thread uniquely.
+  // It also provides some team-related function calls such as a team
+  // barrier (which a subsequent example will use).
+  //
+  // Every member of the team contributes to the total sum.  It is
+  // helpful to think of the lambda's body as a "team parallel
+  // region."  That is, every team member is active and will execute
+  // the body of the lambda.
+  int sum = 0;
+  // We also need to protect the usage of a lambda against compiling
+  // with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
+  #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+  parallel_reduce (policy, KOKKOS_LAMBDA (const team_member& thread, int& lsum) {
+      lsum += 1;
+      // TeamPolicy<>::member_type provides functions to query the
+      // multidimensional index of a thread, as well as the number of
+      // thread teams and the size of each team.
+      printf ("Hello World: %i %i // %i %i\n", thread.league_rank (),
+              thread.team_rank (), thread.league_size (), thread.team_size ());
+    }, sum);
+  #endif
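+  // Note: if KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA is not defined, the
+  // parallel_reduce above is skipped entirely and sum remains 0.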
+  // The result will be 12 times the team size chosen for the policy
+  // (Kokkos::AUTO lets Kokkos pick it).
+  printf ("Result %i\n",sum);
+
+  Kokkos::finalize ();
+}
+
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e660405345167858b985261362d6135d5e6d5c4d
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_hierarchicalparallelism_02_nested_parallel_for
+  SOURCES nested_parallel_for.cpp
+  COMM serial mpi
+  )
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..fe882f36b8ed21f8565bdafd36cb855234b9d014
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile
@@ -0,0 +1,58 @@
+KOKKOS_PATH = ../../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 02_nested_parallel_for.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 02_nested_parallel_for.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 02_nested_parallel_for.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..721aab2d3e382fcb92f51f1845b9c7aaa21b0d5e
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp
@@ -0,0 +1,89 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+// See 01_thread_teams for an explanation of a basic TeamPolicy
+typedef Kokkos::TeamPolicy<>              team_policy ;
+typedef typename team_policy::member_type team_member ;
+
+struct hello_world {
+  typedef int value_type; //Specify value type for reduction target, sum
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const team_member & thread, int& sum) const {
+    sum+=1;
+    // When using a TeamPolicy, Kokkos allows nested parallel loops.
+    // All three Kokkos parallel patterns are allowed (for, reduce, scan), and they
+    // largely follow the same syntax as on the global level.
+    // The execution policy for thread-level nesting (the vector level is covered in the
+    // next tutorial example) is Kokkos::TeamThreadRange. This means the loop will be
+    // executed by all members of the team, with the loop count split between the threads
+    // of the team. Its arguments are the team_member and a loop count.
+    // Not every thread will do the same number of iterations. On a GPU, for example, with
+    // a team_size() larger than 31, only the first 31 threads would actually do anything.
+    // On a CPU with 8 threads, 7 threads would execute 4 loop iterations each and 1 thread
+    // would do 3. Note also that how the count is split is architecture dependent, similar
+    // to what the RangePolicy does at the global level.
+    // The call itself is not guaranteed to be synchronous. Also keep in mind that the
+    // operator using a team_policy acts like a parallel region for the team: everything
+    // outside of the nested parallel_for is also executed by all threads of the team.
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(thread,31), [&] (const int& i) {
+       printf("Hello World: (%i , %i) executed loop %i \n",thread.league_rank(),thread.team_rank(),i);
+    });
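+    // In total each team executes the 31 iterations exactly once: the printf above
+    // therefore runs 31 times per team, with the iterations divided among its threads.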
+  }
+};
+
+int main(int narg, char* args[]) {
+  Kokkos::initialize(narg,args);
+
+  // Launch 3 teams of the maximum number of threads per team
+  const team_policy policy( 3 , team_policy::team_size_max( hello_world() ) );
+  
+  int sum = 0;
+  Kokkos::parallel_reduce( policy , hello_world() , sum );
+  printf("Result %i\n",sum);
+
+  Kokkos::finalize();
+}
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ea6b0b1e42694c2b0b5994b54309e19647a09e5f
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/CMakeLists.txt
@@ -0,0 +1,16 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+
+IF(Kokkos_ENABLE_CXX11)
+
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_hierarchicalparallelism_03_vectorization
+  SOURCES vectorization.cpp
+  COMM serial mpi
+  )
+
+ENDIF()
+
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..4481889cdb020198e893fabd25182a15fce28bfe
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile
@@ -0,0 +1,58 @@
+KOKKOS_PATH = ../../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 03_vectorization.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 03_vectorization.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 03_vectorization.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1c6491cafc3f264d22841e0d8385a04503381692
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp
@@ -0,0 +1,160 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Random.hpp>
+#include <cstdio>
+
+
+// The TeamPolicy actually supports 3D parallelism: teams, threads, and vector lanes.
+// Kokkos::parallel_{for/reduce/scan} calls can be freely nested.
+// The execution policies for the nested layers are TeamThreadRange and
+// ThreadVectorRange.
+// The only restriction on nesting is that a given level can only be nested inside a
+// higher one: e.g. a ThreadVectorRange can be nested inside a TeamPolicy operator
+// and inside a TeamThreadRange, but you cannot nest a ThreadVectorRange or a
+// TeamThreadRange inside another ThreadVectorRange.
+// As with the 2D execution of TeamPolicy, the operator has to be considered a
+// parallel region even with respect to vector lanes. That means that, even outside
+// a TeamThreadRange or ThreadVectorRange loop, all threads of a team and all vector
+// lanes of a thread execute every line of the operator as long as there are no
+// restrictions on them.
+// Individual code lines can be restricted with Kokkos::single to execute only once
+// per thread (PerThread) or once per team (PerTeam).
+typedef typename Kokkos::TeamPolicy<>::member_type team_member ;
+
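+// As an illustrative sketch (assuming a TeamPolicy operator with argument
+// "thread" and some loop counts n and m), a legal nesting looks like:
+//
+//   Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, n), [=] (const int j) {
+//     Kokkos::parallel_for(Kokkos::ThreadVectorRange(thread, m), [=] (const int k) {
+//       /* per vector-lane work */
+//     });
+//     Kokkos::single(Kokkos::PerThread(thread), [=] () { /* once per thread */ });
+//   });
+//   Kokkos::single(Kokkos::PerTeam(thread), [=] () { /* once per team */ });
+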
+struct SomeCorrelation {
+  typedef int value_type; //Specify value type for reduction target, sum
+  typedef Kokkos::DefaultExecutionSpace::scratch_memory_space shared_space;
+  typedef Kokkos::View<int*,shared_space,Kokkos::MemoryUnmanaged> shared_1d_int;
+
+  Kokkos::View<const int***,Kokkos::LayoutRight> data;
+  Kokkos::View<int> gsum;
+
+  SomeCorrelation(Kokkos::View<int***,Kokkos::LayoutRight> data_in,
+                  Kokkos::View<int> sum):data(data_in),gsum(sum){}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const team_member & thread) const {
+    int i = thread.league_rank();
+
+    // Allocate a shared array for the team.
+    shared_1d_int count(thread.team_shmem(),data.extent(1));
+
+    // With each team run a parallel_for with its threads
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(thread,data.extent(1)), [=] (const int& j) {
+      int tsum;
+      // Run a vector loop reduction over the inner dimension of data,
+      // counting how many values are multiples of 4.
+      // Every vector lane gets the same reduction value (tsum) back; it is broadcast to all vector lanes.
+      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(thread,data.extent(2)), [=] (const int& k, int & vsum) {
+        vsum+= (data(i,j,k) % 4 == 0)?1:0;
+      },tsum);
+
+      // Make sure only one vector lane adds the reduction value to the shared array, i.e. execute
+      // the next line only once PerThread
+      Kokkos::single(Kokkos::PerThread(thread),[=] () {
+        count(j) = tsum;
+      });
+    });
+
+    // Wait for all threads to finish the parallel_for so that all shared memory writes are done
+    thread.team_barrier();
+
+    // Check with one vector lane from each thread how many consecutive
+    // data segments have the same number of values divisible by 4
+    // The team reduction value is again broadcast to every team member (and every vector lane)
+    int team_sum = 0;
+    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(thread, data.extent(1)-1), [=] (const int& j, int& thread_sum) {
+      // It is not valid to add directly to thread_sum.
+      // Use a single function with broadcast instead:
+      // thread_sum will be used as input to the operator (i.e. it is used to initialize sum),
+      // and the end value of sum will be broadcast to all vector lanes in the thread.
+      Kokkos::single(Kokkos::PerThread(thread),[=] (int& sum) {
+        if(count(j)==count(j+1)) sum++;
+      },thread_sum);
+    },team_sum);
+
+    // Use a single thread and vector lane of the team to add team_sum to the global value
+    Kokkos::single(Kokkos::PerTeam(thread),[=] () {
+      Kokkos::atomic_add(&gsum(),team_sum);
+    });
+  }
+
+  // The functor needs to define how much shared memory it requests given a team_size.
+  size_t team_shmem_size( int team_size ) const {
+    return shared_1d_int::shmem_size(data.extent(1));
+  }
+};
+
+int main(int narg, char* args[]) {
+  Kokkos::initialize(narg,args);
+
+  // Produce some 3D random data (see Algorithms/01_random_numbers for more info)
+  Kokkos::View<int***,Kokkos::LayoutRight> data("Data",512,512,32);
+  Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857);
+  Kokkos::fill_random(data,rand_pool64,100);
+
+  // A global value to put the result in
+  Kokkos::View<int> gsum("Sum");
+
+  // Each team handles a slice of the data.
+  // Set up a TeamPolicy with 512 teams, the maximum number of threads per team, and 16 vector lanes.
+  // Kokkos::AUTO will determine the number of threads per team.
+  // The maximum vector length is hardware dependent, but a smaller vector length than the
+  // hardware allows can always be requested. The vector length must be a power of 2.
+
+  const Kokkos::TeamPolicy<> policy( 512 , Kokkos::AUTO , 16);
+
+  Kokkos::parallel_for( policy , SomeCorrelation(data,gsum) );
+
+  Kokkos::fence();
+
+  // Copy result value back
+  int sum = 0;
+  Kokkos::deep_copy(sum,gsum);
+  printf("Result %i\n",sum);
+
+  Kokkos::finalize();
+}
+
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..15ad5d780340dd0e10c338530f7c88222e742169
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_hierarchicalparallelism_04_team_scan
+  SOURCES team_scan.cpp
+  COMM serial mpi
+  )
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..0f0bcf70def3de1d722bc28be25688f62d6bccd2
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile
@@ -0,0 +1,58 @@
+KOKKOS_PATH = ../../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = 04_team_scan.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = 04_team_scan.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = 04_team_scan.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..951e09bd253540562bcf235149297e082f2fcdfe
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
@@ -0,0 +1,144 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DualView.hpp>
+#include <impl/Kokkos_Timer.hpp>
+#include <cstdio>
+#include <cstdlib>
+
+typedef Kokkos::DefaultExecutionSpace       Device ;
+typedef Kokkos::HostSpace::execution_space  Host ;
+
+typedef Kokkos::TeamPolicy< Device >      team_policy ;
+typedef team_policy::member_type team_member ;
+
+static const int TEAM_SIZE = 16 ;
+
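+// Each team loads one chunk of the data into scratch (shared) memory, builds a
+// local TEAM_SIZE x TEAM_SIZE histogram of consecutive value pairs (2-tuples),
+// and finally accumulates that local histogram into the global one with atomics.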
+struct find_2_tuples {
+  int chunk_size;
+  Kokkos::View<const int*> data;
+  Kokkos::View<int**> histogram;
+
+  find_2_tuples(int chunk_size_, Kokkos::DualView<int*> data_,
+                Kokkos::DualView<int**> histogram_):chunk_size(chunk_size_),
+                data(data_.d_view),histogram(histogram_.d_view) {
+      data_.sync<Device>();
+      histogram_.sync<Device>();
+      histogram_.modify<Device>();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const team_member & dev) const {
+    Kokkos::View<int**,Kokkos::MemoryUnmanaged> l_histogram(dev.team_shmem(),TEAM_SIZE,TEAM_SIZE);
+    Kokkos::View<int*,Kokkos::MemoryUnmanaged> l_data(dev.team_shmem(),chunk_size+1);
+
+    const int i = dev.league_rank() * chunk_size;
+    for(int j = dev.team_rank(); j<chunk_size+1; j+=dev.team_size())
+      l_data(j) = data(i+j);
+
+    for(int k = dev.team_rank(); k < TEAM_SIZE; k+=dev.team_size())
+      for(int l = 0; l < TEAM_SIZE; l++)
+        l_histogram(k,l) = 0;
+    dev.team_barrier();
+
+    for(int j = 0; j<chunk_size; j++) {
+      for(int k = dev.team_rank(); k < TEAM_SIZE; k+=dev.team_size())
+        for(int l = 0; l < TEAM_SIZE; l++) {
+          if((l_data(j) == k) && (l_data(j+1)==l))
+            l_histogram(k,l)++;
+        }
+    }
+
+    for(int k = dev.team_rank(); k < TEAM_SIZE; k+=dev.team_size())
+      for(int l = 0; l < TEAM_SIZE; l++) {
+        Kokkos::atomic_fetch_add(&histogram(k,l),l_histogram(k,l));
+      }
+    dev.team_barrier();
+  }
+  size_t team_shmem_size( int team_size ) const { 
+    return Kokkos::View<int**,Kokkos::MemoryUnmanaged>::shmem_size(TEAM_SIZE,TEAM_SIZE) +
+           Kokkos::View<int*,Kokkos::MemoryUnmanaged>::shmem_size(chunk_size+1);
+  }
+};
+
+int main(int narg, char* args[]) {
+  Kokkos::initialize(narg,args);
+  
+  int chunk_size = 1024;
+  int nchunks = 100000; //1024*1024;
+  Kokkos::DualView<int*> data("data",nchunks*chunk_size+1);
+
+  srand(1231093);
+
+  for(int i = 0; i < (int) data.dimension_0(); i++) {
+    data.h_view(i) = rand()%TEAM_SIZE;
+  }
+  data.modify<Host>();
+  data.sync<Device>();
+
+  Kokkos::DualView<int**> histogram("histogram",TEAM_SIZE,TEAM_SIZE);
+
+
+  Kokkos::Timer timer;
+  // threads/team is automatically limited to maximum supported by the device.
+  Kokkos::parallel_for( team_policy( nchunks , TEAM_SIZE )
+                      , find_2_tuples(chunk_size,data,histogram) );
+  Kokkos::fence();
+  double time = timer.seconds();
+
+  histogram.sync<Host>();
+
+  printf("Time: %f \n\n",time);
+  int sum = 0;
+  for(int k=0; k<TEAM_SIZE; k++) {
+    for(int l=0; l<TEAM_SIZE; l++) {
+      printf("%i ",histogram.h_view(k,l));
+      sum += histogram.h_view(k,l);
+    }
+    printf("\n");
+  }
+  printf("Result: %i %i\n",sum,chunk_size*nchunks);
+  Kokkos::finalize();
+}
+
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/CMakeLists.txt b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e03d7aeb901871aec70c712808dea9c322cd6176
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/CMakeLists.txt
@@ -0,0 +1,8 @@
+
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams)
+
+IF (Kokkos_ENABLE_CXX11)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams_lambda)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(02_nested_parallel_for)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(03_vectorization)
+ENDIF ()
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..4bf6d487ae977ca6bd42e9f5787314bf4fd8bbe7
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile
@@ -0,0 +1,95 @@
+ifndef KOKKOS_PATH
+  MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+  KOKKOS_PATH = $(subst Makefile,,$(MAKEFILE_PATH))../../..
+endif
+
+ifndef KOKKOS_SETTINGS
+  KOKKOS_SETTINGS = "KOKKOS_PATH=${KOKKOS_PATH}"
+  ifdef KOKKOS_ARCH
+    KOKKOS_SETTINGS += "KOKKOS_ARCH=${KOKKOS_ARCH}"
+  endif
+  ifdef KOKKOS_DEVICES
+    KOKKOS_SETTINGS += "KOKKOS_DEVICES=${KOKKOS_DEVICES}"
+  endif
+  ifdef KOKKOS_OPTIONS
+    KOKKOS_SETTINGS += "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}"
+  endif
+  ifdef KOKKOS_CUDA_OPTIONS
+    KOKKOS_SETTINGS += "KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPTIONS}"
+  endif
+endif
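+# KOKKOS_SETTINGS forwards the configuration chosen above (path, architecture,
+# devices, options) to the sub-example Makefiles invoked below.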
+
+build:
+	mkdir -p 01_thread_teams
+	cd ./01_thread_teams; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p 01_thread_teams_lambda
+	cd ./01_thread_teams_lambda; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p 02_nested_parallel_for
+	cd ./02_nested_parallel_for; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p 03_vectorization
+	cd ./03_vectorization; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p 04_team_scan
+	cd ./04_team_scan; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS}
+
+build-insource:
+	cd ./01_thread_teams; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./01_thread_teams_lambda; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./02_nested_parallel_for; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./03_vectorization; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./04_team_scan; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+test:
+	cd ./01_thread_teams; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS}
+	cd ./01_thread_teams_lambda; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS}
+	cd ./02_nested_parallel_for; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS}
+	cd ./03_vectorization; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS}
+	cd ./04_team_scan; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS}
+
+test-insource:
+	cd ./01_thread_teams; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./01_thread_teams_lambda; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./02_nested_parallel_for; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./03_vectorization; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./04_team_scan; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+clean:
+	cd ./01_thread_teams; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS}
+	cd ./01_thread_teams_lambda; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS}
+	cd ./02_nested_parallel_for; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS}
+	cd ./03_vectorization; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS}
+	cd ./04_team_scan; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS}
+
+clean-insource:
+	cd ./01_thread_teams; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./01_thread_teams_lambda; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./02_nested_parallel_for; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./03_vectorization; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./04_team_scan; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
diff --git a/packages/kokkos/example/tutorial/Makefile b/packages/kokkos/example/tutorial/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..7b2732eeedc2c91f5648aeacfb2aa27817e1fae0
--- /dev/null
+++ b/packages/kokkos/example/tutorial/Makefile
@@ -0,0 +1,174 @@
+
+ifndef KOKKOS_PATH
+  MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+  KOKKOS_PATH = $(subst Makefile,,$(MAKEFILE_PATH))../..
+endif
+
+ifndef KOKKOS_SETTINGS
+  KOKKOS_SETTINGS = "KOKKOS_PATH=${KOKKOS_PATH}"
+  ifdef KOKKOS_ARCH
+    KOKKOS_SETTINGS += "KOKKOS_ARCH=${KOKKOS_ARCH}"
+  endif
+  ifdef KOKKOS_DEVICES
+    KOKKOS_SETTINGS += "KOKKOS_DEVICES=${KOKKOS_DEVICES}"
+  endif
+  ifdef KOKKOS_OPTIONS
+    KOKKOS_SETTINGS += "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}"
+  endif
+  ifdef KOKKOS_CUDA_OPTIONS
+    KOKKOS_SETTINGS += "KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPTIONS}"
+  endif
+endif
+
+build:
+	mkdir -p 01_hello_world
+	cd ./01_hello_world; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p 01_hello_world_lambda
+	cd ./01_hello_world_lambda; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p 02_simple_reduce
+	cd ./02_simple_reduce; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p 02_simple_reduce_lambda
+	cd ./02_simple_reduce_lambda; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p 03_simple_view
+	cd ./03_simple_view; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p 03_simple_view_lambda
+	cd ./03_simple_view_lambda; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p 04_simple_memoryspaces
+	cd ./04_simple_memoryspaces; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p 05_simple_atomics
+	cd ./05_simple_atomics; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS}
+	mkdir -p Advanced_Views
+	cd ./Advanced_Views; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+	mkdir -p Algorithms
+	cd ./Algorithms; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+	mkdir -p Hierarchical_Parallelism
+	cd ./Hierarchical_Parallelism; \
+	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+
+build-insource:
+	cd ./01_hello_world; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./01_hello_world_lambda; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./02_simple_reduce; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./02_simple_reduce_lambda; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./03_simple_view; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./03_simple_view_lambda; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./04_simple_memoryspaces; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./05_simple_atomics; \
+	$(MAKE) build ${KOKKOS_SETTINGS}
+	cd ./Advanced_Views; \
+	$(MAKE) build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+	cd ./Algorithms; \
+	$(MAKE) build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+	cd ./Hierarchical_Parallelism; \
+	$(MAKE) build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+test:
+	cd ./01_hello_world; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS}
+	cd ./01_hello_world_lambda; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS}
+	cd ./02_simple_reduce; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS}
+	cd ./02_simple_reduce_lambda; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS}
+	cd ./03_simple_view; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS}
+	cd ./03_simple_view_lambda; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS}
+	cd ./04_simple_memoryspaces; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS}
+	cd ./05_simple_atomics; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS}
+	cd ./Advanced_Views; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+	cd ./Algorithms; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+	cd ./Hierarchical_Parallelism; \
+	$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+
+test-insource:
+	cd ./01_hello_world; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./01_hello_world_lambda; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./02_simple_reduce; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./02_simple_reduce_lambda; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./03_simple_view; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./03_simple_view_lambda; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./04_simple_memoryspaces; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./05_simple_atomics; \
+	$(MAKE) test ${KOKKOS_SETTINGS}
+	cd ./Advanced_Views; \
+	$(MAKE) test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+	cd ./Algorithms; \
+	$(MAKE) test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+	cd ./Hierarchical_Parallelism; \
+	$(MAKE) test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+clean:
+	cd ./01_hello_world; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS}
+	cd ./01_hello_world_lambda; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS}
+	cd ./02_simple_reduce; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS}
+	cd ./02_simple_reduce_lambda; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS}
+	cd ./03_simple_view; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS}
+	cd ./03_simple_view_lambda; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS}
+	cd ./04_simple_memoryspaces; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS}
+	cd ./05_simple_atomics; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS}
+	cd ./Advanced_Views; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+	cd ./Algorithms; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+	cd ./Hierarchical_Parallelism; \
+	$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+
+clean-insource:
+	cd ./01_hello_world; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./01_hello_world_lambda; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./02_simple_reduce; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./02_simple_reduce_lambda; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./03_simple_view; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./03_simple_view_lambda; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./04_simple_memoryspaces; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./05_simple_atomics; \
+	$(MAKE) clean ${KOKKOS_SETTINGS}
+	cd ./Advanced_Views; \
+	$(MAKE) clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+	cd ./Algorithms; \
+	$(MAKE) clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
+	cd ./Hierarchical_Parallelism; \
+	$(MAKE) clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}'
diff --git a/packages/kokkos/example/tutorial/README b/packages/kokkos/example/tutorial/README
new file mode 100644
index 0000000000000000000000000000000000000000..4ba0b3a5d9e15e3c58326559d7a7f30e5b51ea4c
--- /dev/null
+++ b/packages/kokkos/example/tutorial/README
@@ -0,0 +1,17 @@
+Build the examples by typing in each directory: 
+make -j 16
+
+To specify a target device:
+make openmp -j 16
+make pthreads -j 16
+make serial -j 16
+make cuda -j 16
+
+The lambda variants cannot be built with CUDA=yes at the moment, since
+CUDA does not support lambdas from the host.
+Some of the advanced topics try to highlight performance impacts by timing
+different variants of doing the same thing.
+Also, some of the advanced topics (in particular hierarchical parallelism)
+require C++11 even without using host-side lambdas. CUDA 6.5 can be used
+to compile those.
+
diff --git a/packages/kokkos/example/tutorial/launch_bounds/CMakeLists.txt b/packages/kokkos/example/tutorial/launch_bounds/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7c78db840f849fd9625676c6a73e8aa037b52b4d
--- /dev/null
+++ b/packages/kokkos/example/tutorial/launch_bounds/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_launch_bounds_reduce
+  SOURCES launch_bounds_reduce.cpp
+  COMM serial mpi
+  )
diff --git a/packages/kokkos/example/tutorial/launch_bounds/Makefile b/packages/kokkos/example/tutorial/launch_bounds/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..4a1bf17344c1f71193b54a0ae253227ac733164b
--- /dev/null
+++ b/packages/kokkos/example/tutorial/launch_bounds/Makefile
@@ -0,0 +1,66 @@
+KOKKOS_PATH = ../../..
+KOKKOS_SRC_PATH = ${KOKKOS_PATH}
+SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/launch_bounds/*.cpp)
+vpath %.cpp $(sort $(dir $(SRC)))
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS = 
+EXE = launch_bounds.cuda
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =  
+EXE = launch_bounds.host
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
+CXX = /opt/rocm/hcc/bin/clang++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LDFLAGS =
+EXE = launch_bounds.rocm
+KOKKOS_DEVICES = "ROCm"
+KOKKOS_ARCH = "Fiji"
+endif
+
+
+# WAR for "undefined memcpy" w/ Ubuntu + CUDA 7.5
+CXXFLAGS += -D_FORCE_INLINES
+# Additional compile-time information
+CXXFLAGS += -Xptxas=-v
+
+DEPFLAGS = -M
+
+OBJ = $(notdir $(SRC:.cpp=.o))
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+temp:
+	echo $(KOKKOS_INTERNAL_USE_CUDA) $(CUDA_PATH)
+
+build: $(EXE)
+
+test: $(EXE)
+	./$(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host *.rocm
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)
diff --git a/packages/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp b/packages/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5ff979e86e4bf5a930e353173df1a5f7681cc4b7
--- /dev/null
+++ b/packages/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp
@@ -0,0 +1,173 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+//
+// First reduction (parallel_reduce) example:
+//   1. Start up Kokkos
+//   2. Execute a parallel_reduce loop in the default execution space,
+//      using a functor to define the loop body
+//   3. Shut down Kokkos
+//
+struct collision {
+// Reduction functor
+// For each i, we generate 10 hashes and count collisions among them.
+// We use parallel_reduce to count the total number of collisions.
+// Note that we only count collisions within the 10 hashes generated
+// for a single i.
+// This functor was chosen because it is a simple way to drive up the
+// register count.
+  typedef int value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  int hash(int q) const {
+	  // A simple hash by Justin Sobel
+	  // Thanks to Arash Partow (partow.net)
+	  char* fourchars = (char*)&q;
+	  int hash = 1315423911;
+	  for (int i=0; i<4; fourchars++, i++) {
+		  hash ^= ((hash<<5) + *fourchars + (hash >> 2));
+	  }
+	  return hash;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator () (const int i, int& lsum) const {
+	  //This is a silly function which generates 10 hashes
+	  // then checks for collisions
+	  int a = hash(i)%64;
+	  int b = hash(i*3)%64;
+	  int c = hash(i*5)%64;
+	  int d = hash(i*7)%64;
+	  int e = hash(i*11)%64;
+	  int f = hash(i*17)%64;
+	  int g = hash(i*23)%64;
+	  int h = hash(i*29)%64;
+	  int j = hash(i*31)%64;
+	  int k = hash(i*37)%64;
+
+
+	  if (a==b) lsum++;
+	  if (a==c) lsum++;
+	  if (a==d) lsum++;
+	  if (a==e) lsum++;
+	  if (a==f) lsum++;
+	  if (a==g) lsum++;
+	  if (a==h) lsum++;
+	  if (a==j) lsum++;
+	  if (a==k) lsum++;
+	  if (b==c) lsum++;
+	  if (b==d) lsum++;
+	  if (b==e) lsum++;
+	  if (b==f) lsum++;
+	  if (b==g) lsum++;
+	  if (b==h) lsum++;
+	  if (b==j) lsum++;
+	  if (b==k) lsum++;
+	  if (c==d) lsum++;
+	  if (c==e) lsum++;
+	  if (c==f) lsum++;
+	  if (c==g) lsum++;
+	  if (c==h) lsum++;
+	  if (c==j) lsum++;
+	  if (c==k) lsum++;
+	  if (d==e) lsum++;
+	  if (d==f) lsum++;
+	  if (d==g) lsum++;
+	  if (d==h) lsum++;
+	  if (d==j) lsum++;
+	  if (d==k) lsum++;
+	  if (e==f) lsum++;
+	  if (e==g) lsum++;
+	  if (e==h) lsum++;
+	  if (e==j) lsum++;
+	  if (e==k) lsum++;
+	  if (f==g) lsum++;
+	  if (f==h) lsum++;
+	  if (f==j) lsum++;
+	  if (f==k) lsum++;
+	  if (g==h) lsum++;
+	  if (g==j) lsum++;
+	  if (g==k) lsum++;
+	  if (h==j) lsum++;
+	  if (h==k) lsum++;
+	  if (j==k) lsum++;
+  }
+
+
+
+};
+
+int main (int argc, char* argv[]) {
+  Kokkos::initialize (argc, argv);
+  const int n = 10000;
+
+  // Compute and count hash collisions in
+  // parallel, using Kokkos.
+  // This is not really a useful algorithm, but it demonstrates the
+  // LaunchBounds functionality
+  int sum1 = 0;
+  int sum2 = 0;
+  
+  //Without LaunchBounds, the kernel uses 56 registers
+  Kokkos::parallel_reduce (n, collision (), sum1);
+
+  //With LaunchBounds, we can reduce the register usage to 32
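+  //(Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM> corresponds to
+  // CUDA's __launch_bounds__ hint: <512,4> asks the compiler to plan for at
+  // most 512 threads per block and at least 4 resident blocks per SM.)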
+  Kokkos::parallel_reduce (Kokkos::RangePolicy<Kokkos::LaunchBounds<512,4>>(0,n), collision (), sum2);
+
+  printf ("Number of collisions, "
+          "computed in parallel, is %i\n", sum1);
+
+  if (sum1 != sum2) {
+	  printf( "Uh-oh! Results do not match\n");
+	  Kokkos::finalize();
+	  return -1;
+  }
+
+  Kokkos::finalize();
+  
+
+  return 0;
+}
+
diff --git a/packages/kokkos/generate_makefile.bash b/packages/kokkos/generate_makefile.bash
new file mode 100755
index 0000000000000000000000000000000000000000..84b1eea8a84c5c625a6416a6e24bfa10c78ed2c7
--- /dev/null
+++ b/packages/kokkos/generate_makefile.bash
@@ -0,0 +1,486 @@
+#!/bin/bash
+
+KOKKOS_DEVICES=""
+
+KOKKOS_DO_EXAMPLES="1"
+
+while [[ $# -gt 0 ]]
+do
+  key="$1"
+
+  case $key in
+    --kokkos-path*)
+      KOKKOS_PATH="${key#*=}"
+      ;;
+    --qthreads-path*)
+      QTHREADS_PATH="${key#*=}"
+      ;;
+    --prefix*)
+      PREFIX="${key#*=}"
+      ;;
+    --with-cuda)
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda"
+      CUDA_PATH_NVCC=`which nvcc`
+      CUDA_PATH=${CUDA_PATH_NVCC%/bin/nvcc}
+      ;;
+    # Catch this before '--with-cuda*'
+    --with-cuda-options*)
+      KOKKOS_CUDA_OPT="${key#*=}"
+      ;;
+    --with-cuda*)
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda"
+      CUDA_PATH="${key#*=}"
+      ;;
+    --with-rocm)
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},ROCm"
+      ;;
+    --with-openmp)
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},OpenMP"
+      ;;
+    --with-pthread)
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},Pthread"
+      ;;
+    --with-serial)
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},Serial"
+      ;;
+    --with-qthreads*)
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},Qthreads"
+      if [ -z "$QTHREADS_PATH" ]; then
+        QTHREADS_PATH="${key#*=}"
+      fi
+      ;;
+    --with-devices*)
+      DEVICES="${key#*=}"
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},${DEVICES}"
+      ;;
+    --with-gtest*)
+      GTEST_PATH="${key#*=}"
+      ;;
+    --with-hwloc*)
+      HWLOC_PATH="${key#*=}"
+      ;;
+    --with-memkind*)
+      MEMKIND_PATH="${key#*=}"
+      ;;
+    --arch*)
+      KOKKOS_ARCH="${key#*=}"
+      ;;
+    --cxxflags*)
+      CXXFLAGS="${key#*=}"
+      ;;
+    --ldflags*)
+      LDFLAGS="${key#*=}"
+      ;;
+    --debug|-dbg)
+      KOKKOS_DEBUG=yes
+      ;;
+    --make-j*)
+      echo "Warning: ${key} is deprecated"
+      echo "Call make with appropriate -j flag"
+      ;;
+    --no-examples)
+      KOKKOS_DO_EXAMPLES="0"
+      ;;
+    --compiler*)
+      COMPILER="${key#*=}"
+      CNUM=`which ${COMPILER} 2>&1 >/dev/null | grep "no ${COMPILER}" | wc -l`
+      if [ ${CNUM} -gt 0 ]; then
+        echo "Invalid compiler by --compiler command: '${COMPILER}'"
+        exit
+      fi
+      if [[ ! -n  ${COMPILER} ]]; then
+        echo "Empty compiler specified by --compiler command."
+        exit
+      fi
+      CNUM=`which ${COMPILER} | grep ${COMPILER} | wc -l`
+      if [ ${CNUM} -eq 0 ]; then
+        echo "Invalid compiler by --compiler command: '${COMPILER}'"
+        exit
+      fi
+      ;;
+    --with-options*)
+      KOKKOS_OPT="${key#*=}"
+      ;;
+    --help)
+      echo "Kokkos configure options:"
+      echo "--kokkos-path=/Path/To/Kokkos:        Path to the Kokkos root directory."
+      echo "--qthreads-path=/Path/To/Qthreads:    Path to Qthreads install directory."
+      echo "                                        Overrides path given by --with-qthreads."
+      echo "--prefix=/Install/Path:               Path to install the Kokkos library."
+      echo ""
+      echo "--with-cuda[=/Path/To/Cuda]:          Enable Cuda and set path to Cuda Toolkit."
+      echo "--with-openmp:                        Enable OpenMP backend."
+      echo "--with-pthread:                       Enable Pthreads backend."
+      echo "--with-serial:                        Enable Serial backend."
+      echo "--with-qthreads[=/Path/To/Qthreads]:  Enable Qthreads backend."
+      echo "--with-devices:                       Explicitly add a set of backends."
+      echo ""
+      echo "--arch=[OPT]:  Set target architectures. Options are:"
+      echo "               [AMD]"
+      echo "                 AMDAVX          = AMD CPU"
+      echo "               [ARM]"
+      echo "                 ARMv80          = ARMv8.0 Compatible CPU"
+      echo "                 ARMv81          = ARMv8.1 Compatible CPU"
+      echo "                 ARMv8-ThunderX  = ARMv8 Cavium ThunderX CPU"
+      echo "                 ARMv8-TX2       = ARMv8 Cavium ThunderX2 CPU"
+      echo "               [IBM]"
+      echo "                 BGQ             = IBM Blue Gene Q"
+      echo "                 Power7          = IBM POWER7 and POWER7+ CPUs"
+      echo "                 Power8          = IBM POWER8 CPUs"
+      echo "                 Power9          = IBM POWER9 CPUs"
+      echo "               [Intel]"
+      echo "                 WSM             = Intel Westmere CPUs"
+      echo "                 SNB             = Intel Sandy/Ivy Bridge CPUs"
+      echo "                 HSW             = Intel Haswell CPUs"
+      echo "                 BDW             = Intel Broadwell Xeon E-class CPUs"
+      echo "                 SKX             = Intel Sky Lake Xeon E-class HPC CPUs (AVX512)"
+      echo "               [Intel Xeon Phi]"
+      echo "                 KNC             = Intel Knights Corner Xeon Phi"
+      echo "                 KNL             = Intel Knights Landing Xeon Phi"
+      echo "               [NVIDIA]"
+      echo "                 Kepler30        = NVIDIA Kepler generation CC 3.0"
+      echo "                 Kepler32        = NVIDIA Kepler generation CC 3.2"
+      echo "                 Kepler35        = NVIDIA Kepler generation CC 3.5"
+      echo "                 Kepler37        = NVIDIA Kepler generation CC 3.7"
+      echo "                 Maxwell50       = NVIDIA Maxwell generation CC 5.0"
+      echo "                 Maxwell52       = NVIDIA Maxwell generation CC 5.2"
+      echo "                 Maxwell53       = NVIDIA Maxwell generation CC 5.3"
+      echo "                 Pascal60        = NVIDIA Pascal generation CC 6.0"
+      echo "                 Pascal61        = NVIDIA Pascal generation CC 6.1"
+      echo "                 Volta70         = NVIDIA Volta generation CC 7.0"
+      echo "                 Volta72         = NVIDIA Volta generation CC 7.2"
+      echo ""
+      echo "--compiler=/Path/To/Compiler  Set the compiler."
+      echo "--debug,-dbg:                 Enable Debugging."
+      echo "--cxxflags=[FLAGS]            Overwrite CXXFLAGS for library build and test"
+      echo "                                build.  This will still set certain required"
+      echo "                                flags via KOKKOS_CXXFLAGS (such as -fopenmp,"
+      echo "                                --std=c++11, etc.)."
+      echo "--ldflags=[FLAGS]             Overwrite LDFLAGS for library build and test"
+      echo "                                build. This will still set certain required"
+      echo "                                flags via KOKKOS_LDFLAGS (such as -fopenmp,"
+      echo "                                -lpthread, etc.)."
+      echo "--with-gtest=/Path/To/Gtest:  Set path to gtest.  (Used in unit and performance"
+      echo "                                tests.)"
+      echo "--with-hwloc=/Path/To/Hwloc:  Set path to hwloc library."
+      echo "--with-memkind=/Path/To/MemKind:  Set path to memkind library."
+      echo "--with-options=[OPT]:         Additional options to Kokkos:"
+      echo "                                compiler_warnings"
+      echo "                                aggressive_vectorization = add ivdep on loops"
+      echo "                                disable_profiling = do not compile with profiling hooks"
+      echo "                                "
+      echo "--with-cuda-options=[OPT]:    Additional options to CUDA:"
+      echo "                                force_uvm, use_ldg, enable_lambda, rdc"
+      echo "--make-j=[NUM]:               DEPRECATED: call make with appropriate"
+      echo "                                -j flag"
+      exit 0
+      ;;
+    *)
+      echo "warning: ignoring unknown option $key"
+      ;;
+  esac
+
+  shift
+done
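+
+# Example invocation from a separate build directory (hypothetical paths and
+# flags; all options are described by --help above):
+#   ../generate_makefile.bash --with-openmp --arch=SNB --compiler=g++ --prefix=$HOME/kokkos/install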
+
+# Remove leading ',' from KOKKOS_DEVICES.
+KOKKOS_DEVICES=$(echo $KOKKOS_DEVICES | sed 's/^,//')
+
+# If KOKKOS_PATH undefined, assume parent dir of this script is the KOKKOS_PATH.
+if [ -z "$KOKKOS_PATH" ]; then
+  KOKKOS_PATH=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+else
+  # Ensure KOKKOS_PATH is abs path
+  KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
+fi
+
+if [ "${KOKKOS_PATH}"  = "${PWD}" ] || [ "${KOKKOS_PATH}"  = "${PWD}/" ]; then
+  echo "Running generate_makefile.sh in the Kokkos root directory is not allowed"
+  exit
+fi
+
+KOKKOS_SRC_PATH=${KOKKOS_PATH}
+
+KOKKOS_SETTINGS="KOKKOS_SRC_PATH=${KOKKOS_SRC_PATH}"
+#KOKKOS_SETTINGS="KOKKOS_PATH=${KOKKOS_PATH}"
+
+if [ ${#COMPILER} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXX=${COMPILER}"
+fi
+
+if [ ${#KOKKOS_DEVICES} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_DEVICES=${KOKKOS_DEVICES}"
+fi
+
+if [ ${#KOKKOS_ARCH} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_ARCH=${KOKKOS_ARCH}"
+fi
+
+if [ ${#KOKKOS_DEBUG} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_DEBUG=${KOKKOS_DEBUG}"
+fi
+
+if [ ${#CUDA_PATH} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CUDA_PATH=${CUDA_PATH}"
+fi
+
+if [ ${#CXXFLAGS} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXXFLAGS=\"${CXXFLAGS}\""
+fi
+
+if [ ${#LDFLAGS} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} LDFLAGS=\"${LDFLAGS}\""
+fi
+
+if [ ${#GTEST_PATH} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} GTEST_PATH=${GTEST_PATH}"
+else
+  GTEST_PATH=${KOKKOS_PATH}/tpls/gtest
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} GTEST_PATH=${GTEST_PATH}"
+fi
+
+if [ ${#HWLOC_PATH} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} HWLOC_PATH=${HWLOC_PATH}"
+  KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},hwloc"
+fi
+
+if [ ${#MEMKIND_PATH} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} MEMKIND_PATH=${MEMKIND_PATH}" 
+  KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},experimental_memkind"
+fi
+
+if [ ${#KOKKOS_USE_TPLS} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_USE_TPLS=${KOKKOS_USE_TPLS}"
+fi
+
+if [ ${#QTHREADS_PATH} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} QTHREADS_PATH=${QTHREADS_PATH}"
+fi
+
+if [ ${#KOKKOS_OPT} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_OPTIONS=${KOKKOS_OPT}"
+fi
+
+if [ ${#KOKKOS_CUDA_OPT} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPT}"
+fi
+
+KOKKOS_SETTINGS_NO_KOKKOS_PATH="${KOKKOS_SETTINGS}"
+
+KOKKOS_TEST_INSTALL_PATH="${PWD}/install"
+if [ ${#PREFIX} -gt 0 ]; then
+  KOKKOS_INSTALL_PATH="${PREFIX}"
+else
+  KOKKOS_INSTALL_PATH=${KOKKOS_TEST_INSTALL_PATH}
+fi
+
+mkdir -p install
+gen_makefile=Makefile.kokkos
+echo "#Makefile to satisfy existens of target kokkos-clean before installing the library" > install/${gen_makefile}
+echo "kokkos-clean:" >> install/${gen_makefile}
+echo "" >> install/${gen_makefile}
+mkdir -p core
+mkdir -p core/unit_test
+mkdir -p core/perf_test
+mkdir -p containers
+mkdir -p containers/unit_tests
+mkdir -p containers/performance_tests
+mkdir -p algorithms
+mkdir -p algorithms/unit_tests
+mkdir -p algorithms/performance_tests
+mkdir -p example
+mkdir -p example/fixture
+mkdir -p example/feint
+mkdir -p example/fenl
+mkdir -p example/tutorial
+
+if [ ${#KOKKOS_ENABLE_EXAMPLE_ICHOL} -gt 0 ]; then
+  mkdir -p example/ichol
+fi
+
+KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_PATH}"
+
+# Generate subdirectory makefiles.
+echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > core/unit_test/Makefile
+echo "" >> core/unit_test/Makefile
+echo "all:" >> core/unit_test/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS}" >> core/unit_test/Makefile
+echo "" >> core/unit_test/Makefile
+echo "test: all" >> core/unit_test/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} test" >> core/unit_test/Makefile
+echo "" >> core/unit_test/Makefile
+echo "clean:" >> core/unit_test/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/unit_test/Makefile
+
+echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > core/perf_test/Makefile
+echo "" >> core/perf_test/Makefile
+echo "all:" >> core/perf_test/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS}" >> core/perf_test/Makefile
+echo "" >> core/perf_test/Makefile
+echo "test: all" >> core/perf_test/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} test" >> core/perf_test/Makefile
+echo "" >> core/perf_test/Makefile
+echo "clean:" >> core/perf_test/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/perf_test/Makefile
+
+echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > containers/unit_tests/Makefile
+echo "" >> containers/unit_tests/Makefile
+echo "all:" >> containers/unit_tests/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/unit_tests/Makefile
+echo "" >> containers/unit_tests/Makefile
+echo "test: all" >> containers/unit_tests/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/unit_tests/Makefile
+echo "" >> containers/unit_tests/Makefile
+echo "clean:" >> containers/unit_tests/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/unit_tests/Makefile
+
+echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > containers/performance_tests/Makefile
+echo "" >> containers/performance_tests/Makefile
+echo "all:" >> containers/performance_tests/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/performance_tests/Makefile
+echo "" >> containers/performance_tests/Makefile
+echo "test: all" >> containers/performance_tests/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/performance_tests/Makefile
+echo "" >> containers/performance_tests/Makefile
+echo "clean:" >> containers/performance_tests/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/performance_tests/Makefile
+
+echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > algorithms/unit_tests/Makefile
+echo "" >> algorithms/unit_tests/Makefile
+echo "all:" >> algorithms/unit_tests/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> algorithms/unit_tests/Makefile
+echo "" >> algorithms/unit_tests/Makefile
+echo "test: all" >> algorithms/unit_tests/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> algorithms/unit_tests/Makefile
+echo "" >> algorithms/unit_tests/Makefile
+echo "clean:" >> algorithms/unit_tests/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> algorithms/unit_tests/Makefile
+
+KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_TEST_INSTALL_PATH}"
+
+echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/fixture/Makefile
+echo "" >> example/fixture/Makefile
+echo "all:" >> example/fixture/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS}" >> example/fixture/Makefile
+echo "" >> example/fixture/Makefile
+echo "test: all" >> example/fixture/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} test" >> example/fixture/Makefile
+echo "" >> example/fixture/Makefile
+echo "clean:" >> example/fixture/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} clean" >> example/fixture/Makefile
+
+echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/feint/Makefile
+echo "" >> example/feint/Makefile
+echo "all:" >> example/feint/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS}" >> example/feint/Makefile
+echo "" >> example/feint/Makefile
+echo "test: all" >> example/feint/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} test" >> example/feint/Makefile
+echo "" >> example/feint/Makefile
+echo "clean:" >> example/feint/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} clean" >> example/feint/Makefile
+
+echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/fenl/Makefile
+echo "" >> example/fenl/Makefile
+echo "all:" >> example/fenl/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS}" >> example/fenl/Makefile
+echo "" >> example/fenl/Makefile
+echo "test: all" >> example/fenl/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} test" >> example/fenl/Makefile
+echo "" >> example/fenl/Makefile
+echo "clean:" >> example/fenl/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} clean" >> example/fenl/Makefile
+
+echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/tutorial/Makefile
+echo "" >> example/tutorial/Makefile
+echo "build:" >> example/tutorial/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} build">> example/tutorial/Makefile
+echo "" >> example/tutorial/Makefile
+echo "test: build" >> example/tutorial/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} test" >> example/tutorial/Makefile
+echo "" >> example/tutorial/Makefile
+echo "clean:" >> example/tutorial/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} clean" >> example/tutorial/Makefile
+
+if [ ${#KOKKOS_ENABLE_EXAMPLE_ICHOL} -gt 0 ]; then
+echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/ichol/Makefile
+echo "" >> example/ichol/Makefile
+echo "all:" >> example/ichol/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS}" >> example/ichol/Makefile
+echo "" >> example/ichol/Makefile
+echo "test: all" >> example/ichol/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS} test" >> example/ichol/Makefile
+echo "" >> example/ichol/Makefile
+echo "clean:" >> example/ichol/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS} clean" >> example/ichol/Makefile
+fi
+
+KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_PATH}"
+
+# Generate top level directory makefile.
+echo "Generating Makefiles with options " ${KOKKOS_SETTINGS}
+echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > Makefile
+echo "" >> Makefile
+echo "kokkoslib:" >> Makefile
+echo -e "\tcd core; \\" >> Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_INSTALL_PATH} build-lib" >> Makefile
+echo "" >> Makefile
+echo "install: kokkoslib" >> Makefile
+echo -e "\tcd core; \\" >> Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_INSTALL_PATH} install" >> Makefile
+echo "" >> Makefile
+echo "kokkoslib-test:" >> Makefile
+echo -e "\tcd core; \\" >> Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_TEST_INSTALL_PATH} build-lib" >> Makefile
+echo "" >> Makefile
+echo "install-test: kokkoslib-test" >> Makefile
+echo -e "\tcd core; \\" >> Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_TEST_INSTALL_PATH} install" >> Makefile
+echo "" >> Makefile
+echo "build-test: install-test" >> Makefile
+echo -e "\t\$(MAKE) -C core/unit_test" >> Makefile
+echo -e "\t\$(MAKE) -C core/perf_test" >> Makefile
+echo -e "\t\$(MAKE) -C containers/unit_tests" >> Makefile
+echo -e "\t\$(MAKE) -C containers/performance_tests" >> Makefile
+echo -e "\t\$(MAKE) -C algorithms/unit_tests" >> Makefile
+if [ ${KOKKOS_DO_EXAMPLES} -gt 0 ]; then
+echo -e "\t\$(MAKE) -C example/fixture" >> Makefile
+echo -e "\t\$(MAKE) -C example/feint" >> Makefile
+echo -e "\t\$(MAKE) -C example/fenl" >> Makefile
+echo -e "\t\$(MAKE) -C example/tutorial build" >> Makefile
+fi
+echo "" >> Makefile
+echo "test: build-test" >> Makefile
+echo -e "\t\$(MAKE) -C core/unit_test test" >> Makefile
+echo -e "\t\$(MAKE) -C core/perf_test test" >> Makefile
+echo -e "\t\$(MAKE) -C containers/unit_tests test" >> Makefile
+echo -e "\t\$(MAKE) -C containers/performance_tests test" >> Makefile
+echo -e "\t\$(MAKE) -C algorithms/unit_tests test" >> Makefile
+if [ ${KOKKOS_DO_EXAMPLES} -gt 0 ]; then
+echo -e "\t\$(MAKE) -C example/fixture test" >> Makefile
+echo -e "\t\$(MAKE) -C example/feint test" >> Makefile
+echo -e "\t\$(MAKE) -C example/fenl test" >> Makefile
+echo -e "\t\$(MAKE) -C example/tutorial test" >> Makefile
+fi
+echo "" >> Makefile
+echo "unit-tests-only:" >> Makefile
+echo -e "\t\$(MAKE) -C core/unit_test test" >> Makefile
+echo -e "\t\$(MAKE) -C containers/unit_tests test" >> Makefile
+echo -e "\t\$(MAKE) -C algorithms/unit_tests test" >> Makefile
+echo "" >> Makefile
+
+echo "clean:" >> Makefile
+echo -e "\t\$(MAKE) -C core/unit_test clean" >> Makefile
+echo -e "\t\$(MAKE) -C core/perf_test clean" >> Makefile
+echo -e "\t\$(MAKE) -C containers/unit_tests clean" >> Makefile
+echo -e "\t\$(MAKE) -C containers/performance_tests clean" >> Makefile
+echo -e "\t\$(MAKE) -C algorithms/unit_tests clean" >> Makefile
+if [ ${KOKKOS_DO_EXAMPLES} -gt 0 ]; then
+echo -e "\t\$(MAKE) -C example/fixture clean" >> Makefile
+echo -e "\t\$(MAKE) -C example/feint clean" >> Makefile
+echo -e "\t\$(MAKE) -C example/fenl clean" >> Makefile
+echo -e "\t\$(MAKE) -C example/tutorial clean" >> Makefile
+fi
+echo -e "\tcd core; \\" >> Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} clean" >> Makefile
+
diff --git a/packages/kokkos/master_history.txt b/packages/kokkos/master_history.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f47017b2fb834c9738d60e195dac1139a980eb3d
--- /dev/null
+++ b/packages/kokkos/master_history.txt
@@ -0,0 +1,15 @@
+tag:  2.01.00    date: 07:21:2016    master: xxxxxxxx    develop: fa6dfcc4
+tag:  2.01.06    date: 09:02:2016    master: 9afaa87f    develop: 555f1a3a
+tag:  2.01.10    date: 09:27:2016    master: e4119325    develop: e6cda11e
+tag:  2.02.00    date: 10:30:2016    master: 6c90a581    develop: ca3dd56e
+tag:  2.02.01    date: 11:01:2016    master: 9c698c86    develop: b0072304
+tag:  2.02.07    date: 12:16:2016    master: 4b4cc4ba    develop: 382c0966
+tag:  2.02.15    date: 02:10:2017    master: 8c64cd93    develop: 28dea8b6
+tag:  2.03.00    date: 04:25:2017    master: 120d9ce7    develop: 015ba641
+tag:  2.03.05    date: 05:27:2017    master: 36b92f43    develop: 79073186
+tag:  2.03.13    date: 07:27:2017    master: da314444    develop: 29ccb58a
+tag:  2.04.00    date: 08:16:2017    master: 54eb75c0    develop: 32fb8ee1
+tag:  2.04.04    date: 09:11:2017    master: 2b7e9c20    develop: 51e7b25a
+tag:  2.04.11    date: 10:28:2017    master: 54a1330a    develop: ed36c017
+tag:  2.5.00     date: 12:15:2017    master: dfe685f4    develop: ec7ad6d8
+tag:  2.6.00     date: 03:07:2018    master: 62e760fa    develop: d1ba7d71
diff --git a/packages/kokkos/scripts/snapshot.py b/packages/kokkos/scripts/snapshot.py
new file mode 100755
index 0000000000000000000000000000000000000000..bfa97bf48a2909fe5395d2cea08b6eb6336d6ee3
--- /dev/null
+++ b/packages/kokkos/scripts/snapshot.py
@@ -0,0 +1,291 @@
+#! /usr/bin/env python
+
+"""
+Snapshot a project into another project and perform the necessary repo actions
+to provide a commit message that can be used to trace back to the exact point
+in the source repository.
+"""
+
+#todo:
+#  Support svn
+#  Allow renaming of the source dir in the destination path
+#  Check if a new snapshot is necessary?
+#
+
+import sys
+
+#check the version number so that there is a good error message when argparse is not available.
+#This checks for exactly 2.7 which is bad, but it is a python 2 script and argparse was introduced
+#in 2.7 which is also the last version of python 2. If this script is updated for python 3 this
+#will need to change, but for now it is not safe to allow 3.x to run this.
+if sys.version_info[:2] != (2, 7):
+  print "Error snapshot requires python 2.7 detected version is %d.%d." % (sys.version_info[0], sys.version_info[1])
+  sys.exit(1)
+
+import subprocess, argparse, re, doctest, os, datetime, traceback
+
+def parse_cmdline(description):
+  parser = argparse.ArgumentParser(usage="snapshot.py [options] source destination", description=description)
+
+  parser.add_argument("-n", "--no-commit", action="store_false", dest="create_commit", default=True,
+                      help="Do not perform a commit or create a commit message.")
+  parser.add_argument("-v", "--verbose", action="store_true", dest="verbose_mode", default=False,
+                      help="Enable verbose mode.")
+  parser.add_argument("-d", "--debug", action="store_true", dest="debug_mode", default=False,
+                      help="Enable debugging output.")
+  parser.add_argument("--no-validate-repo", action="store_true", dest="no_validate_repo", default=False,
+                      help="Reduce the validation that the source and destination repos are clean to a warning.")
+  parser.add_argument("--source-repo", choices=["git","none"], default="",
+                      help="Type of repository of the source, use none to skip all repository operations.")
+  parser.add_argument("--dest-repo", choices=["git","none"], default="",
+                      help="Type of repository of the destination, use none to skip all repository operations.")
+  parser.add_argument("--small", action="store_true", dest="small_mode",
+                      help="Don't include tests and other extra files when copying.")
+
+  parser.add_argument("source",      help="Source project to snapshot from.")
+  parser.add_argument("destination", help="Destination to snapshot too.")
+
+  options = parser.parse_args()
+  options = validate_options(options)
+  return options
+#end parse_cmdline
+
+def validate_options(options):
+  apparent_source_repo_type="none"
+  apparent_dest_repo_type="none"
+
+  #prevent user from accidentally giving us a path that rsync will treat differently than expected.
+  options.source      = options.source.rstrip(os.sep)
+  options.destination = options.destination.rstrip(os.sep)
+
+  options.source      = os.path.abspath(options.source)
+  options.destination = os.path.abspath(options.destination)
+
+  if os.path.exists(options.source):
+    apparent_source_repo_type, source_root = determine_repo_type(options.source)
+  else:
+    raise RuntimeError("Could not find source directory of %s." % options.source)
+  options.source_root = source_root
+
+  if not os.path.exists(options.destination):
+    print "Could not find destination directory of %s so it will be created." % options.destination
+    os.makedirs(options.destination)
+
+  apparent_dest_repo_type, dest_root = determine_repo_type(options.destination)
+  options.dest_root = dest_root
+
+  #error on svn repo types for now
+  if apparent_source_repo_type == "svn" or apparent_dest_repo_type == "svn":
+    raise RuntimeError("SVN repositories are not supported at this time.")
+
+  if options.source_repo == "":
+    #source repo type is not specified, so just use the apparent type.
+    options.source_repo = apparent_source_repo_type
+  else:
+    if options.source_repo != "none" and options.source_repo != apparent_source_repo_type:
+      raise RuntimeError("Specified source repository type of %s conflicts with determined type of %s" % \
+        (options.source_repo, apparent_source_repo_type))
+
+  if options.dest_repo == "":
+    #destination repo type is not specified, so just use the apparent type.
+    options.dest_repo = apparent_dest_repo_type
+  else:
+    if options.dest_repo != "none" and options.dest_repo != apparent_dest_repo_type:
+      raise RuntimeError("Specified destination repository type of %s conflicts with determined type of %s" % \
+        (options.dest_repo, apparent_dest_repo_type))
+
+  return options
+#end validate_options
+
+def run_cmd(cmd, options, working_dir="."):
+  cmd_str = " ".join(cmd)
+  if options.verbose_mode:
+    print "Running command '%s' in dir %s." % (cmd_str, working_dir)
+
+  proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=working_dir)
+  proc_stdout, proc_stderr = proc.communicate()
+  ret_val = proc.wait()
+
+  if options.debug_mode:
+    print "==== %s stdout start ====" % cmd_str
+    print proc_stdout
+    print "==== %s stdout end ====" % cmd_str
+    print "==== %s stderr ====" % cmd_str
+    print proc_stderr
+    print "==== %s stderr ====" % cmd_str
+
+  if ret_val != 0:
+    raise RuntimeError("Command '%s' failed with error code %d. Error message:%s%s%sstdout:%s" % \
+      (cmd_str, ret_val, os.linesep, proc_stderr, os.linesep, proc_stdout))
+
+  return proc_stdout, proc_stderr
+#end run_cmd
+
+def determine_repo_type(location):
+  apparent_repo_type = "none"
+
+  while location != "":
+    if os.path.exists(os.path.join(location, ".git")):
+      apparent_repo_type = "git"
+      break
+    elif os.path.exists(os.path.join(location, ".svn")):
+      apparent_repo_type = "svn"
+      break
+    else:
+      location = location[:location.rfind(os.sep)]
+
+  return apparent_repo_type, location
+#end determine_repo_type
+
+def rsync(source, dest, options):
+  rsync_cmd = ["rsync", "-ar", "--delete"]
+  if options.debug_mode:
+    rsync_cmd.append("-v")
+
+  if options.small_mode or options.source_repo == "git":
+    rsync_cmd.append("--delete-excluded")
+
+  if options.small_mode:
+    rsync_cmd.append("--include=config/master_history.txt")
+    rsync_cmd.append("--include=cmake/tpls")
+    rsync_cmd.append("--exclude=benchmarks/")
+    rsync_cmd.append("--exclude=config/*")
+    rsync_cmd.append("--exclude=doc/")
+    rsync_cmd.append("--exclude=example/")
+    rsync_cmd.append("--exclude=tpls/")
+    rsync_cmd.append("--exclude=HOW_TO_SNAPSHOT")
+    rsync_cmd.append("--exclude=unit_test")
+    rsync_cmd.append("--exclude=unit_tests")
+    rsync_cmd.append("--exclude=perf_test")
+    rsync_cmd.append("--exclude=performance_tests")
+
+  if options.source_repo == "git":
+    rsync_cmd.append("--exclude=.git*")
+
+  rsync_cmd.append(options.source)
+  rsync_cmd.append(options.destination)
+  run_cmd(rsync_cmd, options)
+#end rsync
+
+def create_commit_message(commit_id, commit_log, project_name, project_location):
+  eol = os.linesep
+  message = "Snapshot of %s from commit %s" % (project_name, commit_id)
+  message += eol * 2
+  message += "From repository at %s" % project_location
+  message += eol * 2
+  message += "At commit:" + eol
+  message += commit_log
+  return message
+#end create_commit_message
+
+def find_git_commit_information(options):
+  r"""
+  >>> class fake_options:
+  ...   source="."
+  ...   verbose_mode=False
+  ...   debug_mode=False
+  >>> myoptions = fake_options()
+  >>> find_git_commit_information(myoptions)[2:]
+  ('sems', 'software.sandia.gov:/git/sems')
+  """
+  git_log_cmd = ["git", "log", "-1"]
+
+  output, error = run_cmd(git_log_cmd, options, options.source)
+
+  commit_match = re.match("commit ([0-9a-fA-F]+)", output)
+  commit_id = commit_match.group(1)
+  commit_log = output
+
+  git_remote_cmd = ["git", "remote", "-v"]
+  output, error = run_cmd(git_remote_cmd, options, options.source)
+
+  remote_match = re.search("origin\s([^ ]*/([^ ]+))", output, re.MULTILINE)
+  if not remote_match:
+    raise RuntimeError("Could not find origin of repo at %s. Consider using none for source repo type." % (options.source))
+
+  source_location = remote_match.group(1)
+  source_name     = remote_match.group(2).strip()
+
+  if source_name[-1] == "/":
+    source_name = source_name[:-1]
+
+  return commit_id, commit_log, source_name, source_location
+#end find_git_commit_information
+
+def do_git_commit(message, options):
+  if options.verbose_mode:
+    print "Commiting to destination repository."
+
+  git_add_cmd = ["git", "add", "-A"]
+  run_cmd(git_add_cmd, options, options.destination)
+
+  git_commit_cmd = ["git", "commit", "-m%s" % message]
+  run_cmd(git_commit_cmd, options, options.destination)
+
+  git_log_cmd = ["git", "log", "--format=%h", "-1"]
+  commit_sha1, error = run_cmd(git_log_cmd, options, options.destination)
+
+  print "Commit %s was made to %s." % (commit_sha1.strip(), options.dest_root)
+#end do_git_commit
+
+def verify_git_repo_clean(location, options):
+  git_status_cmd = ["git", "status", "--porcelain"]
+  output, error = run_cmd(git_status_cmd, options, location)
+
+  if output != "":
+    if options.no_validate_repo == False:
+      raise RuntimeError("%s is not clean.%sPlease commit or stash all changes before running snapshot."
+        % (location, os.linesep))
+    else:
+      print "WARNING: %s is not clean. Proceeding anyway." % location
+      print "WARNING:   This could lead to differences in the source and destination."
+      print "WARNING:   It could also lead to extra files being included in the snapshot commit."
+#end verify_git_repo_clean
+
+def main(options):
+  if options.verbose_mode:
+    print "Snapshotting %s to %s." % (options.source, options.destination)
+
+  if options.source_repo == "git":
+    verify_git_repo_clean(options.source, options)
+    commit_id, commit_log, repo_name, repo_location = find_git_commit_information(options)
+  elif options.source_repo == "none":
+    commit_id     = "N/A"
+    commit_log    = "Unknown commit from %s snapshotted at: %s" % (options.source, datetime.datetime.now())
+    repo_name     = options.source
+    repo_location = options.source
+
+  commit_message = create_commit_message(commit_id, commit_log, repo_name, repo_location) + os.linesep*2
+
+  if options.dest_repo == "git":
+    verify_git_repo_clean(options.destination, options)
+
+  rsync(options.source, options.destination, options)
+
+  if options.dest_repo == "git":
+    do_git_commit(commit_message, options)
+  elif options.dest_repo == "none":
+    file_name = "snapshot_message.txt"
+    message_file = open(file_name, "w")
+    message_file.write(commit_message)
+    message_file.close()
+    cwd = os.getcwd()
+    print "No commit done by request. Please use file at:"
+    print "%s%sif you wish to commit this to a repo later." % (cwd+"/"+file_name, os.linesep)
+#end main
+
+if (__name__ == "__main__"):
+  if ("--test" in sys.argv):
+    doctest.testmod()
+    sys.exit(0)
+
+  try:
+    options = parse_cmdline(__doc__)
+    main(options)
+  except RuntimeError, e:
+    print "Error occured:", e
+    if "--debug" in sys.argv:
+      traceback.print_exc()
+    sys.exit(1)
+  else:
+    sys.exit(0)
diff --git a/packages/kokkos/scripts/testing_scripts/README b/packages/kokkos/scripts/testing_scripts/README
new file mode 100644
index 0000000000000000000000000000000000000000..455afffd840514e98686dadcd2c46a774590456c
--- /dev/null
+++ b/packages/kokkos/scripts/testing_scripts/README
@@ -0,0 +1,5 @@
+jenkins_test_driver is designed to be run through Jenkins as a
+multiconfiguration job. It relies on a number of environment variables that will
+only be set when run in that context. It is possible to override these if you
+know the Jenkins job setup. It is not recommended that a non-expert try to run
+this script directly.
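+
+For reference, the driver reads at least WORKSPACE, HOST_COMPILER and BUILD_TYPE
+from the environment. A rough local-run sketch (hypothetical values; module
+names are site-specific) would be:
+
+  export WORKSPACE=$PWD
+  export HOST_COMPILER=gcc_4.9.3
+  export BUILD_TYPE="openmp~serial"
+  ./jenkins_test_driver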
diff --git a/packages/kokkos/scripts/testing_scripts/jenkins_test_driver b/packages/kokkos/scripts/testing_scripts/jenkins_test_driver
new file mode 100755
index 0000000000000000000000000000000000000000..f393940304ee8e679440871414376283f8eef9a7
--- /dev/null
+++ b/packages/kokkos/scripts/testing_scripts/jenkins_test_driver
@@ -0,0 +1,83 @@
+#!/bin/bash -x
+
+echo "Building for BUILD_TYPE = ${BUILD_TYPE}"
+echo "Building with HOST_COMPILER = ${HOST_COMPILER}"
+echo "Building in ${WORKSPACE}"
+
+module use /home/projects/modulefiles
+
+BUILD_TYPE=`echo $BUILD_TYPE | tr "~" " "`
+build_options=""
+for item in ${BUILD_TYPE}; do
+  build_options="$build_options --with-$item"
+done
+
+kokkos_path=${WORKSPACE}/kokkos
+gtest_path=${WORKSPACE}/kokkos/tpls/gtest
+
+echo ${WORKSPACE}
+pwd
+
+#extract information from the provided parameters.
+host_compiler_brand=`echo $HOST_COMPILER | grep -o "^[a-zA-Z]*"`
+cuda_compiler=`echo $BUILD_TYPE | grep -o "cuda_[^ ]*"`
+
+host_compiler_module=`echo $HOST_COMPILER | tr "_" "/"`
+cuda_compiler_module=`echo $cuda_compiler | tr "_" "/"`
+build_path=`echo $BUILD_TYPE | tr " " "_"`
+
+module load $host_compiler_module
+module load $cuda_compiler_module
+
+case $host_compiler_brand in
+  gcc)
+    module load nvcc-wrapper/gnu
+    compiler=g++
+    ;;
+  intel)
+    module load nvcc-wrapper/intel
+    compiler=icpc
+    ;;
+  *)
+    echo "Unrecognized compiler brand."
+    exit 1
+    ;;
+esac
+
+#if cuda is on we need to set the host compiler for the
+#nvcc wrapper and make the wrapper the compiler.
+if [ "$cuda_compiler" != "" ]; then
+  export NVCC_WRAPPER_DEFAULT_COMPILER=$compiler
+  compiler=$kokkos_path/bin/nvcc_wrapper
+fi
+
+if [ "$host_compiler_brand" == "intel" -a "$cuda_compiler" != "" ]; then
+  echo "Intel compilers are not supported with cuda at this time."
+  exit 0
+fi
+
+rm -rf test-$build_path
+mkdir test-$build_path
+cd test-$build_path
+
+/bin/bash $kokkos_path/generate_makefile.bash $build_options --kokkos-path="$kokkos_path" --with-gtest="$gtest_path" --compiler=$compiler 2>&1 |tee configure.out
+
+if [ ${PIPESTATUS[0]} != 0 ]; then
+  echo "Configure failed."
+  exit 1
+fi
+
+make build-test 2>&1 | tee build.log
+
+if [ ${PIPESTATUS[0]} != 0 ]; then
+  echo "Build failed."
+  exit 1
+fi
+
+make test 2>&1 | tee test.log
+
+grep "FAIL" test.log
+if [ $? == 0 ]; then
+  echo "Tests failed."
+  exit 1
+fi
diff --git a/packages/kokkos/scripts/testing_scripts/obj_size_opt_check b/packages/kokkos/scripts/testing_scripts/obj_size_opt_check
new file mode 100755
index 0000000000000000000000000000000000000000..47c84d1a92a8a288115ecf0d416d57b349fb69b4
--- /dev/null
+++ b/packages/kokkos/scripts/testing_scripts/obj_size_opt_check
@@ -0,0 +1,287 @@
+#! /usr/bin/env python
+
+"""
+Compute the size at which the current compiler will start to
+significantly scale back optimization.
+
+The CPP file being modified will need the following tags.
+// JGF_DUPLICATE_BEGIN - Put before start of function to duplicate
+// JGF_DUPLICATE_END - Put after end of function to duplicate
+// JGF_DUPE: function_name(args); - Put anywhere it is legal to
+put a function call, but not in your timing section.
+
+The program will need to output the string:
+FOM: <number>
+This will represent the program's performance
+"""
+
+import argparse, sys, os, doctest, subprocess, re, time
+
+VERBOSE = False
+
+###############################################################################
+def parse_command_line(args, description):
+###############################################################################
+    parser = argparse.ArgumentParser(
+        usage="""\n%s <cppfile> <build-command> <run-command> [--verbose]
+OR
+%s --help
+OR
+%s --test
+
+\033[1mEXAMPLES:\033[0m
+    > %s foo.cpp 'make -j4' foo
+""" % ((os.path.basename(args[0]), ) * 4),
+
+description=description,
+
+formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+
+    parser.add_argument("cppfile", help="Name of file to modify.")
+
+    parser.add_argument("buildcmd", help="Build command")
+
+    parser.add_argument("execmd", help="Run command")
+
+    parser.add_argument("-v", "--verbose", action="store_true",
+                        help="Print extra information")
+
+    parser.add_argument("-s", "--start", type=int, default=1,
+                        help="Starting number of dupes")
+
+    parser.add_argument("-e", "--end", type=int, default=1000,
+                        help="Ending number of dupes")
+
+    parser.add_argument("-n", "--repeat", type=int, default=10,
+                        help="Number of times to repeat an individial execution. Best value will be taken.")
+
+    parser.add_argument("-t", "--template", action="store_true",
+                        help="Use templating instead of source copying to increase object size")
+
+    parser.add_argument("-c", "--csv", action="store_true",
+                        help="Print results as CSV")
+
+    args = parser.parse_args(args[1:])
+
+    if (args.verbose):
+        global VERBOSE
+        VERBOSE = True
+
+    return args.cppfile, args.buildcmd, args.execmd, args.start, args.end, args.repeat, args.template, args.csv
+
+###############################################################################
+def verbose_print(msg, override=None):
+###############################################################################
+    if ( (VERBOSE and not override is False) or override):
+        print msg
+
+###############################################################################
+def error_print(msg):
+###############################################################################
+    print >> sys.stderr, msg
+
+###############################################################################
+def expect(condition, error_msg):
+###############################################################################
+    """
+    Similar to assert except doesn't generate an ugly stacktrace. Useful for
+    checking user error, not programming error.
+    """
+    if (not condition):
+        raise SystemExit("FAIL: %s" % error_msg)
+
+###############################################################################
+def run_cmd(cmd, ok_to_fail=False, input_str=None, from_dir=None, verbose=None,
+            arg_stdout=subprocess.PIPE, arg_stderr=subprocess.PIPE):
+###############################################################################
+    verbose_print("RUN: %s" % cmd, verbose)
+
+    if (input_str is not None):
+        stdin = subprocess.PIPE
+    else:
+        stdin = None
+
+    proc = subprocess.Popen(cmd,
+                            shell=True,
+                            stdout=arg_stdout,
+                            stderr=arg_stderr,
+                            stdin=stdin,
+                            cwd=from_dir)
+    output, errput = proc.communicate(input_str)
+    output = output.strip() if output is not None else output
+    stat = proc.wait()
+
+    if (ok_to_fail):
+        return stat, output, errput
+    else:
+        if (arg_stderr is not None):
+            errput = errput if errput is not None else open(arg_stderr.name, "r").read()
+            expect(stat == 0, "Command: '%s' failed with error '%s'" % (cmd, errput))
+        else:
+            expect(stat == 0, "Command: '%s' failed. See terminal output" % cmd)
+        return output
+
+###############################################################################
+def build_and_run(source, cppfile, buildcmd, execmd, repeat):
+###############################################################################
+    open(cppfile, 'w').writelines(source)
+
+    run_cmd(buildcmd)
+
+    best = None
+    for i in xrange(repeat):
+        wait_for_quiet_machine()
+        output = run_cmd(execmd)
+
+        current = None
+        fom_regex = re.compile(r'^FOM: ([0-9.]+)$')
+        for line in output.splitlines():
+            m = fom_regex.match(line)
+            if (m is not None):
+                current = float(m.groups()[0])
+                break
+
+        expect(current is not None, "No lines in output matched FOM regex")
+
+        if (best is None or best < current):
+            best = current
+
+    return best
+
+###############################################################################
+def wait_for_quiet_machine():
+###############################################################################
+    while(True):
+        time.sleep(2)
+
+        # The first iteration of top gives garbage results
+        idle_pct_raw = run_cmd("top -bn2 | grep 'Cpu(s)' | tr ',' ' ' | tail -n 1 | awk '{print $5}'")
+
+        idle_pct_re = re.compile(r'^([0-9.]+)%id$')
+        m = idle_pct_re.match(idle_pct_raw)
+
+        expect(m is not None, "top not returning output in expected form")
+
+        idle_pct = float(m.groups()[0])
+        if (idle_pct < 95):
+            error_print("Machine is too busy, waiting for it to become free")
+        else:
+            break
+
+###############################################################################
+def add_n_dupes(curr_lines, num_dupes, template):
+###############################################################################
+    function_name  = None
+    function_invocation = None
+    function_lines = []
+
+    function_re = re.compile(r'^.* (\w+) *[(]')
+    function_inv_re = re.compile(r'^.*JGF_DUPE: +(.+)$')
+
+    # Get function lines
+    record = False
+    definition_insertion_point = None
+    invocation_insertion_point = None
+    for idx, line in enumerate(curr_lines):
+        if ("JGF_DUPLICATE_BEGIN" in line):
+            record = True
+            m = function_re.match(curr_lines[idx+1])
+            expect(m is not None, "Could not find function in line '%s'" % curr_lines[idx+1])
+            function_name = m.groups()[0]
+
+        elif ("JGF_DUPLICATE_END" in line):
+            record = False
+            definition_insertion_point = idx + 1
+
+        elif (record):
+            function_lines.append(line)
+
+        elif ("JGF_DUPE" in line):
+            m = function_inv_re.match(line)
+            expect(m is not None, "Could not find function invocation example in line '%s'" % line)
+            function_invocation = m.groups()[0]
+            invocation_insertion_point = idx + 1
+
+    expect(function_name is not None, "Could not find name of dupe function")
+    expect(function_invocation is not None, "Could not find function invocation point")
+
+    expect(definition_insertion_point < invocation_insertion_point, "fix me")
+
+    dupe_func_defs = []
+    dupe_invocations = ["int jgf_rand = std::rand();\n", "if (false) {}\n"]
+
+    for i in xrange(num_dupes):
+        if (not template):
+            dupe_func = list(function_lines)
+            dupe_func[0] = dupe_func[0].replace(function_name, "%s%d" % (function_name, i))
+            dupe_func_defs.extend(dupe_func)
+
+        dupe_invocations.append("else if (jgf_rand == %d) " % i)
+        if (template):
+            dupe_call = function_invocation.replace(function_name, "%s<%d>" % (function_name, i)) + "\n"
+        else:
+            dupe_call = function_invocation.replace(function_name, "%s%d" % (function_name, i))  + "\n"
+        dupe_invocations.append(dupe_call)
+
+    curr_lines[invocation_insertion_point:invocation_insertion_point] = dupe_invocations
+    curr_lines[definition_insertion_point:definition_insertion_point] = dupe_func_defs
+
+###############################################################################
+def report(num_dupes, curr_lines, object_file, orig_fom, curr_fom, csv=False, is_first_report=False):
+###############################################################################
+    fom_change = (curr_fom - orig_fom) / orig_fom
+
+    if (csv):
+        if (is_first_report):
+            print "num_dupes, obj_byte_size, loc, fom, pct_diff"
+
+        print "%s, %s, %s, %s, %s" % (num_dupes, os.path.getsize(object_file), len(curr_lines), curr_fom, fom_change*100)
+    else:
+        print "========================================================"
+        print "For number of dupes:", num_dupes
+        print "Object file size (bytes):", os.path.getsize(object_file)
+        print "Lines of code:", len(curr_lines)
+        print "Field of merit:", curr_fom
+        print "Change pct:", fom_change*100
+
+###############################################################################
+def obj_size_opt_check(cppfile, buildcmd, execmd, start, end, repeat, template, csv=False):
+###############################################################################
+    orig_source_lines = open(cppfile, 'r').readlines()
+
+    backup_file = "%s.orig" % cppfile
+    object_file = "%s.o" % os.path.splitext(cppfile)[0]
+    os.rename(cppfile, backup_file)
+
+    orig_fom = build_and_run(orig_source_lines, cppfile, buildcmd, execmd, repeat)
+    report(0, orig_source_lines, object_file, orig_fom, orig_fom, csv=csv, is_first_report=True)
+
+    i = start
+    while (i < end):
+        curr_lines = list(orig_source_lines)
+        add_n_dupes(curr_lines, i, template)
+
+        curr_fom = build_and_run(curr_lines, cppfile, buildcmd, execmd, repeat)
+
+        report(i, curr_lines, object_file, orig_fom, curr_fom, csv=csv)
+
+        i *= 2 # make growth function configurable?
+
+    os.remove(cppfile)
+    os.rename(backup_file, cppfile)
+
+###############################################################################
+def _main_func(description):
+###############################################################################
+    if ("--test" in sys.argv):
+        test_results = doctest.testmod(verbose=True)
+        sys.exit(1 if test_results.failed > 0 else 0)
+
+    cppfile, buildcmd, execmd, start, end, repeat, template, csv = parse_command_line(sys.argv, description)
+
+    obj_size_opt_check(cppfile, buildcmd, execmd, start, end, repeat, template, csv)
+
+###############################################################################
+if (__name__ == "__main__"):
+    _main_func(__doc__)
diff --git a/packages/kokkos/scripts/testing_scripts/test_kokkos_master_develop_promotion.sh b/packages/kokkos/scripts/testing_scripts/test_kokkos_master_develop_promotion.sh
new file mode 100755
index 0000000000000000000000000000000000000000..048f48194ce7bfe6f773c7f8ee289f76f8f16cb6
--- /dev/null
+++ b/packages/kokkos/scripts/testing_scripts/test_kokkos_master_develop_promotion.sh
@@ -0,0 +1,66 @@
+#!/bin/bash 
+
+. /etc/profile.d/modules.sh
+
+echo "build-dir $1"
+echo "backend $2"
+echo "module $3"
+echo "compiler $4"
+echo "cxxflags $5"
+echo "architecrure $6"
+echo "debug $7"
+echo "kokkos-options $8"
+echo "kokkos-cuda-options $9"
+echo "hwloc $9"
+
+NOW=`date "+%Y%m%d%H%M%S"`
+BASEDIR="$1-$NOW"
+
+mkdir $BASEDIR
+cd $BASEDIR
+
+module load $3
+
+if [ $9 == "yes" ]; then
+if [ $7 == "debug" ]; then
+  ../generate_makefile.sh --with-devices=$2 \
+  	--compiler=$4 \
+	--cxxflags=$5 \
+        --arch=$6 \
+        --debug \
+	--with-options=$8 \
+        --with-cuda-options=$9
+        --with-hwloc=${HWLOC_ROOT}
+else
+    ../generate_makefile.sh --with-devices=$2 \
+        --compiler=$4 \
+        --cxxflags=$5 \
+        --arch=$6 \
+        --debug \
+        --with-options=$8 \
+        --with-cuda-options=$9 
+        --with-hwloc=${HWLOC_ROOT}
+fi
+else
+if [ $7 == "debug" ]; then
+  ../generate_makefile.sh --with-devices=$2 \
+        --compiler=$4 \
+        --cxxflags=$5 \
+        --arch=$6 \
+        --debug \
+        --with-options=$8 \
+        --with-cuda-options=$9
+else
+    ../generate_makefile.sh --with-devices=$2 \
+        --compiler=$4 \
+        --cxxflags=$5 \
+        --arch=$6 \
+        --debug \
+        --with-options=$8 \
+        --with-cuda-options=$9 
+fi
+fi
+
+
+make test
+return $?
diff --git a/packages/kokkos/scripts/trilinos-integration/checkin-test b/packages/kokkos/scripts/trilinos-integration/checkin-test
new file mode 100644
index 0000000000000000000000000000000000000000..ffb565fcbbbb85f881053828d34208bd8e4b9e7e
--- /dev/null
+++ b/packages/kokkos/scripts/trilinos-integration/checkin-test
@@ -0,0 +1,4 @@
+module purge
+module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.63.0/base sems-yaml_cpp sems-superlu
+
+#Run Trilinos CheckinTest
diff --git a/packages/kokkos/scripts/trilinos-integration/prepare_trilinos_repos.sh b/packages/kokkos/scripts/trilinos-integration/prepare_trilinos_repos.sh
new file mode 100755
index 0000000000000000000000000000000000000000..31b2ad21bdc1b8daea4d67ab7b203f47c614dc42
--- /dev/null
+++ b/packages/kokkos/scripts/trilinos-integration/prepare_trilinos_repos.sh
@@ -0,0 +1,59 @@
+#!/bin/bash -le
+
+TRILINOS_UPDATE_BRANCH=$1
+TRILINOS_PRISTINE_BRANCH=$2
+
+if [ -z $TRILINOS_UPDATE_BRANCH ]
+then
+  TRILINOS_UPDATE_BRANCH=develop
+fi
+
+if [ -z $TRILINOS_PRISTINE_BRANCH ]
+then
+  TRILINOS_PRISTINE_BRANCH=develop
+fi
+
+export TRILINOS_UPDATED_PATH=${PWD}/trilinos-update
+export TRILINOS_PRISTINE_PATH=${PWD}/trilinos-pristine
+
+#rm -rf ${KOKKOS_PATH}
+#rm -rf ${TRILINOS_UPDATED_PATH}
+#rm -rf ${TRILINOS_PRISTINE_PATH}
+
+#Already done:
+if [ ! -d "${TRILINOS_UPDATED_PATH}" ]; then
+  git clone https://github.com/trilinos/trilinos ${TRILINOS_UPDATED_PATH}
+fi
+if [ ! -d "${TRILINOS_PRISTINE_PATH}" ]; then
+  git clone https://github.com/trilinos/trilinos ${TRILINOS_PRISTINE_PATH}
+fi
+
+cd ${TRILINOS_UPDATED_PATH}
+git checkout $TRILINOS_UPDATE_BRANCH
+git reset --hard origin/$TRILINOS_UPDATE_BRANCH
+git pull
+cd ..
+
+python kokkos/scripts/snapshot.py ${KOKKOS_PATH} ${TRILINOS_UPDATED_PATH}/packages
+
+cd ${TRILINOS_UPDATED_PATH}
+echo ""
+echo ""
+echo "Trilinos State:"
+git log --pretty=oneline --since=7.days
+cd ..
+
+cd ${TRILINOS_PRISTINE_PATH}
+git status
+echo "Checkout $TRILINOS_PRISTINE_BRANCH"
+git checkout $TRILINOS_PRISTINE_BRANCH
+echo "Pull"
+git pull
+cd ..
+
+cd ${TRILINOS_PRISTINE_PATH}
+echo ""
+echo ""
+echo "Trilinos Pristine State:"
+git log --pretty=oneline --since=7.days
+cd ..
diff --git a/packages/kokkos/scripts/trilinos-integration/shepard_jenkins_run_script_pthread_intel b/packages/kokkos/scripts/trilinos-integration/shepard_jenkins_run_script_pthread_intel
new file mode 100755
index 0000000000000000000000000000000000000000..3b2c7255177c3b004b54cef3629a56aed46d9798
--- /dev/null
+++ b/packages/kokkos/scripts/trilinos-integration/shepard_jenkins_run_script_pthread_intel
@@ -0,0 +1,60 @@
+#!/bin/bash -el
+ulimit -c 0
+module load devpack/openmpi/2.1.1/intel/17.4.196/cuda/none
+
+KOKKOS_BRANCH=$1
+TRILINOS_UPDATE_BRANCH=$2
+TRILINOS_PRISTINE_BRANCH=$3
+
+if [ -z $KOKKOS_BRANCH ]
+then
+  KOKKOS_BRANCH=develop
+fi
+
+if [ -z $TRILINOS_UPDATE_BRANCH ]
+then
+  TRILINOS_UPDATE_BRANCH=develop
+fi
+
+if [ -z $TRILINOS_PRISTINE_BRANCH ]
+then
+  TRILINOS_PRISTINE_BRANCH=develop
+fi
+
+export OMP_NUM_THREADS=8
+export JENKINS_DO_CUDA=OFF
+export JENKINS_DO_OPENMP=OFF
+export JENKINS_DO_PTHREAD=ON
+export JENKINS_DO_SERIAL=OFF
+export JENKINS_DO_COMPLEX=OFF
+
+export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
+export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
+export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
+export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
+
+export JENKINS_DO_TESTS=ON
+export JENKINS_DO_EXAMPLES=ON
+export JENKINS_DO_SHARED=ON
+
+export QUEUE=haswell
+
+
+module load python
+
+
+export KOKKOS_PATH=${PWD}/kokkos
+
+#Already done:
+if [ ! -d "${KOKKOS_PATH}" ]; then
+  git clone https://github.com/kokkos/kokkos ${KOKKOS_PATH}
+fi
+
+cd ${KOKKOS_PATH}
+git checkout $KOKKOS_BRANCH
+git pull
+cd ..
+
+source ${KOKKOS_PATH}/scripts/trilinos-integration/prepare_trilinos_repos.sh $TRILINOS_UPDATE_BRANCH $TRILINOS_PRISTINE_BRANCH
+
+${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/run_repo_comparison_slurm ${TRILINOS_UPDATED_PATH} ${TRILINOS_PRISTINE_PATH} ${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/configure-testbeds-jenkins-all TestCompare ${QUEUE}
diff --git a/packages/kokkos/scripts/trilinos-integration/shepard_jenkins_run_script_serial_intel b/packages/kokkos/scripts/trilinos-integration/shepard_jenkins_run_script_serial_intel
new file mode 100755
index 0000000000000000000000000000000000000000..9ce936ae2634b1b8b7375f70d241c74189c3ae40
--- /dev/null
+++ b/packages/kokkos/scripts/trilinos-integration/shepard_jenkins_run_script_serial_intel
@@ -0,0 +1,60 @@
+#!/bin/bash -el
+ulimit -c 0
+module load devpack/openmpi/2.1.1/intel/17.4.196/cuda/none
+
+KOKKOS_BRANCH=$1
+TRILINOS_UPDATE_BRANCH=$2
+TRILINOS_PRISTINE_BRANCH=$3
+
+if [ -z $KOKKOS_BRANCH ]
+then
+  KOKKOS_BRANCH=develop
+fi
+
+if [ -z $TRILINOS_UPDATE_BRANCH ]
+then
+  TRILINOS_UPDATE_BRANCH=develop
+fi
+
+if [ -z $TRILINOS_PRISTINE_BRANCH ]
+then
+  TRILINOS_PRISTINE_BRANCH=develop
+fi
+
+export OMP_NUM_THREADS=8
+export JENKINS_DO_CUDA=OFF
+export JENKINS_DO_OPENMP=OFF
+export JENKINS_DO_PTHREAD=OFF
+export JENKINS_DO_SERIAL=ON
+export JENKINS_DO_COMPLEX=ON
+
+export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
+export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
+export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
+export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
+
+export JENKINS_DO_TESTS=ON
+export JENKINS_DO_EXAMPLES=ON
+export JENKINS_DO_SHARED=ON
+
+export QUEUE=haswell
+
+
+module load python
+
+
+export KOKKOS_PATH=${PWD}/kokkos
+
+#Already done:
+if [ ! -d "${KOKKOS_PATH}" ]; then
+  git clone https://github.com/kokkos/kokkos ${KOKKOS_PATH}
+fi
+
+cd ${KOKKOS_PATH}
+git checkout $KOKKOS_BRANCH
+git pull
+cd ..
+
+source ${KOKKOS_PATH}/scripts/trilinos-integration/prepare_trilinos_repos.sh $TRILINOS_UPDATE_BRANCH $TRILINOS_PRISTINE_BRANCH
+
+${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/run_repo_comparison_slurm ${TRILINOS_UPDATED_PATH} ${TRILINOS_PRISTINE_PATH} ${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/configure-testbeds-jenkins-all TestCompare ${QUEUE}
diff --git a/packages/kokkos/scripts/trilinos-integration/white_run_jenkins_script_cuda b/packages/kokkos/scripts/trilinos-integration/white_run_jenkins_script_cuda
new file mode 100755
index 0000000000000000000000000000000000000000..2716767fe5dd7ed479843040a32e591fa4439bf3
--- /dev/null
+++ b/packages/kokkos/scripts/trilinos-integration/white_run_jenkins_script_cuda
@@ -0,0 +1,63 @@
+#!/bin/bash -el
+ulimit -c 0
+
+KOKKOS_BRANCH=$1
+TRILINOS_UPDATE_BRANCH=$2
+TRILINOS_PRISTINE_BRANCH=$3
+
+if [ -z $KOKKOS_BRANCH ]
+then
+  KOKKOS_BRANCH=develop
+fi
+
+if [ -z $TRILINOS_UPDATE_BRANCH ]
+then
+  TRILINOS_UPDATE_BRANCH=develop
+fi
+
+if [ -z $TRILINOS_PRISTINE_BRANCH ]
+then
+  TRILINOS_PRISTINE_BRANCH=develop
+fi
+
+module load devpack/openmpi/1.10.4/gcc/5.4.0/cuda/8.0.44
+export OMP_NUM_THREADS=8
+export JENKINS_DO_CUDA=ON
+export JENKINS_DO_OPENMP=OFF
+export JENKINS_DO_PTHREAD=OFF
+export JENKINS_DO_SERIAL=ON
+export JENKINS_DO_COMPLEX=OFF
+
+export JENKINS_ARCH_CXX_FLAG="-mcpu=power8 -arch=sm_37"
+export JENKINS_ARCH_C_FLAG="-mcpu=power8"
+export BLAS_LIBRARIES="${BLAS_ROOT}/lib/libblas.a;gfortran;gomp"
+export LAPACK_LIBRARIES="${LAPACK_ROOT}/lib/liblapack.a;gfortran;gomp"
+
+export JENKINS_DO_TESTS=ON
+export JENKINS_DO_EXAMPLES=ON
+
+export QUEUE=rhel7F
+
+module load python
+
+export KOKKOS_PATH=${PWD}/kokkos
+
+#Already done:
+if [ ! -d "${KOKKOS_PATH}" ]; then
+  git clone https://github.com/kokkos/kokkos ${KOKKOS_PATH}
+fi
+
+export OMPI_CXX=${KOKKOS_PATH}/bin/nvcc_wrapper
+
+cd ${KOKKOS_PATH}
+git checkout $KOKKOS_BRANCH
+git pull
+cd ..
+
+export CUDA_LAUNCH_BLOCKING=1
+export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1
+
+source ${KOKKOS_PATH}/scripts/trilinos-integration/prepare_trilinos_repos.sh $TRILINOS_UPDATE_BRANCH $TRILINOS_PRISTINE_BRANCH
+
+${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/run_repo_comparison_lsf ${TRILINOS_UPDATED_PATH} ${TRILINOS_PRISTINE_PATH} ${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/configure-testbeds-jenkins-all TestCompare ${QUEUE}
+
diff --git a/packages/kokkos/scripts/trilinos-integration/white_run_jenkins_script_omp b/packages/kokkos/scripts/trilinos-integration/white_run_jenkins_script_omp
new file mode 100755
index 0000000000000000000000000000000000000000..ff1086507ce50d21d17292f29cb462f24e24def1
--- /dev/null
+++ b/packages/kokkos/scripts/trilinos-integration/white_run_jenkins_script_omp
@@ -0,0 +1,58 @@
+#!/bin/bash -el
+ulimit -c 0
+
+KOKKOS_BRANCH=$1
+TRILINOS_UPDATE_BRANCH=$2
+TRILINOS_PRISTINE_BRANCH=$3
+
+if [ -z $KOKKOS_BRANCH ]
+then
+  KOKKOS_BRANCH=develop
+fi
+
+if [ -z $TRILINOS_UPDATE_BRANCH ]
+then
+  TRILINOS_UPDATE_BRANCH=develop
+fi
+
+if [ -z $TRILINOS_PRISTINE_BRANCH ]
+then
+  TRILINOS_PRISTINE_BRANCH=develop
+fi
+
+module load devpack/openmpi/1.10.4/gcc/5.4.0/cuda/8.0.44
+export OMP_NUM_THREADS=8
+export JENKINS_DO_CUDA=OFF
+export JENKINS_DO_OPENMP=ON
+export JENKINS_DO_PTHREAD=OFF
+export JENKINS_DO_SERIAL=OFF
+export JENKINS_DO_COMPLEX=OFF
+
+export JENKINS_ARCH_CXX_FLAG="-mcpu=power8"
+export JENKINS_ARCH_C_FLAG="-mcpu=power8"
+export BLAS_LIBRARIES="${BLAS_ROOT}/lib/libblas.a;gfortran;gomp"
+export LAPACK_LIBRARIES="${LAPACK_ROOT}/lib/liblapack.a;gfortran;gomp"
+
+export JENKINS_DO_TESTS=ON
+export JENKINS_DO_EXAMPLES=ON
+
+export QUEUE=rhel7F
+
+module load python
+
+export KOKKOS_PATH=${PWD}/kokkos
+
+#Already done:
+if [ ! -d "${KOKKOS_PATH}" ]; then
+  git clone https://github.com/kokkos/kokkos ${KOKKOS_PATH}
+fi
+
+cd ${KOKKOS_PATH}
+git checkout $KOKKOS_BRANCH
+git pull
+cd ..
+
+source ${KOKKOS_PATH}/scripts/trilinos-integration/prepare_trilinos_repos.sh $TRILINOS_UPDATE_BRANCH $TRILINOS_PRISTINE_BRANCH
+
+${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/run_repo_comparison_lsf ${TRILINOS_UPDATED_PATH} ${TRILINOS_PRISTINE_PATH} ${TRILINOS_UPDATED_PATH}/sampleScripts/Sandia-SEMS/configure-testbeds-jenkins-all TestCompare ${QUEUE}
+
diff --git a/packages/kokkos/tpls/gtest/gtest/LICENSE b/packages/kokkos/tpls/gtest/gtest/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..1941a11f8ce94389160b458927a29ba217542818
--- /dev/null
+++ b/packages/kokkos/tpls/gtest/gtest/LICENSE
@@ -0,0 +1,28 @@
+Copyright 2008, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/packages/kokkos/tpls/gtest/gtest/README b/packages/kokkos/tpls/gtest/gtest/README
new file mode 100644
index 0000000000000000000000000000000000000000..82964ecc329b474002c66cf534999519e8fc39a3
--- /dev/null
+++ b/packages/kokkos/tpls/gtest/gtest/README
@@ -0,0 +1,13 @@
+This is a fused source version of gtest 1.7.0. All that should be necessary to
+start using gtest in your package is to declare the dependency and include
+gtest/gtest.h.
+
+However, because some of the packages developed in Sierra do not use a fused
+source version of gtest, we need to make it possible for them to build with
+this version as well as with their native build. To facilitate this we have
+created symlinks from the other gtest headers that they use to the fused source
+gtest.h. This makes it possible for them to find the headers while still using
+the fused source version. This should not have any ill effects, since the header
+is include-guarded and allows using only the non-gtest.h headers in their files.
+
+
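+As a quick orientation (this example is not part of the upstream gtest
+distribution; the test and file names below are only illustrative), a minimal
+test built against the fused sources might look like:
+
+    #include "gtest/gtest.h"
+
+    // A trivial smoke test exercising the fused gtest.h header.
+    TEST(FusedGtestSmoke, BasicAssertions) {
+      EXPECT_EQ(2 + 2, 4);
+      EXPECT_TRUE(true);
+    }
+
+    int main(int argc, char** argv) {
+      ::testing::InitGoogleTest(&argc, argv);
+      return RUN_ALL_TESTS();
+    }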
diff --git a/packages/kokkos/tpls/gtest/gtest/gtest-all.cc b/packages/kokkos/tpls/gtest/gtest/gtest-all.cc
new file mode 100644
index 0000000000000000000000000000000000000000..735f581c95bb13a77d12447a2eafd7bc830b35fb
--- /dev/null
+++ b/packages/kokkos/tpls/gtest/gtest/gtest-all.cc
@@ -0,0 +1,9594 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule@google.com (Markus Heule)
+//
+// Google C++ Testing Framework (Google Test)
+//
+// Sometimes it's desirable to build Google Test by compiling a single file.
+// This file serves this purpose.
+
+// This line ensures that gtest.h can be compiled on its own, even
+// when it's fused.
+#include "gtest/gtest.h"
+
+// The following lines pull in the real gtest *.cc files.
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// Utilities for testing Google Test itself and code that uses Google Test
+// (e.g. frameworks built on top of Google Test).
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+
+
+namespace testing {
+
+// This helper class can be used to mock out Google Test failure reporting
+// so that we can test Google Test or code that builds on Google Test.
+//
+// An object of this class appends a TestPartResult object to the
+// TestPartResultArray object given in the constructor whenever a Google Test
+// failure is reported. It can either intercept only failures that are
+// generated in the same thread that created this object or it can intercept
+// all generated failures. The scope of this mock object can be controlled with
+// the second argument to the two-argument constructor.
+class GTEST_API_ ScopedFakeTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  // The two possible mocking modes of this object.
+  enum InterceptMode {
+    INTERCEPT_ONLY_CURRENT_THREAD,  // Intercepts only thread local failures.
+    INTERCEPT_ALL_THREADS           // Intercepts all failures.
+  };
+
+  // The c'tor sets this object as the test part result reporter used
+  // by Google Test.  The 'result' parameter specifies where to report the
+  // results. This reporter will only catch failures generated in the current
+  // thread. DEPRECATED
+  explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result);
+
+  // Same as above, but you can choose the interception scope of this object.
+  ScopedFakeTestPartResultReporter(InterceptMode intercept_mode,
+                                   TestPartResultArray* result);
+
+  // The d'tor restores the previous test part result reporter.
+  virtual ~ScopedFakeTestPartResultReporter();
+
+  // Appends the TestPartResult object to the TestPartResultArray
+  // received in the constructor.
+  //
+  // This method is from the TestPartResultReporterInterface
+  // interface.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+ private:
+  void Init();
+
+  const InterceptMode intercept_mode_;
+  TestPartResultReporterInterface* old_reporter_;
+  TestPartResultArray* const result_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter);
+};
+
+namespace internal {
+
+// A helper class for implementing EXPECT_FATAL_FAILURE() and
+// EXPECT_NONFATAL_FAILURE().  Its destructor verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring.  If that's not the case, a
+// non-fatal failure will be generated.
+class GTEST_API_ SingleFailureChecker {
+ public:
+  // The constructor remembers the arguments.
+  SingleFailureChecker(const TestPartResultArray* results,
+                       TestPartResult::Type type,
+                       const string& substr);
+  ~SingleFailureChecker();
+ private:
+  const TestPartResultArray* const results_;
+  const TestPartResult::Type type_;
+  const string substr_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker);
+};
+
+}  // namespace internal
+
+}  // namespace testing
+
+// A set of macros for testing Google Test assertions or code that's expected
+// to generate Google Test fatal failures.  It verifies that the given
+// statement will cause exactly one fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_FATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - 'statement' cannot reference local non-static variables or
+//     non-static members of the current object.
+//   - 'statement' cannot return a value.
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  The AcceptsMacroThatExpandsToUnprotectedComma test in
+// gtest_unittest.cc will fail to compile if we do that.
+#define EXPECT_FATAL_FAILURE(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper {\
+     public:\
+      static void Execute() { statement; }\
+    };\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+      GTestExpectFatalFailureHelper::Execute();\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper {\
+     public:\
+      static void Execute() { statement; }\
+    };\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ALL_THREADS, &gtest_failures);\
+      GTestExpectFatalFailureHelper::Execute();\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+// A macro for testing Google Test assertions or code that's expected to
+// generate Google Test non-fatal failures.  It asserts that the given
+// statement will cause exactly one non-fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// 'statement' is allowed to reference local variables and members of
+// the current object.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  If we do that, the code won't compile when the user gives
+// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that
+// expands to code containing an unprotected comma.  The
+// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc
+// catches that.
+//
+// For the same reason, we have to write
+//   if (::testing::internal::AlwaysTrue()) { statement; }
+// instead of
+//   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+// to avoid an MSVC warning on unreachable code.
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \
+  do {\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+      if (::testing::internal::AlwaysTrue()) { statement; }\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do {\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+          &gtest_failures);\
+      if (::testing::internal::AlwaysTrue()) { statement; }\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_SPI_H_
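+// Illustrative usage of the macros above (not part of the upstream gtest
+// sources; the test and function names here are hypothetical):
+//
+//   static void FailsFatally() { ASSERT_TRUE(false); }
+//
+//   TEST(FailureSpiExample, CatchesFatalFailure) {
+//     EXPECT_FATAL_FAILURE(FailsFatally(), "false");
+//   }
+//
+//   TEST(FailureSpiExample, CatchesNonFatalFailure) {
+//     EXPECT_NONFATAL_FAILURE(ADD_FAILURE() << "expected", "expected");
+//   }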
+
+#include <ctype.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <wchar.h>
+#include <wctype.h>
+
+#include <algorithm>
+#include <iomanip>
+#include <limits>
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <vector>
+
+#if GTEST_OS_LINUX
+
+// TODO(kenton@google.com): Use autoconf to detect availability of
+// gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+# include <fcntl.h>  // NOLINT
+# include <limits.h>  // NOLINT
+# include <sched.h>  // NOLINT
+// Declares vsnprintf().  This header is not available on Windows.
+# include <strings.h>  // NOLINT
+# include <sys/mman.h>  // NOLINT
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
+# include <string>
+
+#elif GTEST_OS_SYMBIAN
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h>  // NOLINT
+
+#elif GTEST_OS_ZOS
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h>  // NOLINT
+
+// On z/OS we additionally need strings.h for strcasecmp.
+# include <strings.h>  // NOLINT
+
+#elif GTEST_OS_WINDOWS_MOBILE  // We are on Windows CE.
+
+# include <windows.h>  // NOLINT
+
+#elif GTEST_OS_WINDOWS  // We are on Windows proper.
+
+# include <io.h>  // NOLINT
+# include <sys/timeb.h>  // NOLINT
+# include <sys/types.h>  // NOLINT
+# include <sys/stat.h>  // NOLINT
+
+# if GTEST_OS_WINDOWS_MINGW
+// MinGW has gettimeofday() but not _ftime64().
+// TODO(kenton@google.com): Use autoconf to detect availability of
+//   gettimeofday().
+// TODO(kenton@google.com): There are other ways to get the time on
+//   Windows, like GetTickCount() or GetSystemTimeAsFileTime().  MinGW
+//   supports these.  Consider using them instead.
+#  define GTEST_HAS_GETTIMEOFDAY_ 1
+#  include <sys/time.h>  // NOLINT
+# endif  // GTEST_OS_WINDOWS_MINGW
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <windows.h>  // NOLINT
+
+#else
+
+// Assume other platforms have gettimeofday().
+// TODO(kenton@google.com): Use autoconf to detect availability of
+//   gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
+
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#if GTEST_CAN_STREAM_RESULTS_
+# include <arpa/inet.h>  // NOLINT
+# include <netdb.h>  // NOLINT
+#endif
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Utility functions and classes used by the Google C++ testing framework.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// This file contains purely Google Test's internal implementation.  Please
+// DO NOT #INCLUDE IT IN A USER PROGRAM.
+
+#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
+#define GTEST_SRC_GTEST_INTERNAL_INL_H_
+
+// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is
+// part of Google Test's implementation; otherwise it's undefined.
+#if !GTEST_IMPLEMENTATION_
+// A user is trying to include this from his code - just say no.
+# error "gtest-internal-inl.h is part of Google Test's internal implementation."
+# error "It must not be included except by Google Test itself."
+#endif  // GTEST_IMPLEMENTATION_
+
+#ifndef _WIN32_WCE
+# include <errno.h>
+#endif  // !_WIN32_WCE
+#include <stddef.h>
+#include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
+#include <string.h>  // For memmove.
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+
+#if GTEST_CAN_STREAM_RESULTS_
+# include <arpa/inet.h>  // NOLINT
+# include <netdb.h>  // NOLINT
+#endif
+
+#if GTEST_OS_WINDOWS
+# include <windows.h>  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+
+
+namespace testing {
+
+// Declares the flags.
+//
+// We don't want the users to modify this flag in the code, but want
+// Google Test's own unit tests to be able to access it. Therefore we
+// declare it here as opposed to in gtest.h.
+GTEST_DECLARE_bool_(death_test_use_fork);
+
+namespace internal {
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
+const char kBreakOnFailureFlag[] = "break_on_failure";
+const char kCatchExceptionsFlag[] = "catch_exceptions";
+const char kColorFlag[] = "color";
+const char kFilterFlag[] = "filter";
+const char kListTestsFlag[] = "list_tests";
+const char kOutputFlag[] = "output";
+const char kPrintTimeFlag[] = "print_time";
+const char kRandomSeedFlag[] = "random_seed";
+const char kRepeatFlag[] = "repeat";
+const char kShuffleFlag[] = "shuffle";
+const char kStackTraceDepthFlag[] = "stack_trace_depth";
+const char kStreamResultToFlag[] = "stream_result_to";
+const char kThrowOnFailureFlag[] = "throw_on_failure";
+
+// A valid random seed must be in [1, kMaxRandomSeed].
+const int kMaxRandomSeed = 99999;
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.
+GTEST_API_ extern bool g_help_flag;
+
+// Returns the current time in milliseconds.
+GTEST_API_ TimeInMillis GetTimeInMillis();
+
+// Returns true iff Google Test should use colors in the output.
+GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
+
+// Formats the given time in milliseconds as seconds.
+GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms);
+
+// Converts the given time in milliseconds to a date string in the ISO 8601
+// format, without the timezone information.  N.B.: due to the use of the
+// non-reentrant localtime() function, this function is not thread safe.  Do
+// not use it in any code that can be called from multiple threads.
+GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
+
+// Parses a string for an Int32 flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+GTEST_API_ bool ParseInt32Flag(
+    const char* str, const char* flag, Int32* value);
+
+// Returns a random seed in range [1, kMaxRandomSeed] based on the
+// given --gtest_random_seed flag value.
+inline int GetRandomSeedFromFlag(Int32 random_seed_flag) {
+  const unsigned int raw_seed = (random_seed_flag == 0) ?
+      static_cast<unsigned int>(GetTimeInMillis()) :
+      static_cast<unsigned int>(random_seed_flag);
+
+  // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
+  // it's easy to type.
+  const int normalized_seed =
+      static_cast<int>((raw_seed - 1U) %
+                       static_cast<unsigned int>(kMaxRandomSeed)) + 1;
+  return normalized_seed;
+}
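+// For example, a --gtest_random_seed flag value of 123456 gives
+// raw_seed = 123456 and a normalized seed of
+// ((123456 - 1) % 99999) + 1 = 23457, which lies in [1, kMaxRandomSeed].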
+
+// Returns the first valid random seed after 'seed'.  The behavior is
+// undefined if 'seed' is invalid.  The seed after kMaxRandomSeed is
+// considered to be 1.
+inline int GetNextRandomSeed(int seed) {
+  GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed)
+      << "Invalid random seed " << seed << " - must be in [1, "
+      << kMaxRandomSeed << "].";
+  const int next_seed = seed + 1;
+  return (next_seed > kMaxRandomSeed) ? 1 : next_seed;
+}
+
+// This class saves the values of all Google Test flags in its c'tor, and
+// restores them in its d'tor.
+class GTestFlagSaver {
+ public:
+  // The c'tor.
+  GTestFlagSaver() {
+    also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests);
+    break_on_failure_ = GTEST_FLAG(break_on_failure);
+    catch_exceptions_ = GTEST_FLAG(catch_exceptions);
+    color_ = GTEST_FLAG(color);
+    death_test_style_ = GTEST_FLAG(death_test_style);
+    death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
+    filter_ = GTEST_FLAG(filter);
+    internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
+    list_tests_ = GTEST_FLAG(list_tests);
+    output_ = GTEST_FLAG(output);
+    print_time_ = GTEST_FLAG(print_time);
+    random_seed_ = GTEST_FLAG(random_seed);
+    repeat_ = GTEST_FLAG(repeat);
+    shuffle_ = GTEST_FLAG(shuffle);
+    stack_trace_depth_ = GTEST_FLAG(stack_trace_depth);
+    stream_result_to_ = GTEST_FLAG(stream_result_to);
+    throw_on_failure_ = GTEST_FLAG(throw_on_failure);
+  }
+
+  // The d'tor is not virtual.  DO NOT INHERIT FROM THIS CLASS.
+  ~GTestFlagSaver() {
+    GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
+    GTEST_FLAG(break_on_failure) = break_on_failure_;
+    GTEST_FLAG(catch_exceptions) = catch_exceptions_;
+    GTEST_FLAG(color) = color_;
+    GTEST_FLAG(death_test_style) = death_test_style_;
+    GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
+    GTEST_FLAG(filter) = filter_;
+    GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
+    GTEST_FLAG(list_tests) = list_tests_;
+    GTEST_FLAG(output) = output_;
+    GTEST_FLAG(print_time) = print_time_;
+    GTEST_FLAG(random_seed) = random_seed_;
+    GTEST_FLAG(repeat) = repeat_;
+    GTEST_FLAG(shuffle) = shuffle_;
+    GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
+    GTEST_FLAG(stream_result_to) = stream_result_to_;
+    GTEST_FLAG(throw_on_failure) = throw_on_failure_;
+  }
+
+ private:
+  // Fields for saving the original values of flags.
+  bool also_run_disabled_tests_;
+  bool break_on_failure_;
+  bool catch_exceptions_;
+  std::string color_;
+  std::string death_test_style_;
+  bool death_test_use_fork_;
+  std::string filter_;
+  std::string internal_run_death_test_;
+  bool list_tests_;
+  std::string output_;
+  bool print_time_;
+  internal::Int32 random_seed_;
+  internal::Int32 repeat_;
+  bool shuffle_;
+  internal::Int32 stack_trace_depth_;
+  std::string stream_result_to_;
+  bool throw_on_failure_;
+} GTEST_ATTRIBUTE_UNUSED_;
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+GTEST_API_ std::string CodePointToUtf8(UInt32 code_point);
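+// For example, CodePointToUtf8(0x00E9) yields the two-byte UTF-8 sequence
+// "\xC3\xA9".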
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF-16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from the Basic
+// Multilingual Plane.
+GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars);
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded();
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (e.g., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+GTEST_API_ bool ShouldShard(const char* total_shards_str,
+                            const char* shard_index_str,
+                            bool in_subprocess_for_death_test);
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error and
+// aborts.
+GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+GTEST_API_ bool ShouldRunTestOnShard(
+    int total_shards, int shard_index, int test_id);
+
+// STL container utilities.
+
+// Returns the number of elements in the given container that satisfy
+// the given predicate.
+template <class Container, typename Predicate>
+inline int CountIf(const Container& c, Predicate predicate) {
+  // Implemented as an explicit loop since std::count_if() in libCstd on
+  // Solaris has a non-standard signature.
+  int count = 0;
+  for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
+    if (predicate(*it))
+      ++count;
+  }
+  return count;
+}
+
+// Applies a function/functor to each element in the container.
+template <class Container, typename Functor>
+void ForEach(const Container& c, Functor functor) {
+  std::for_each(c.begin(), c.end(), functor);
+}
+
+// Returns the i-th element of the vector, or default_value if i is not
+// in range [0, v.size()).
+template <typename E>
+inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
+  return (i < 0 || i >= static_cast<int>(v.size())) ? default_value : v[i];
+}
+
+// Performs an in-place shuffle of a range of the vector's elements.
+// 'begin' and 'end' are element indices as an STL-style range;
+// i.e. [begin, end) are shuffled, where 'end' == size() means to
+// shuffle to the end of the vector.
+template <typename E>
+void ShuffleRange(internal::Random* random, int begin, int end,
+                  std::vector<E>* v) {
+  const int size = static_cast<int>(v->size());
+  GTEST_CHECK_(0 <= begin && begin <= size)
+      << "Invalid shuffle range start " << begin << ": must be in range [0, "
+      << size << "].";
+  GTEST_CHECK_(begin <= end && end <= size)
+      << "Invalid shuffle range finish " << end << ": must be in range ["
+      << begin << ", " << size << "].";
+
+  // Fisher-Yates shuffle, from
+  // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
+  for (int range_width = end - begin; range_width >= 2; range_width--) {
+    const int last_in_range = begin + range_width - 1;
+    const int selected = begin + random->Generate(range_width);
+    std::swap((*v)[selected], (*v)[last_in_range]);
+  }
+}
+
+// Performs an in-place shuffle of the vector's elements.
+template <typename E>
+inline void Shuffle(internal::Random* random, std::vector<E>* v) {
+  ShuffleRange(random, 0, static_cast<int>(v->size()), v);
+}
+
+// A function for deleting an object.  Handy for being used as a
+// functor.
+template <typename T>
+static void Delete(T* x) {
+  delete x;
+}
+
+// A predicate that checks the key of a TestProperty against a known key.
+//
+// TestPropertyKeyIs is copyable.
+class TestPropertyKeyIs {
+ public:
+  // Constructor.
+  //
+  // TestPropertyKeyIs has NO default constructor.
+  explicit TestPropertyKeyIs(const std::string& key) : key_(key) {}
+
+  // Returns true iff the key of the test property matches key_.
+  bool operator()(const TestProperty& test_property) const {
+    return test_property.key() == key_;
+  }
+
+ private:
+  std::string key_;
+};
+
+// Class UnitTestOptions.
+//
+// This class contains functions for processing options the user
+// specifies when running the tests.  It has only static members.
+//
+// In most cases, the user can specify an option using either an
+// environment variable or a command line flag.  E.g. you can set the
+// test filter using either GTEST_FILTER or --gtest_filter.  If both
+// the variable and the flag are present, the latter overrides the
+// former.
+class GTEST_API_ UnitTestOptions {
+ public:
+  // Functions for processing the gtest_output flag.
+
+  // Returns the output format, or "" for normal printed output.
+  static std::string GetOutputFormat();
+
+  // Returns the absolute path of the requested output file, or the
+  // default (test_detail.xml in the original working directory) if
+  // none was explicitly specified.
+  static std::string GetAbsolutePathToOutputFile();
+
+  // Functions for processing the gtest_filter flag.
+
+  // Returns true iff the wildcard pattern matches the string.  The
+  // first ':' or '\0' character in pattern marks the end of it.
+  //
+  // This recursive algorithm isn't very efficient, but is clear and
+  // works well enough for matching test names, which are short.
+  static bool PatternMatchesString(const char *pattern, const char *str);
+
+  // Returns true iff the user-specified filter matches the test case
+  // name and the test name.
+  static bool FilterMatchesTest(const std::string &test_case_name,
+                                const std::string &test_name);
+
+#if GTEST_OS_WINDOWS
+  // Function for supporting the gtest_catch_exception flag.
+
+  // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+  // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+  // This function is useful as an __except condition.
+  static int GTestShouldProcessSEH(DWORD exception_code);
+#endif  // GTEST_OS_WINDOWS
+
+  // Returns true if "name" matches the ':' separated list of glob-style
+  // filters in "filter".
+  static bool MatchesFilter(const std::string& name, const char* filter);
+};
+
+// Returns the current application's name, removing directory path if that
+// is present.  Used by UnitTestOptions::GetOutputFile.
+GTEST_API_ FilePath GetCurrentExecutableName();
+
+// The role interface for getting the OS stack trace as a string.
+class OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetterInterface() {}
+  virtual ~OsStackTraceGetterInterface() {}
+
+  // Returns the current OS stack trace as an std::string.  Parameters:
+  //
+  //   max_depth  - the maximum number of stack frames to be included
+  //                in the trace.
+  //   skip_count - the number of top frames to be skipped; doesn't count
+  //                against max_depth.
+  virtual string CurrentStackTrace(int max_depth, int skip_count) = 0;
+
+  // UponLeavingGTest() should be called immediately before Google Test calls
+  // user code. It saves some information about the current stack that
+  // CurrentStackTrace() will use to find and hide Google Test stack frames.
+  virtual void UponLeavingGTest() = 0;
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
+};
+
+// A working implementation of the OsStackTraceGetterInterface interface.
+class OsStackTraceGetter : public OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetter() : caller_frame_(NULL) {}
+
+  virtual string CurrentStackTrace(int max_depth, int skip_count)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  virtual void UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // This string is inserted in place of stack frames that are part of
+  // Google Test's implementation.
+  static const char* const kElidedFramesMarker;
+
+ private:
+  Mutex mutex_;  // protects all internal state
+
+  // We save the stack frame below the frame that calls user code.
+  // We do this because the address of the frame immediately below
+  // the user code changes between the call to UponLeavingGTest()
+  // and any calls to CurrentStackTrace() from within the user code.
+  void* caller_frame_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
+};
+
+// Information about a Google Test trace point.
+struct TraceInfo {
+  const char* file;
+  int line;
+  std::string message;
+};
+
+// This is the default global test part result reporter used in UnitTestImpl.
+// This class should only be used by UnitTestImpl.
+class DefaultGlobalTestPartResultReporter
+  : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface. Reports the test part
+  // result in the current test.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+  UnitTestImpl* const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
+};
+
+// This is the default per thread test part result reporter used in
+// UnitTestImpl. This class should only be used by UnitTestImpl.
+class DefaultPerThreadTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface. The implementation just
+  // delegates to the current global test part result reporter of *unit_test_.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+  UnitTestImpl* const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
+};
+
+// The private implementation of the UnitTest class.  We don't protect
+// the methods under a mutex, as this class is not accessible by a
+// user and the UnitTest class that delegates work to this class does
+// proper locking.
+class GTEST_API_ UnitTestImpl {
+ public:
+  explicit UnitTestImpl(UnitTest* parent);
+  virtual ~UnitTestImpl();
+
+  // There are two different ways to register your own TestPartResultReporter.
+  // You can register your own reporter to listen either only for test results
+  // from the current thread or for results from all threads.
+  // By default, each per-thread test result reporter just passes a new
+  // TestPartResult to the global test result reporter, which registers the
+  // test part result for the currently running test.
+
+  // Returns the global test part result reporter.
+  TestPartResultReporterInterface* GetGlobalTestPartResultReporter();
+
+  // Sets the global test part result reporter.
+  void SetGlobalTestPartResultReporter(
+      TestPartResultReporterInterface* reporter);
+
+  // Returns the test part result reporter for the current thread.
+  TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread();
+
+  // Sets the test part result reporter for the current thread.
+  void SetTestPartResultReporterForCurrentThread(
+      TestPartResultReporterInterface* reporter);
+
+  // Gets the number of successful test cases.
+  int successful_test_case_count() const;
+
+  // Gets the number of failed test cases.
+  int failed_test_case_count() const;
+
+  // Gets the number of all test cases.
+  int total_test_case_count() const;
+
+  // Gets the number of all test cases that contain at least one test
+  // that should run.
+  int test_case_to_run_count() const;
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns true iff the unit test passed (i.e. all test cases passed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the unit test failed (i.e. some test case failed
+  // or something outside of all tests failed).
+  bool Failed() const {
+    return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed();
+  }
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  const TestCase* GetTestCase(int i) const {
+    const int index = GetElementOr(test_case_indices_, i, -1);
+    return index < 0 ? NULL : test_cases_[index];
+  }
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  TestCase* GetMutableTestCase(int i) {
+    const int index = GetElementOr(test_case_indices_, i, -1);
+    return index < 0 ? NULL : test_cases_[index];
+  }
+
+  // Provides access to the event listener list.
+  TestEventListeners* listeners() { return &listeners_; }
+
+  // Returns the TestResult for the test that's currently running, or
+  // the TestResult for the ad hoc test if no test is running.
+  TestResult* current_test_result();
+
+  // Returns the TestResult for the ad hoc test.
+  const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; }
+
+  // Sets the OS stack trace getter.
+  //
+  // Does nothing if the input and the current OS stack trace getter
+  // are the same; otherwise, deletes the old getter and makes the
+  // input the current getter.
+  void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter);
+
+  // Returns the current OS stack trace getter if it is not NULL;
+  // otherwise, creates an OsStackTraceGetter, makes it the current
+  // getter, and returns it.
+  OsStackTraceGetterInterface* os_stack_trace_getter();
+
+  // Returns the current OS stack trace as an std::string.
+  //
+  // The maximum number of stack frames to be included is specified by
+  // the gtest_stack_trace_depth flag.  The skip_count parameter
+  // specifies the number of top frames to be skipped, which doesn't
+  // count against the number of frames to be included.
+  //
+  // For example, if Foo() calls Bar(), which in turn calls
+  // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+  // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+  std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
+
+  // Finds and returns a TestCase with the given name.  If one doesn't
+  // exist, creates one and returns it.
+  //
+  // Arguments:
+  //
+  //   test_case_name: name of the test case
+  //   type_param:     the name of the test's type parameter, or NULL if
+  //                   this is not a typed or a type-parameterized test.
+  //   set_up_tc:      pointer to the function that sets up the test case
+  //   tear_down_tc:   pointer to the function that tears down the test case
+  TestCase* GetTestCase(const char* test_case_name,
+                        const char* type_param,
+                        Test::SetUpTestCaseFunc set_up_tc,
+                        Test::TearDownTestCaseFunc tear_down_tc);
+
+  // Adds a TestInfo to the unit test.
+  //
+  // Arguments:
+  //
+  //   set_up_tc:    pointer to the function that sets up the test case
+  //   tear_down_tc: pointer to the function that tears down the test case
+  //   test_info:    the TestInfo object
+  void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc,
+                   Test::TearDownTestCaseFunc tear_down_tc,
+                   TestInfo* test_info) {
+    // In order to support thread-safe death tests, we need to
+    // remember the original working directory when the test program
+    // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
+    // the user may have changed the current directory before calling
+    // RUN_ALL_TESTS().  Therefore we capture the current directory in
+    // AddTestInfo(), which is called to register a TEST or TEST_F
+    // before main() is reached.
+    if (original_working_dir_.IsEmpty()) {
+      original_working_dir_.Set(FilePath::GetCurrentDir());
+      GTEST_CHECK_(!original_working_dir_.IsEmpty())
+          << "Failed to get the current working directory.";
+    }
+
+    GetTestCase(test_info->test_case_name(),
+                test_info->type_param(),
+                set_up_tc,
+                tear_down_tc)->AddTestInfo(test_info);
+  }
+
+#if GTEST_HAS_PARAM_TEST
+  // Returns ParameterizedTestCaseRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  internal::ParameterizedTestCaseRegistry& parameterized_test_registry() {
+    return parameterized_test_registry_;
+  }
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Sets the TestCase object for the test that's currently running.
+  void set_current_test_case(TestCase* a_current_test_case) {
+    current_test_case_ = a_current_test_case;
+  }
+
+  // Sets the TestInfo object for the test that's currently running.  If
+  // current_test_info is NULL, the assertion results will be stored in
+  // ad_hoc_test_result_.
+  void set_current_test_info(TestInfo* a_current_test_info) {
+    current_test_info_ = a_current_test_info;
+  }
+
+  // Registers all parameterized tests defined using TEST_P and
+  // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter
+  // combination. This method can be called more than once; it has guards
+  // protecting from registering the tests more than once.  If
+  // value-parameterized tests are disabled, RegisterParameterizedTests is
+  // present but does nothing.
+  void RegisterParameterizedTests();
+
+  // Runs all tests in this UnitTest object, prints the result, and
+  // returns true if all tests are successful.  If any exception is
+  // thrown during a test, this test is considered to be failed, but
+  // the rest of the tests will still be run.
+  bool RunAllTests();
+
+  // Clears the results of all tests, except the ad hoc tests.
+  void ClearNonAdHocTestResult() {
+    ForEach(test_cases_, TestCase::ClearTestCaseResult);
+  }
+
+  // Clears the results of ad-hoc test assertions.
+  void ClearAdHocTestResult() {
+    ad_hoc_test_result_.Clear();
+  }
+
+  // Adds a TestProperty to the current TestResult object when invoked in the
+  // context of a test or a test case, or to the global property set. If the
+  // result already contains a property with the same key, the value will be
+  // updated.
+  void RecordProperty(const TestProperty& test_property);
+
+  enum ReactionToSharding {
+    HONOR_SHARDING_PROTOCOL,
+    IGNORE_SHARDING_PROTOCOL
+  };
+
+  // Matches the full name of each test against the user-specified
+  // filter to decide whether the test should run, then records the
+  // result in each TestCase and TestInfo object.
+  // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
+  // based on sharding variables in the environment.
+  // Returns the number of tests that should run.
+  int FilterTests(ReactionToSharding shard_tests);
+
+  // Prints the names of the tests matching the user-specified filter flag.
+  void ListTestsMatchingFilter();
+
+  const TestCase* current_test_case() const { return current_test_case_; }
+  TestInfo* current_test_info() { return current_test_info_; }
+  const TestInfo* current_test_info() const { return current_test_info_; }
+
+  // Returns the vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*>& environments() { return environments_; }
+
+  // Getters for the per-thread Google Test trace stack.
+  std::vector<TraceInfo>& gtest_trace_stack() {
+    return *(gtest_trace_stack_.pointer());
+  }
+  const std::vector<TraceInfo>& gtest_trace_stack() const {
+    return gtest_trace_stack_.get();
+  }
+
+#if GTEST_HAS_DEATH_TEST
+  void InitDeathTestSubprocessControlInfo() {
+    internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
+  }
+  // Returns a pointer to the parsed --gtest_internal_run_death_test
+  // flag, or NULL if that flag was not specified.
+  // This information is useful only in a death test child process.
+  // Must not be called before a call to InitGoogleTest.
+  const InternalRunDeathTestFlag* internal_run_death_test_flag() const {
+    return internal_run_death_test_flag_.get();
+  }
+
+  // Returns a pointer to the current death test factory.
+  internal::DeathTestFactory* death_test_factory() {
+    return death_test_factory_.get();
+  }
+
+  void SuppressTestEventsIfInSubprocess();
+
+  friend class ReplaceDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // Initializes the event listener performing XML output as specified by
+  // UnitTestOptions. Must not be called before InitGoogleTest.
+  void ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+  // Initializes the event listener for streaming test results to a socket.
+  // Must not be called before InitGoogleTest.
+  void ConfigureStreamingOutput();
+#endif
+
+  // Performs initialization dependent upon flag values obtained in
+  // ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+  // ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest,
+  // this function is also called from RunAllTests.  Since this function can be
+  // called more than once, it has to be idempotent.
+  void PostFlagParsingInit();
+
+  // Gets the random seed used at the start of the current test iteration.
+  int random_seed() const { return random_seed_; }
+
+  // Gets the random number generator.
+  internal::Random* random() { return &random_; }
+
+  // Shuffles all test cases, and the tests within each test case,
+  // making sure that death tests are still run first.
+  void ShuffleTests();
+
+  // Restores the test cases and tests to their order before the first shuffle.
+  void UnshuffleTests();
+
+  // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
+  // UnitTest::Run() starts.
+  bool catch_exceptions() const { return catch_exceptions_; }
+
+ private:
+  friend class ::testing::UnitTest;
+
+  // Used by UnitTest::Run() to capture the state of
+  // GTEST_FLAG(catch_exceptions) at the moment it starts.
+  void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
+
+  // The UnitTest object that owns this implementation object.
+  UnitTest* const parent_;
+
+  // The working directory when the first TEST() or TEST_F() was
+  // executed.
+  internal::FilePath original_working_dir_;
+
+  // The default test part result reporters.
+  DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
+  DefaultPerThreadTestPartResultReporter
+      default_per_thread_test_part_result_reporter_;
+
+  // Points to (but doesn't own) the global test part result reporter.
+  TestPartResultReporterInterface* global_test_part_result_repoter_;
+
+  // Protects read and write access to global_test_part_result_reporter_.
+  internal::Mutex global_test_part_result_reporter_mutex_;
+
+  // Points to (but doesn't own) the per-thread test part result reporter.
+  internal::ThreadLocal<TestPartResultReporterInterface*>
+      per_thread_test_part_result_reporter_;
+
+  // The vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*> environments_;
+
+  // The vector of TestCases in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestCase*> test_cases_;
+
+  // Provides a level of indirection for the test case list to allow
+  // easy shuffling and restoring the test case order.  The i-th
+  // element of this vector is the index of the i-th test case in the
+  // shuffled order.
+  std::vector<int> test_case_indices_;
+
+#if GTEST_HAS_PARAM_TEST
+  // ParameterizedTestRegistry object used to register value-parameterized
+  // tests.
+  internal::ParameterizedTestCaseRegistry parameterized_test_registry_;
+
+  // Indicates whether RegisterParameterizedTests() has been called already.
+  bool parameterized_tests_registered_;
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Index of the last death test case registered.  Initially -1.
+  int last_death_test_case_;
+
+  // This points to the TestCase for the currently running test.  It
+  // changes as Google Test goes through one test case after another.
+  // When no test is running, this is set to NULL and Google Test
+  // stores assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestCase* current_test_case_;
+
+  // This points to the TestInfo for the currently running test.  It
+  // changes as Google Test goes through one test after another.  When
+  // no test is running, this is set to NULL and Google Test stores
+  // assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestInfo* current_test_info_;
+
+  // Normally, a user only writes assertions inside a TEST or TEST_F,
+  // or inside a function called by a TEST or TEST_F.  Since Google
+  // Test keeps track of which test is currently running, it can
+  // associate such an assertion with the test it belongs to.
+  //
+  // If an assertion is encountered when no TEST or TEST_F is running,
+  // Google Test attributes the assertion result to an imaginary "ad hoc"
+  // test, and records the result in ad_hoc_test_result_.
+  TestResult ad_hoc_test_result_;
+
+  // The list of event listeners that can be used to track events inside
+  // Google Test.
+  TestEventListeners listeners_;
+
+  // The OS stack trace getter.  Will be deleted when the UnitTest
+  // object is destructed.  By default, an OsStackTraceGetter is used,
+  // but the user can set this field to use a custom getter if that is
+  // desired.
+  OsStackTraceGetterInterface* os_stack_trace_getter_;
+
+  // True iff PostFlagParsingInit() has been called.
+  bool post_flag_parse_init_performed_;
+
+  // The random number seed used at the beginning of the test run.
+  int random_seed_;
+
+  // Our random number generator.
+  internal::Random random_;
+
+  // The time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp_;
+
+  // How long the test took to run, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+#if GTEST_HAS_DEATH_TEST
+  // The decomposed components of the gtest_internal_run_death_test flag,
+  // parsed when RUN_ALL_TESTS is called.
+  internal::scoped_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
+  internal::scoped_ptr<internal::DeathTestFactory> death_test_factory_;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // A per-thread stack of traces created by the SCOPED_TRACE() macro.
+  internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
+
+  // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests()
+  // starts.
+  bool catch_exceptions_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
+};  // class UnitTestImpl
+
+// Convenience function for accessing the global UnitTest
+// implementation object.
+inline UnitTestImpl* GetUnitTestImpl() {
+  return UnitTest::GetInstance()->impl();
+}
+
+#if GTEST_USES_SIMPLE_RE
+
+// Internal helper functions for implementing the simple regular
+// expression matcher.
+GTEST_API_ bool IsInSet(char ch, const char* str);
+GTEST_API_ bool IsAsciiDigit(char ch);
+GTEST_API_ bool IsAsciiPunct(char ch);
+GTEST_API_ bool IsRepeat(char ch);
+GTEST_API_ bool IsAsciiWhiteSpace(char ch);
+GTEST_API_ bool IsAsciiWordChar(char ch);
+GTEST_API_ bool IsValidEscape(char ch);
+GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
+GTEST_API_ bool ValidateRegex(const char* regex);
+GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
+
+#endif  // GTEST_USES_SIMPLE_RE
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
+
+#if GTEST_HAS_DEATH_TEST
+
+// Returns the message describing the last system error, regardless of the
+// platform.
+GTEST_API_ std::string GetLastErrnoDescription();
+
+# if GTEST_OS_WINDOWS
+// Provides leak-safe Windows kernel handle ownership.
+class AutoHandle {
+ public:
+  AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
+  explicit AutoHandle(HANDLE handle) : handle_(handle) {}
+
+  ~AutoHandle() { Reset(); }
+
+  HANDLE Get() const { return handle_; }
+  void Reset() { Reset(INVALID_HANDLE_VALUE); }
+  void Reset(HANDLE handle) {
+    if (handle != handle_) {
+      if (handle_ != INVALID_HANDLE_VALUE)
+        ::CloseHandle(handle_);
+      handle_ = handle;
+    }
+  }
+
+ private:
+  HANDLE handle_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
+};
+# endif  // GTEST_OS_WINDOWS
+
+// Attempts to parse a string into a positive integer pointed to by the
+// number parameter.  Returns true if that is possible.
+// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
+// it here.
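+// For example, with an int target, ParseNaturalNumber("123", &n) sets n to
+// 123 and returns true, while "12x", " 5", "-7", and "" are all rejected
+// (the string must begin with a digit and be consumed entirely), as is any
+// value that does not fit in the target Integer type.  (Values shown here
+// are illustrative only.)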
+template <typename Integer>
+bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
+  // Fail fast if the given string does not begin with a digit;
+  // this bypasses strtoXXX's "optional leading whitespace and plus
+  // or minus sign" semantics, which are undesirable here.
+  if (str.empty() || !IsDigit(str[0])) {
+    return false;
+  }
+  errno = 0;
+
+  char* end;
+  // BiggestConvertible is the largest integer type that system-provided
+  // string-to-number conversion routines can return.
+
+# if GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  // MSVC and C++ Builder define __int64 instead of the standard long long.
+  typedef unsigned __int64 BiggestConvertible;
+  const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10);
+
+# else
+
+  typedef unsigned long long BiggestConvertible;  // NOLINT
+  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);
+
+# endif  // GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  const bool parse_success = *end == '\0' && errno == 0;
+
+  // TODO(vladl@google.com): Convert this to compile time assertion when it is
+  // available.
+  GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
+
+  const Integer result = static_cast<Integer>(parsed);
+  if (parse_success && static_cast<BiggestConvertible>(result) == parsed) {
+    *number = result;
+    return true;
+  }
+  return false;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// TestResult contains some private methods that should be hidden from
+// the Google Test user but are required for testing. This class allows our
+// tests to access them.
+//
+// This class is supplied only for the purpose of testing Google Test's own
+// constructs. Do not use it in user tests, either directly or indirectly.
+class TestResultAccessor {
+ public:
+  static void RecordProperty(TestResult* test_result,
+                             const std::string& xml_element,
+                             const TestProperty& property) {
+    test_result->RecordProperty(xml_element, property);
+  }
+
+  static void ClearTestPartResults(TestResult* test_result) {
+    test_result->ClearTestPartResults();
+  }
+
+  static const std::vector<testing::TestPartResult>& test_part_results(
+      const TestResult& test_result) {
+    return test_result.test_part_results();
+  }
+};
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Streams test results to the given port on the given host machine.
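+//
+// For illustration, a run with a single passing test streams lines roughly
+// like the following (names and timings depend on the run and are shown
+// only as a sketch):
+//
+//   gtest_streaming_protocol_version=1.0
+//   event=TestProgramStart
+//   event=TestIterationStart&iteration=0
+//   event=TestCaseStart&name=FooTest
+//   event=TestStart&name=Bar
+//   event=TestEnd&passed=1&elapsed_time=0ms
+//   event=TestCaseEnd&passed=1&elapsed_time=0ms
+//   event=TestIterationEnd&passed=1&elapsed_time=0ms
+//   event=TestProgramEnd&passed=1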
+class StreamingListener : public EmptyTestEventListener {
+ public:
+  // Abstract base class for writing strings to a socket.
+  class AbstractSocketWriter {
+   public:
+    virtual ~AbstractSocketWriter() {}
+
+    // Sends a string to the socket.
+    virtual void Send(const string& message) = 0;
+
+    // Closes the socket.
+    virtual void CloseConnection() {}
+
+    // Sends a string and a newline to the socket.
+    void SendLn(const string& message) {
+      Send(message + "\n");
+    }
+  };
+
+  // Concrete class for actually writing strings to a socket.
+  class SocketWriter : public AbstractSocketWriter {
+   public:
+    SocketWriter(const string& host, const string& port)
+        : sockfd_(-1), host_name_(host), port_num_(port) {
+      MakeConnection();
+    }
+
+    virtual ~SocketWriter() {
+      if (sockfd_ != -1)
+        CloseConnection();
+    }
+
+    // Sends a string to the socket.
+    virtual void Send(const string& message) {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "Send() can be called only when there is a connection.";
+
+      const int len = static_cast<int>(message.length());
+      if (write(sockfd_, message.c_str(), len) != len) {
+        GTEST_LOG_(WARNING)
+            << "stream_result_to: failed to stream to "
+            << host_name_ << ":" << port_num_;
+      }
+    }
+
+   private:
+    // Creates a client socket and connects to the server.
+    void MakeConnection();
+
+    // Closes the socket.
+    void CloseConnection() {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "CloseConnection() can be called only when there is a connection.";
+
+      close(sockfd_);
+      sockfd_ = -1;
+    }
+
+    int sockfd_;  // socket file descriptor
+    const string host_name_;
+    const string port_num_;
+
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
+  };  // class SocketWriter
+
+  // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
+  static string UrlEncode(const char* str);
+
+  StreamingListener(const string& host, const string& port)
+      : socket_writer_(new SocketWriter(host, port)) { Start(); }
+
+  explicit StreamingListener(AbstractSocketWriter* socket_writer)
+      : socket_writer_(socket_writer) { Start(); }
+
+  void OnTestProgramStart(const UnitTest& /* unit_test */) {
+    SendLn("event=TestProgramStart");
+  }
+
+  void OnTestProgramEnd(const UnitTest& unit_test) {
+    // Note that Google Test currently only reports elapsed time for each
+    // test iteration, not for the entire test program.
+    SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed()));
+
+    // Notify the streaming server to stop.
+    socket_writer_->CloseConnection();
+  }
+
+  void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) {
+    SendLn("event=TestIterationStart&iteration=" +
+           StreamableToString(iteration));
+  }
+
+  void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) {
+    SendLn("event=TestIterationEnd&passed=" +
+           FormatBool(unit_test.Passed()) + "&elapsed_time=" +
+           StreamableToString(unit_test.elapsed_time()) + "ms");
+  }
+
+  void OnTestCaseStart(const TestCase& test_case) {
+    SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
+  }
+
+  void OnTestCaseEnd(const TestCase& test_case) {
+    SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed())
+           + "&elapsed_time=" + StreamableToString(test_case.elapsed_time())
+           + "ms");
+  }
+
+  void OnTestStart(const TestInfo& test_info) {
+    SendLn(std::string("event=TestStart&name=") + test_info.name());
+  }
+
+  void OnTestEnd(const TestInfo& test_info) {
+    SendLn("event=TestEnd&passed=" +
+           FormatBool((test_info.result())->Passed()) +
+           "&elapsed_time=" +
+           StreamableToString((test_info.result())->elapsed_time()) + "ms");
+  }
+
+  void OnTestPartResult(const TestPartResult& test_part_result) {
+    const char* file_name = test_part_result.file_name();
+    if (file_name == NULL)
+      file_name = "";
+    SendLn("event=TestPartResult&file=" + UrlEncode(file_name) +
+           "&line=" + StreamableToString(test_part_result.line_number()) +
+           "&message=" + UrlEncode(test_part_result.message()));
+  }
+
+ private:
+  // Sends the given message and a newline to the socket.
+  void SendLn(const string& message) { socket_writer_->SendLn(message); }
+
+  // Called at the start of streaming to notify the receiver what
+  // protocol we are using.
+  void Start() { SendLn("gtest_streaming_protocol_version=1.0"); }
+
+  string FormatBool(bool value) { return value ? "1" : "0"; }
+
+  const scoped_ptr<AbstractSocketWriter> socket_writer_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
+};  // class StreamingListener
+
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
+#undef GTEST_IMPLEMENTATION_
+
+#if GTEST_OS_WINDOWS
+# define vsnprintf _vsnprintf
+#endif  // GTEST_OS_WINDOWS
+
+namespace testing {
+
+using internal::CountIf;
+using internal::ForEach;
+using internal::GetElementOr;
+using internal::Shuffle;
+
+// Constants.
+
+// A test whose test case name or test name matches this filter is
+// disabled and not run.
+static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*";
+
+// A test case whose name matches this filter is considered a death
+// test case and will be run before test cases whose name doesn't
+// match this filter.
+static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*";
+
+// A test filter that matches everything.
+static const char kUniversalFilter[] = "*";
+
+// The default output file for XML output.
+static const char kDefaultOutputFile[] = "test_detail.xml";
+
+// The environment variable name for the test shard index.
+static const char kTestShardIndex[] = "GTEST_SHARD_INDEX";
+// The environment variable name for the total number of test shards.
+static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS";
+// The environment variable name for the test shard status file.
+static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE";
+
+namespace internal {
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+const char kStackTraceMarker[] = "\nStack trace:\n";
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.
+bool g_help_flag = false;
+
+}  // namespace internal
+
+static const char* GetDefaultFilter() {
+  return kUniversalFilter;
+}
+
+GTEST_DEFINE_bool_(
+    also_run_disabled_tests,
+    internal::BoolFromGTestEnv("also_run_disabled_tests", false),
+    "Run disabled tests too, in addition to the tests normally being run.");
+
+GTEST_DEFINE_bool_(
+    break_on_failure,
+    internal::BoolFromGTestEnv("break_on_failure", false),
+    "True iff a failed assertion should be a debugger break-point.");
+
+GTEST_DEFINE_bool_(
+    catch_exceptions,
+    internal::BoolFromGTestEnv("catch_exceptions", true),
+    "True iff " GTEST_NAME_
+    " should catch exceptions and treat them as test failures.");
+
+GTEST_DEFINE_string_(
+    color,
+    internal::StringFromGTestEnv("color", "auto"),
+    "Whether to use colors in the output.  Valid values: yes, no, "
+    "and auto.  'auto' means to use colors if the output is "
+    "being sent to a terminal and the TERM environment variable "
+    "is set to a terminal type that supports colors.");
+
+GTEST_DEFINE_string_(
+    filter,
+    internal::StringFromGTestEnv("filter", GetDefaultFilter()),
+    "A colon-separated list of glob (not regex) patterns "
+    "for filtering the tests to run, optionally followed by a "
+    "'-' and a : separated list of negative patterns (tests to "
+    "exclude).  A test is run if it matches one of the positive "
+    "patterns and does not match any of the negative patterns.");
+
+GTEST_DEFINE_bool_(list_tests, false,
+                   "List all tests without running them.");
+
+GTEST_DEFINE_string_(
+    output,
+    internal::StringFromGTestEnv("output", ""),
+    "A format (currently must be \"xml\"), optionally followed "
+    "by a colon and an output file name or directory. A directory "
+    "is indicated by a trailing pathname separator. "
+    "Examples: \"xml:filename.xml\", \"xml::directoryname/\". "
+    "If a directory is specified, output files will be created "
+    "within that directory, with file-names based on the test "
+    "executable's name and, if necessary, made unique by adding "
+    "digits.");
+
+GTEST_DEFINE_bool_(
+    print_time,
+    internal::BoolFromGTestEnv("print_time", true),
+    "True iff " GTEST_NAME_
+    " should display elapsed time in text output.");
+
+GTEST_DEFINE_int32_(
+    random_seed,
+    internal::Int32FromGTestEnv("random_seed", 0),
+    "Random number seed to use when shuffling test orders.  Must be in range "
+    "[1, 99999], or 0 to use a seed based on the current time.");
+
+GTEST_DEFINE_int32_(
+    repeat,
+    internal::Int32FromGTestEnv("repeat", 1),
+    "How many times to repeat each test.  Specify a negative number "
+    "for repeating forever.  Useful for shaking out flaky tests.");
+
+GTEST_DEFINE_bool_(
+    show_internal_stack_frames, false,
+    "True iff " GTEST_NAME_ " should include internal stack frames when "
+    "printing test failure stack traces.");
+
+GTEST_DEFINE_bool_(
+    shuffle,
+    internal::BoolFromGTestEnv("shuffle", false),
+    "True iff " GTEST_NAME_
+    " should randomize tests' order on every run.");
+
+GTEST_DEFINE_int32_(
+    stack_trace_depth,
+    internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth),
+    "The maximum number of stack frames to print when an "
+    "assertion fails.  The valid range is 0 through 100, inclusive.");
+
+GTEST_DEFINE_string_(
+    stream_result_to,
+    internal::StringFromGTestEnv("stream_result_to", ""),
+    "This flag specifies the host name and the port number on which to stream "
+    "test results. Example: \"localhost:555\". The flag is effective only on "
+    "Linux.");
+
+GTEST_DEFINE_bool_(
+    throw_on_failure,
+    internal::BoolFromGTestEnv("throw_on_failure", false),
+    "When this flag is specified, a failed assertion will throw an exception "
+    "if exceptions are enabled or exit the program with a non-zero code "
+    "otherwise.");
+
+namespace internal {
+
+// Generates a random number from [0, range), using a Linear
+// Congruential Generator (LCG).  Crashes if 'range' is 0 or greater
+// than kMaxRange.
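+//
+// For example, Random(42).Generate(6) yields a value in [0, 6),
+// deterministically for a given seed; Generate(0) or a range greater than
+// kMaxRange would trip the GTEST_CHECK_s below.  (Seed and range here are
+// illustrative only.)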
+UInt32 Random::Generate(UInt32 range) {
+  // These constants are the same as are used in glibc's rand(3).
+  state_ = (1103515245U*state_ + 12345U) % kMaxRange;
+
+  GTEST_CHECK_(range > 0)
+      << "Cannot generate a number in the range [0, 0).";
+  GTEST_CHECK_(range <= kMaxRange)
+      << "Generation of a number in [0, " << range << ") was requested, "
+      << "but this can only generate numbers in [0, " << kMaxRange << ").";
+
+  // Converting via modulus introduces a bit of downward bias, but
+  // it's simple, and a linear congruential generator isn't too good
+  // to begin with.
+  return state_ % range;
+}
+
+// GTestIsInitialized() returns true iff the user has initialized
+// Google Test.  Useful for catching the user mistake of not initializing
+// Google Test before calling RUN_ALL_TESTS().
+//
+// A user must call testing::InitGoogleTest() to initialize Google
+// Test.  g_init_gtest_count is set to the number of times
+// InitGoogleTest() has been called.  We don't protect this variable
+// under a mutex as it is only accessed in the main thread.
+GTEST_API_ int g_init_gtest_count = 0;
+static bool GTestIsInitialized() { return g_init_gtest_count != 0; }
+
+// Iterates over a vector of TestCases, keeping a running sum of the
+// results of calling a given int-returning method on each.
+// Returns the sum.
+static int SumOverTestCaseList(const std::vector<TestCase*>& case_list,
+                               int (TestCase::*method)() const) {
+  int sum = 0;
+  for (size_t i = 0; i < case_list.size(); i++) {
+    sum += (case_list[i]->*method)();
+  }
+  return sum;
+}
+
+// Returns true iff the test case passed.
+static bool TestCasePassed(const TestCase* test_case) {
+  return test_case->should_run() && test_case->Passed();
+}
+
+// Returns true iff the test case failed.
+static bool TestCaseFailed(const TestCase* test_case) {
+  return test_case->should_run() && test_case->Failed();
+}
+
+// Returns true iff test_case contains at least one test that should
+// run.
+static bool ShouldRunTestCase(const TestCase* test_case) {
+  return test_case->should_run();
+}
+
+// AssertHelper constructor.
+AssertHelper::AssertHelper(TestPartResult::Type type,
+                           const char* file,
+                           int line,
+                           const char* message)
+    : data_(new AssertHelperData(type, file, line, message)) {
+}
+
+AssertHelper::~AssertHelper() {
+  delete data_;
+}
+
+// Message assignment, for assertion streaming support.
+void AssertHelper::operator=(const Message& message) const {
+  UnitTest::GetInstance()->
+    AddTestPartResult(data_->type, data_->file, data_->line,
+                      AppendUserMessage(data_->message, message),
+                      UnitTest::GetInstance()->impl()
+                      ->CurrentOsStackTraceExceptTop(1)
+                      // Skips the stack frame for this function itself.
+                      );  // NOLINT
+}
+
+// Mutex for linked pointers.
+GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// The application's pathname, captured in InitGoogleTest.
+std::string g_executable_path;
+
+// Returns the current application's name, removing directory path if that
+// is present.
+FilePath GetCurrentExecutableName() {
+  FilePath result;
+
+#if GTEST_OS_WINDOWS
+  result.Set(FilePath(g_executable_path).RemoveExtension("exe"));
+#else
+  result.Set(FilePath(g_executable_path));
+#endif  // GTEST_OS_WINDOWS
+
+  return result.RemoveDirectoryName();
+}
+
+// Functions for processing the gtest_output flag.
+
+// Returns the output format, or "" for normal printed output.
+std::string UnitTestOptions::GetOutputFormat() {
+  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+  if (gtest_output_flag == NULL) return std::string("");
+
+  const char* const colon = strchr(gtest_output_flag, ':');
+  return (colon == NULL) ?
+      std::string(gtest_output_flag) :
+      std::string(gtest_output_flag, colon - gtest_output_flag);
+}
+
+// Returns the name of the requested output file, or the default if none
+// was explicitly specified.
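+//
+// For example (paths are illustrative only):
+//   --gtest_output=xml              writes to <cwd>/test_detail.xml
+//   --gtest_output=xml:report.xml   writes to <cwd>/report.xml
+//   --gtest_output=xml:reports/     writes to <cwd>/reports/<executable>.xml,
+//                                   made unique with digits if needed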
+std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
+  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+  if (gtest_output_flag == NULL)
+    return "";
+
+  const char* const colon = strchr(gtest_output_flag, ':');
+  if (colon == NULL)
+    return internal::FilePath::ConcatPaths(
+        internal::FilePath(
+            UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(kDefaultOutputFile)).string();
+
+  internal::FilePath output_name(colon + 1);
+  if (!output_name.IsAbsolutePath())
+    // TODO(wan@google.com): on Windows \some\path is not an absolute
+    // path (as its meaning depends on the current drive), yet the
+    // following logic for turning it into an absolute path is wrong.
+    // Fix it.
+    output_name = internal::FilePath::ConcatPaths(
+        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(colon + 1));
+
+  if (!output_name.IsDirectory())
+    return output_name.string();
+
+  internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
+      output_name, internal::GetCurrentExecutableName(),
+      GetOutputFormat().c_str()));
+  return result.string();
+}
+
+// Returns true iff the wildcard pattern matches the string.  The
+// first ':' or '\0' character in pattern marks the end of it.
+//
+// This recursive algorithm isn't very efficient, but is clear and
+// works well enough for matching test names, which are short.
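+//
+// For example, PatternMatchesString("Foo*.Bar?", "FooTest.Barx") is true,
+// PatternMatchesString("Foo?", "Foo") is false ('?' needs one character),
+// and PatternMatchesString("Foo*:ignored", "FooTest") is true because the
+// ':' ends the pattern.  (Names are illustrative only.)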
+bool UnitTestOptions::PatternMatchesString(const char *pattern,
+                                           const char *str) {
+  switch (*pattern) {
+    case '\0':
+    case ':':  // Either ':' or '\0' marks the end of the pattern.
+      return *str == '\0';
+    case '?':  // Matches any single character.
+      return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
+    case '*':  // Matches any string (possibly empty) of characters.
+      return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
+          PatternMatchesString(pattern + 1, str);
+    default:  // Non-special character.  Matches itself.
+      return *pattern == *str &&
+          PatternMatchesString(pattern + 1, str + 1);
+  }
+}
+
+bool UnitTestOptions::MatchesFilter(
+    const std::string& name, const char* filter) {
+  const char *cur_pattern = filter;
+  for (;;) {
+    if (PatternMatchesString(cur_pattern, name.c_str())) {
+      return true;
+    }
+
+    // Finds the next pattern in the filter.
+    cur_pattern = strchr(cur_pattern, ':');
+
+    // Returns if no more pattern can be found.
+    if (cur_pattern == NULL) {
+      return false;
+    }
+
+    // Skips the pattern separator (the ':' character).
+    cur_pattern++;
+  }
+}
+
+// Returns true iff the user-specified filter matches the test case
+// name and the test name.
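+//
+// For example, with --gtest_filter=FooTest.*:BarTest.*-FooTest.Flaky,
+// FilterMatchesTest("FooTest", "Basic") is true, while
+// FilterMatchesTest("FooTest", "Flaky") and FilterMatchesTest("BazTest",
+// "Basic") are both false.  (Test names here are illustrative only.)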
+bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name,
+                                        const std::string &test_name) {
+  const std::string& full_name = test_case_name + "." + test_name.c_str();
+
+  // Split --gtest_filter at '-', if there is one, to separate into
+  // positive filter and negative filter portions
+  const char* const p = GTEST_FLAG(filter).c_str();
+  const char* const dash = strchr(p, '-');
+  std::string positive;
+  std::string negative;
+  if (dash == NULL) {
+    positive = GTEST_FLAG(filter).c_str();  // Whole string is a positive filter
+    negative = "";
+  } else {
+    positive = std::string(p, dash);   // Everything up to the dash
+    negative = std::string(dash + 1);  // Everything after the dash
+    if (positive.empty()) {
+      // Treat '-test1' as the same as '*-test1'
+      positive = kUniversalFilter;
+    }
+  }
+
+  // A filter is a colon-separated list of patterns.  It matches a
+  // test if any pattern in it matches the test.
+  return (MatchesFilter(full_name, positive.c_str()) &&
+          !MatchesFilter(full_name, negative.c_str()));
+}
+
+#if GTEST_HAS_SEH
+// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+// This function is useful as an __except condition.
+int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
+  // Google Test should handle a SEH exception if:
+  //   1. the user wants it to, AND
+  //   2. this is not a breakpoint exception, AND
+  //   3. this is not a C++ exception (VC++ implements them via SEH,
+  //      apparently).
+  //
+  // SEH exception code for C++ exceptions.
+  // (see http://support.microsoft.com/kb/185294 for more information).
+  const DWORD kCxxExceptionCode = 0xe06d7363;
+
+  bool should_handle = true;
+
+  if (!GTEST_FLAG(catch_exceptions))
+    should_handle = false;
+  else if (exception_code == EXCEPTION_BREAKPOINT)
+    should_handle = false;
+  else if (exception_code == kCxxExceptionCode)
+    should_handle = false;
+
+  return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH;
+}
+#endif  // GTEST_HAS_SEH
+
+}  // namespace internal
+
+// The c'tor sets this object as the test part result reporter used by
+// Google Test.  The 'result' parameter specifies where to report the
+// results. Intercepts only failures from the current thread.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+    TestPartResultArray* result)
+    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD),
+      result_(result) {
+  Init();
+}
+
+// The c'tor sets this object as the test part result reporter used by
+// Google Test.  The 'result' parameter specifies where to report the
+// results.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+    InterceptMode intercept_mode, TestPartResultArray* result)
+    : intercept_mode_(intercept_mode),
+      result_(result) {
+  Init();
+}
+
+void ScopedFakeTestPartResultReporter::Init() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
+    old_reporter_ = impl->GetGlobalTestPartResultReporter();
+    impl->SetGlobalTestPartResultReporter(this);
+  } else {
+    old_reporter_ = impl->GetTestPartResultReporterForCurrentThread();
+    impl->SetTestPartResultReporterForCurrentThread(this);
+  }
+}
+
+// The d'tor restores the test part result reporter used by Google Test
+// before.
+ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
+    impl->SetGlobalTestPartResultReporter(old_reporter_);
+  } else {
+    impl->SetTestPartResultReporterForCurrentThread(old_reporter_);
+  }
+}
+
+// Increments the test part result count and remembers the result.
+// This method is from the TestPartResultReporterInterface interface.
+void ScopedFakeTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  result_->Append(result);
+}
+
+namespace internal {
+
+// Returns the type ID of ::testing::Test.  We should always call this
+// instead of GetTypeId< ::testing::Test>() to get the type ID of
+// testing::Test.  This is to work around a suspected linker bug when
+// using Google Test as a framework on Mac OS X.  The bug causes
+// GetTypeId< ::testing::Test>() to return different values depending
+// on whether the call is from the Google Test framework itself or
+// from user test code.  GetTestTypeId() is guaranteed to always
+// return the same value, as it always calls GetTypeId<>() from the
+// gtest.cc, which is within the Google Test framework.
+TypeId GetTestTypeId() {
+  return GetTypeId<Test>();
+}
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId();
+
+// This predicate-formatter checks that 'results' contains a test part
+// failure of the given type and that the failure message contains the
+// given substring.
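+//
+// It is meant to be used via EXPECT_PRED_FORMAT3, as SingleFailureChecker
+// below does:
+//
+//   EXPECT_PRED_FORMAT3(HasOneFailure, results,
+//                       TestPartResult::kNonFatalFailure, "some substring");
+//
+// ('results' and the substring are illustrative only.)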
+AssertionResult HasOneFailure(const char* /* results_expr */,
+                              const char* /* type_expr */,
+                              const char* /* substr_expr */,
+                              const TestPartResultArray& results,
+                              TestPartResult::Type type,
+                              const string& substr) {
+  const std::string expected(type == TestPartResult::kFatalFailure ?
+                        "1 fatal failure" :
+                        "1 non-fatal failure");
+  Message msg;
+  if (results.size() != 1) {
+    msg << "Expected: " << expected << "\n"
+        << "  Actual: " << results.size() << " failures";
+    for (int i = 0; i < results.size(); i++) {
+      msg << "\n" << results.GetTestPartResult(i);
+    }
+    return AssertionFailure() << msg;
+  }
+
+  const TestPartResult& r = results.GetTestPartResult(0);
+  if (r.type() != type) {
+    return AssertionFailure() << "Expected: " << expected << "\n"
+                              << "  Actual:\n"
+                              << r;
+  }
+
+  if (strstr(r.message(), substr.c_str()) == NULL) {
+    return AssertionFailure() << "Expected: " << expected << " containing \""
+                              << substr << "\"\n"
+                              << "  Actual:\n"
+                              << r;
+  }
+
+  return AssertionSuccess();
+}
+
+// The constructor of SingleFailureChecker remembers where to look up
+// test part results, what type of failure we expect, and what
+// substring the failure message should contain.
+SingleFailureChecker::SingleFailureChecker(
+    const TestPartResultArray* results,
+    TestPartResult::Type type,
+    const string& substr)
+    : results_(results),
+      type_(type),
+      substr_(substr) {}
+
+// The destructor of SingleFailureChecker verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring.  If that's not the case, a
+// non-fatal failure will be generated.
+SingleFailureChecker::~SingleFailureChecker() {
+  EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_);
+}
+
+DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
+    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+
+void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  unit_test_->current_test_result()->AddTestPartResult(result);
+  unit_test_->listeners()->repeater()->OnTestPartResult(result);
+}
+
+DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
+    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+
+void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result);
+}
+
+// Returns the global test part result reporter.
+TestPartResultReporterInterface*
+UnitTestImpl::GetGlobalTestPartResultReporter() {
+  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+  return global_test_part_result_repoter_;
+}
+
+// Sets the global test part result reporter.
+void UnitTestImpl::SetGlobalTestPartResultReporter(
+    TestPartResultReporterInterface* reporter) {
+  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+  global_test_part_result_repoter_ = reporter;
+}
+
+// Returns the test part result reporter for the current thread.
+TestPartResultReporterInterface*
+UnitTestImpl::GetTestPartResultReporterForCurrentThread() {
+  return per_thread_test_part_result_reporter_.get();
+}
+
+// Sets the test part result reporter for the current thread.
+void UnitTestImpl::SetTestPartResultReporterForCurrentThread(
+    TestPartResultReporterInterface* reporter) {
+  per_thread_test_part_result_reporter_.set(reporter);
+}
+
+// Gets the number of successful test cases.
+int UnitTestImpl::successful_test_case_count() const {
+  return CountIf(test_cases_, TestCasePassed);
+}
+
+// Gets the number of failed test cases.
+int UnitTestImpl::failed_test_case_count() const {
+  return CountIf(test_cases_, TestCaseFailed);
+}
+
+// Gets the number of all test cases.
+int UnitTestImpl::total_test_case_count() const {
+  return static_cast<int>(test_cases_.size());
+}
+
+// Gets the number of all test cases that contain at least one test
+// that should run.
+int UnitTestImpl::test_case_to_run_count() const {
+  return CountIf(test_cases_, ShouldRunTestCase);
+}
+
+// Gets the number of successful tests.
+int UnitTestImpl::successful_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count);
+}
+
+// Gets the number of failed tests.
+int UnitTestImpl::failed_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count);
+}
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int UnitTestImpl::reportable_disabled_test_count() const {
+  return SumOverTestCaseList(test_cases_,
+                             &TestCase::reportable_disabled_test_count);
+}
+
+// Gets the number of disabled tests.
+int UnitTestImpl::disabled_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count);
+}
+
+// Gets the number of tests to be printed in the XML report.
+int UnitTestImpl::reportable_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count);
+}
+
+// Gets the number of all tests.
+int UnitTestImpl::total_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::total_test_count);
+}
+
+// Gets the number of tests that should run.
+int UnitTestImpl::test_to_run_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count);
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
+  (void)skip_count;
+  return "";
+}
+
+// Returns the current time in milliseconds.
+TimeInMillis GetTimeInMillis() {
+#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__)
+  // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
+  // http://analogous.blogspot.com/2005/04/epoch.html
+  const TimeInMillis kJavaEpochToWinFileTimeDelta =
+    static_cast<TimeInMillis>(116444736UL) * 100000UL;
+  const DWORD kTenthMicrosInMilliSecond = 10000;
+
+  SYSTEMTIME now_systime;
+  FILETIME now_filetime;
+  ULARGE_INTEGER now_int64;
+  // TODO(kenton@google.com): Shouldn't this just use
+  //   GetSystemTimeAsFileTime()?
+  GetSystemTime(&now_systime);
+  if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
+    now_int64.LowPart = now_filetime.dwLowDateTime;
+    now_int64.HighPart = now_filetime.dwHighDateTime;
+    now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) -
+      kJavaEpochToWinFileTimeDelta;
+    return now_int64.QuadPart;
+  }
+  return 0;
+#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_
+  __timeb64 now;
+
+# ifdef _MSC_VER
+
+  // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996
+  // (deprecated function) there.
+  // TODO(kenton@google.com): Use GetTickCount()?  Or use
+  //   SystemTimeToFileTime()
+#  pragma warning(push)          // Saves the current warning state.
+#  pragma warning(disable:4996)  // Temporarily disables warning 4996.
+  _ftime64(&now);
+#  pragma warning(pop)           // Restores the warning state.
+# else
+
+  _ftime64(&now);
+
+# endif  // _MSC_VER
+
+  return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm;
+#elif GTEST_HAS_GETTIMEOFDAY_
+  struct timeval now;
+  gettimeofday(&now, NULL);
+  return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000;
+#else
+# error "Don't know how to get the current time on your system."
+#endif
+}
+
+// Utilities
+
+// class String.
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Creates a UTF-16 wide string from the given ANSI string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the wide string, or NULL if the
+// input is NULL.
+LPCWSTR String::AnsiToUtf16(const char* ansi) {
+  if (!ansi) return NULL;
+  const int length = strlen(ansi);
+  const int unicode_length =
+      MultiByteToWideChar(CP_ACP, 0, ansi, length,
+                          NULL, 0);
+  WCHAR* unicode = new WCHAR[unicode_length + 1];
+  MultiByteToWideChar(CP_ACP, 0, ansi, length,
+                      unicode, unicode_length);
+  unicode[unicode_length] = 0;
+  return unicode;
+}
+
+// Creates an ANSI string from the given wide string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the ANSI string, or NULL if the
+// input is NULL.
+const char* String::Utf16ToAnsi(LPCWSTR utf16_str)  {
+  if (!utf16_str) return NULL;
+  const int ansi_length =
+      WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
+                          NULL, 0, NULL, NULL);
+  char* ansi = new char[ansi_length + 1];
+  WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
+                      ansi, ansi_length, NULL, NULL);
+  ansi[ansi_length] = 0;
+  return ansi;
+}
+
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Compares two C strings.  Returns true iff they have the same content.
+//
+// Unlike strcmp(), this function can handle NULL argument(s).  A NULL
+// C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::CStringEquals(const char * lhs, const char * rhs) {
+  if ( lhs == NULL ) return rhs == NULL;
+
+  if ( rhs == NULL ) return false;
+
+  return strcmp(lhs, rhs) == 0;
+}
+
+#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+
+// Converts an array of wide chars to a narrow string using the UTF-8
+// encoding, and streams the result to the given Message object.
+static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
+                                     Message* msg) {
+  for (size_t i = 0; i != length; ) {  // NOLINT
+    if (wstr[i] != L'\0') {
+      *msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
+      while (i != length && wstr[i] != L'\0')
+        i++;
+    } else {
+      *msg << '\0';
+      i++;
+    }
+  }
+}
+
+#endif  // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+
+}  // namespace internal
+
+// Constructs an empty Message.
+// We allocate the stringstream separately because otherwise each use of
+// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's
+// stack frame leading to huge stack frames in some cases; gcc does not reuse
+// the stack space.
+Message::Message() : ss_(new ::std::stringstream) {
+  // By default, we want there to be enough precision when printing
+  // a double to a Message.
+  *ss_ << std::setprecision(std::numeric_limits<double>::digits10 + 2);
+}
+
+// These two overloads allow streaming a wide C string to a Message
+// using the UTF-8 encoding.
+Message& Message::operator <<(const wchar_t* wide_c_str) {
+  return *this << internal::String::ShowWideCString(wide_c_str);
+}
+Message& Message::operator <<(wchar_t* wide_c_str) {
+  return *this << internal::String::ShowWideCString(wide_c_str);
+}
+
+#if GTEST_HAS_STD_WSTRING
+// Converts the given wide string to a narrow string using the UTF-8
+// encoding, and streams the result to this Message object.
+Message& Message::operator <<(const ::std::wstring& wstr) {
+  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
+  return *this;
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+// Converts the given wide string to a narrow string using the UTF-8
+// encoding, and streams the result to this Message object.
+Message& Message::operator <<(const ::wstring& wstr) {
+  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
+  return *this;
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// Gets the text streamed to this object so far as an std::string.
+// Each '\0' character in the buffer is replaced with "\\0".
+std::string Message::GetString() const {
+  return internal::StringStreamToString(ss_.get());
+}
+
+// AssertionResult constructors.
+// Used in EXPECT_TRUE/FALSE(assertion_result).
+AssertionResult::AssertionResult(const AssertionResult& other)
+    : success_(other.success_),
+      message_(other.message_.get() != NULL ?
+               new ::std::string(*other.message_) :
+               static_cast< ::std::string*>(NULL)) {
+}
+
+// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+AssertionResult AssertionResult::operator!() const {
+  AssertionResult negation(!success_);
+  if (message_.get() != NULL)
+    negation << *message_;
+  return negation;
+}
+
+// Makes a successful assertion result.
+AssertionResult AssertionSuccess() {
+  return AssertionResult(true);
+}
+
+// Makes a failed assertion result.
+AssertionResult AssertionFailure() {
+  return AssertionResult(false);
+}
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << message.
+AssertionResult AssertionFailure(const Message& message) {
+  return AssertionFailure() << message;
+}
+
+namespace internal {
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+//   expected_expression: "foo"
+//   actual_expression:   "bar"
+//   expected_value:      "5"
+//   actual_value:        "6"
+//
+// The ignoring_case parameter is true iff the assertion is a
+// *_STRCASEEQ*.  When it's true, the string " (ignoring case)" will
+// be inserted into the message.
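+//
+// For the ASSERT_EQ(foo, bar) example above, the failure message produced
+// here reads roughly:
+//
+//   Value of: bar
+//     Actual: 6
+//   Expected: foo
+//   Which is: 5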
+AssertionResult EqFailure(const char* expected_expression,
+                          const char* actual_expression,
+                          const std::string& expected_value,
+                          const std::string& actual_value,
+                          bool ignoring_case) {
+  Message msg;
+  msg << "Value of: " << actual_expression;
+  if (actual_value != actual_expression) {
+    msg << "\n  Actual: " << actual_value;
+  }
+
+  msg << "\nExpected: " << expected_expression;
+  if (ignoring_case) {
+    msg << " (ignoring case)";
+  }
+  if (expected_value != expected_expression) {
+    msg << "\nWhich is: " << expected_value;
+  }
+
+  return AssertionFailure() << msg;
+}
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+std::string GetBoolAssertionFailureMessage(
+    const AssertionResult& assertion_result,
+    const char* expression_text,
+    const char* actual_predicate_value,
+    const char* expected_predicate_value) {
+  const char* actual_message = assertion_result.message();
+  Message msg;
+  msg << "Value of: " << expression_text
+      << "\n  Actual: " << actual_predicate_value;
+  if (actual_message[0] != '\0')
+    msg << " (" << actual_message << ")";
+  msg << "\nExpected: " << expected_predicate_value;
+  return msg.GetString();
+}
+
+// Helper function for implementing ASSERT_NEAR.
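+//
+// For example, ASSERT_NEAR(2.00001, 2.0, 1e-4) succeeds because the
+// difference (1e-5) does not exceed 1e-4, while ASSERT_NEAR(2.01, 2.0, 1e-3)
+// fails with a message of roughly the form "The difference between 2.01 and
+// 2.0 is 0.01, which exceeds 1e-3, ...".  (Values are illustrative only.)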
+AssertionResult DoubleNearPredFormat(const char* expr1,
+                                     const char* expr2,
+                                     const char* abs_error_expr,
+                                     double val1,
+                                     double val2,
+                                     double abs_error) {
+  const double diff = fabs(val1 - val2);
+  if (diff <= abs_error) return AssertionSuccess();
+
+  // TODO(wan): do not print the value of an expression if it's
+  // already a literal.
+  return AssertionFailure()
+      << "The difference between " << expr1 << " and " << expr2
+      << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
+      << expr1 << " evaluates to " << val1 << ",\n"
+      << expr2 << " evaluates to " << val2 << ", and\n"
+      << abs_error_expr << " evaluates to " << abs_error << ".";
+}
+
+
+// Helper template for implementing FloatLE() and DoubleLE().
+template <typename RawType>
+AssertionResult FloatingPointLE(const char* expr1,
+                                const char* expr2,
+                                RawType val1,
+                                RawType val2) {
+  // Returns success if val1 is less than val2,
+  if (val1 < val2) {
+    return AssertionSuccess();
+  }
+
+  // or if val1 is almost equal to val2.
+  const FloatingPoint<RawType> lhs(val1), rhs(val2);
+  if (lhs.AlmostEquals(rhs)) {
+    return AssertionSuccess();
+  }
+
+  // Note that the above two checks will both fail if either val1 or
+  // val2 is NaN, as the IEEE floating-point standard requires that
+  // any predicate involving a NaN must return false.
+
+  ::std::stringstream val1_ss;
+  val1_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+          << val1;
+
+  ::std::stringstream val2_ss;
+  val2_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+          << val2;
+
+  return AssertionFailure()
+      << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+      << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
+      << StringStreamToString(&val2_ss);
+}
+
+}  // namespace internal
+
+// Asserts that val1 is less than, or almost equal to, val2.  Fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+AssertionResult FloatLE(const char* expr1, const char* expr2,
+                        float val1, float val2) {
+  return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
+}
+
+// Asserts that val1 is less than, or almost equal to, val2.  Fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+AssertionResult DoubleLE(const char* expr1, const char* expr2,
+                         double val1, double val2) {
+  return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
+}
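+
+// Both predicate-formatters above are typically used through the predicate
+// assertion macros, e.g.
+//
+//   EXPECT_PRED_FORMAT2(::testing::DoubleLE, computed, reference);
+//
+// which passes when computed < reference or the two values are almost equal
+// (within a few ULPs), and fails if either value is NaN.  ('computed' and
+// 'reference' are illustrative names only.)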
+
+namespace internal {
+
+// The helper function for {ASSERT|EXPECT}_EQ with int or enum
+// arguments.
+AssertionResult CmpHelperEQ(const char* expected_expression,
+                            const char* actual_expression,
+                            BiggestInt expected,
+                            BiggestInt actual) {
+  if (expected == actual) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   FormatForComparisonFailureMessage(expected, actual),
+                   FormatForComparisonFailureMessage(actual, expected),
+                   false);
+}
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_?? with integer or enum arguments.  It is here
+// just to avoid copy-and-paste of similar code.
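+//
+// For example, GTEST_IMPL_CMP_HELPER_(NE, !=) below expands to a
+// CmpHelperNE(expr1, expr2, val1, val2) function that returns
+// AssertionSuccess() when val1 != val2 and otherwise reports both
+// expressions and their formatted values.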
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
+AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+                                   BiggestInt val1, BiggestInt val2) {\
+  if (val1 op val2) {\
+    return AssertionSuccess();\
+  } else {\
+    return AssertionFailure() \
+        << "Expected: (" << expr1 << ") " #op " (" << expr2\
+        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
+        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
+  }\
+}
+
+// Implements the helper function for {ASSERT|EXPECT}_NE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(NE, !=)
+// Implements the helper function for {ASSERT|EXPECT}_LE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LE, <=)
+// Implements the helper function for {ASSERT|EXPECT}_LT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LT, < )
+// Implements the helper function for {ASSERT|EXPECT}_GE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GE, >=)
+// Implements the helper function for {ASSERT|EXPECT}_GT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GT, > )
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                               const char* actual_expression,
+                               const char* expected,
+                               const char* actual) {
+  if (String::CStringEquals(expected, actual)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   PrintToString(expected),
+                   PrintToString(actual),
+                   false);
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression,
+                                   const char* actual_expression,
+                                   const char* expected,
+                                   const char* actual) {
+  if (String::CaseInsensitiveCStringEquals(expected, actual)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   PrintToString(expected),
+                   PrintToString(actual),
+                   true);
+}
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression,
+                               const char* s1,
+                               const char* s2) {
+  if (!String::CStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  } else {
+    return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
+                              << s2_expression << "), actual: \""
+                              << s1 << "\" vs \"" << s2 << "\"";
+  }
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+                                   const char* s2_expression,
+                                   const char* s1,
+                                   const char* s2) {
+  if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  } else {
+    return AssertionFailure()
+        << "Expected: (" << s1_expression << ") != ("
+        << s2_expression << ") (ignoring case), actual: \""
+        << s1 << "\" vs \"" << s2 << "\"";
+  }
+}
+
+}  // namespace internal
+
+namespace {
+
+// Helper functions for implementing IsSubstring() and IsNotSubstring().
+
+// This group of overloaded functions returns true iff needle is a
+// substring of haystack.  NULL is considered a substring of itself
+// only.
+
+bool IsSubstringPred(const char* needle, const char* haystack) {
+  if (needle == NULL || haystack == NULL)
+    return needle == haystack;
+
+  return strstr(haystack, needle) != NULL;
+}
+
+bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
+  if (needle == NULL || haystack == NULL)
+    return needle == haystack;
+
+  return wcsstr(haystack, needle) != NULL;
+}
+
+// StringType here can be either ::std::string or ::std::wstring.
+template <typename StringType>
+bool IsSubstringPred(const StringType& needle,
+                     const StringType& haystack) {
+  return haystack.find(needle) != StringType::npos;
+}
+
+// This function implements either IsSubstring() or IsNotSubstring(),
+// depending on the value of the expected_to_be_substring parameter.
+// StringType here can be const char*, const wchar_t*, ::std::string,
+// or ::std::wstring.
+template <typename StringType>
+AssertionResult IsSubstringImpl(
+    bool expected_to_be_substring,
+    const char* needle_expr, const char* haystack_expr,
+    const StringType& needle, const StringType& haystack) {
+  if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
+    return AssertionSuccess();
+
+  const bool is_wide_string = sizeof(needle[0]) > 1;
+  const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
+  return AssertionFailure()
+      << "Value of: " << needle_expr << "\n"
+      << "  Actual: " << begin_string_quote << needle << "\"\n"
+      << "Expected: " << (expected_to_be_substring ? "" : "not ")
+      << "a substring of " << haystack_expr << "\n"
+      << "Which is: " << begin_string_quote << haystack << "\"";
+}
+
+}  // namespace
+
+// IsSubstring() and IsNotSubstring() check whether needle is a
+// substring of haystack (NULL is considered a substring of itself
+// only), and return an appropriate error message when they fail.
+
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+#if GTEST_HAS_STD_WSTRING
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+#endif  // GTEST_HAS_STD_WSTRING
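+
+// A minimal usage sketch: these overloads are normally invoked through the
+// predicate-format macros, e.g.
+//
+//   EXPECT_PRED_FORMAT2(::testing::IsSubstring, "needle", haystack);
+//   EXPECT_PRED_FORMAT2(::testing::IsNotSubstring, "weasel", haystack);
+//
+// where haystack may be a C string, a wide C string, or a std::(w)string.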
+
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+
+namespace {
+
+// Helper function for IsHRESULT{SuccessFailure} predicates
+AssertionResult HRESULTFailureHelper(const char* expr,
+                                     const char* expected,
+                                     long hr) {  // NOLINT
+# if GTEST_OS_WINDOWS_MOBILE
+
+  // Windows CE doesn't support FormatMessage.
+  const char error_text[] = "";
+
+# else
+
+  // Looks up the human-readable system message for the HRESULT code.
+  // Since we're not passing any params to FormatMessage, we don't
+  // want inserts expanded.
+  const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
+                       FORMAT_MESSAGE_IGNORE_INSERTS;
+  const DWORD kBufSize = 4096;
+  // Gets the system's human readable message string for this HRESULT.
+  char error_text[kBufSize] = { '\0' };
+  DWORD message_length = ::FormatMessageA(kFlags,
+                                          0,  // no source, we're asking system
+                                          hr,  // the error
+                                          0,  // no line width restrictions
+                                          error_text,  // output buffer
+                                          kBufSize,  // buf size
+                                          NULL);  // no arguments for inserts
+  // Trims trailing white space (FormatMessage leaves a trailing CR-LF)
+  for (; message_length && IsSpace(error_text[message_length - 1]);
+          --message_length) {
+    error_text[message_length - 1] = '\0';
+  }
+
+# endif  // GTEST_OS_WINDOWS_MOBILE
+
+  const std::string error_hex("0x" + String::FormatHexInt(hr));
+  return ::testing::AssertionFailure()
+      << "Expected: " << expr << " " << expected << ".\n"
+      << "  Actual: " << error_hex << " " << error_text << "\n";
+}
+
+}  // namespace
+
+AssertionResult IsHRESULTSuccess(const char* expr, long hr) {  // NOLINT
+  if (SUCCEEDED(hr)) {
+    return AssertionSuccess();
+  }
+  return HRESULTFailureHelper(expr, "succeeds", hr);
+}
+
+AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
+  if (FAILED(hr)) {
+    return AssertionSuccess();
+  }
+  return HRESULTFailureHelper(expr, "fails", hr);
+}
+
+#endif  // GTEST_OS_WINDOWS
+
+// Utility functions for encoding Unicode text (wide strings) in
+// UTF-8.
+
+// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8
+// like this:
+//
+// Code-point length   Encoding
+//   0 -  7 bits       0xxxxxxx
+//   8 - 11 bits       110xxxxx 10xxxxxx
+//  12 - 16 bits       1110xxxx 10xxxxxx 10xxxxxx
+//  17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+// The maximum code-point a one-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint1 = (static_cast<UInt32>(1) <<  7) - 1;
+
+// The maximum code-point a two-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint2 = (static_cast<UInt32>(1) << (5 + 6)) - 1;
+
+// The maximum code-point a three-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint3 = (static_cast<UInt32>(1) << (4 + 2*6)) - 1;
+
+// The maximum code-point a four-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint4 = (static_cast<UInt32>(1) << (3 + 3*6)) - 1;
+
+// Chops off the n lowest bits from a bit pattern.  Returns the n
+// lowest bits.  As a side effect, the original bit pattern will be
+// shifted to the right by n bits.
+inline UInt32 ChopLowBits(UInt32* bits, int n) {
+  const UInt32 low_bits = *bits & ((static_cast<UInt32>(1) << n) - 1);
+  *bits >>= n;
+  return low_bits;
+}
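+
+// Worked example: with *bits == 0xE9 (1110 1001), ChopLowBits(&bits, 6)
+// returns 0x29 (10 1001) and leaves *bits == 0x3 (11).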
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+std::string CodePointToUtf8(UInt32 code_point) {
+  if (code_point > kMaxCodePoint4) {
+    return "(Invalid Unicode 0x" + String::FormatHexInt(code_point) + ")";
+  }
+
+  char str[5];  // Big enough for the largest valid code point.
+  if (code_point <= kMaxCodePoint1) {
+    str[1] = '\0';
+    str[0] = static_cast<char>(code_point);                          // 0xxxxxxx
+  } else if (code_point <= kMaxCodePoint2) {
+    str[2] = '\0';
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xC0 | code_point);                   // 110xxxxx
+  } else if (code_point <= kMaxCodePoint3) {
+    str[3] = '\0';
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xE0 | code_point);                   // 1110xxxx
+  } else {  // code_point <= kMaxCodePoint4
+    str[4] = '\0';
+    str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xF0 | code_point);                   // 11110xxx
+  }
+  return str;
+}
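+
+// Worked example: CodePointToUtf8(0x20AC) (the Euro sign) takes the
+// three-byte branch and yields "\xE2\x82\xAC":
+//   str[2] = 0x80 | 0x2C = 0xAC,  str[1] = 0x80 | 0x02 = 0x82,
+//   str[0] = 0xE0 | 0x02 = 0xE2.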
+
+// The following two functions only make sense if the system
+// uses UTF-16 for wide string encoding. All supported systems
+// with 16-bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16.
+
+// Determines if the arguments constitute UTF-16 surrogate pair
+// and thus should be combined into a single Unicode code point
+// using CreateCodePointFromUtf16SurrogatePair.
+inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
+  return sizeof(wchar_t) == 2 &&
+      (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
+}
+
+// Creates a Unicode code point from UTF16 surrogate pair.
+inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
+                                                    wchar_t second) {
+  const UInt32 mask = (1 << 10) - 1;
+  return (sizeof(wchar_t) == 2) ?
+      (((first & mask) << 10) | (second & mask)) + 0x10000 :
+      // This function should not be called when the condition is
+      // false, but we provide a sensible default in case it is.
+      static_cast<UInt32>(first);
+}
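+
+// Worked example: U+1F600 is encoded in UTF-16 as the surrogate pair
+// 0xD83D 0xDE00.  On platforms where sizeof(wchar_t) == 2,
+// IsUtf16SurrogatePair(0xD83D, 0xDE00) is true, and
+// CreateCodePointFromUtf16SurrogatePair(0xD83D, 0xDE00) reconstructs
+// ((0x3D << 10) | 0x200) + 0x10000 == 0x1F600.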
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF-16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from the Basic
+// Multilingual Plane.
+std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
+  if (num_chars == -1)
+    num_chars = static_cast<int>(wcslen(str));
+
+  ::std::stringstream stream;
+  for (int i = 0; i < num_chars; ++i) {
+    UInt32 unicode_code_point;
+
+    if (str[i] == L'\0') {
+      break;
+    } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
+      unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
+                                                                 str[i + 1]);
+      i++;
+    } else {
+      unicode_code_point = static_cast<UInt32>(str[i]);
+    }
+
+    stream << CodePointToUtf8(unicode_code_point);
+  }
+  return StringStreamToString(&stream);
+}
+
+// Converts a wide C string to an std::string using the UTF-8 encoding.
+// NULL will be converted to "(null)".
+std::string String::ShowWideCString(const wchar_t * wide_c_str) {
+  if (wide_c_str == NULL)  return "(null)";
+
+  return internal::WideStringToUtf8(wide_c_str, -1);
+}
+
+// Compares two wide C strings.  Returns true iff they have the same
+// content.
+//
+// Unlike wcscmp(), this function can handle NULL argument(s).  A NULL
+// C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
+  if (lhs == NULL) return rhs == NULL;
+
+  if (rhs == NULL) return false;
+
+  return wcscmp(lhs, rhs) == 0;
+}
+
+// Helper function for *_STREQ on wide strings.
+AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                               const char* actual_expression,
+                               const wchar_t* expected,
+                               const wchar_t* actual) {
+  if (String::WideCStringEquals(expected, actual)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   PrintToString(expected),
+                   PrintToString(actual),
+                   false);
+}
+
+// Helper function for *_STRNE on wide strings.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression,
+                               const wchar_t* s1,
+                               const wchar_t* s2) {
+  if (!String::WideCStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  }
+
+  return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
+                            << s2_expression << "), actual: "
+                            << PrintToString(s1)
+                            << " vs " << PrintToString(s2);
+}
+
+// Compares two C strings, ignoring case.  Returns true iff they have
+// the same content.
+//
+// Unlike strcasecmp(), this function can handle NULL argument(s).  A
+// NULL C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
+  if (lhs == NULL)
+    return rhs == NULL;
+  if (rhs == NULL)
+    return false;
+  return posix::StrCaseCmp(lhs, rhs) == 0;
+}
+
+// Compares two wide C strings, ignoring case.  Returns true iff they
+// have the same content.
+//
+// Unlike wcscasecmp(), this function can handle NULL argument(s).
+// A NULL C string is considered different to any non-NULL wide C string,
+// including the empty string.
+// NB: The implementations on different platforms slightly differ.
+// On Windows, this method uses _wcsicmp which compares according to the
+// LC_CTYPE environment variable. On GNU platforms this method uses
+// wcscasecmp which compares according to the LC_CTYPE category of the
+// current locale.
+// On Mac OS X, it uses towlower, which also uses the LC_CTYPE category of
+// the current locale.
+bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                              const wchar_t* rhs) {
+  if (lhs == NULL) return rhs == NULL;
+
+  if (rhs == NULL) return false;
+
+#if GTEST_OS_WINDOWS
+  return _wcsicmp(lhs, rhs) == 0;
+#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID
+  return wcscasecmp(lhs, rhs) == 0;
+#else
+  // Android, Mac OS X and Cygwin don't define wcscasecmp.
+  // Other unknown OSes may not define it either.
+  wint_t left, right;
+  do {
+    left = towlower(*lhs++);
+    right = towlower(*rhs++);
+  } while (left && left == right);
+  return left == right;
+#endif  // OS selector
+}
+
+// Returns true iff str ends with the given suffix, ignoring case.
+// Any string is considered to end with an empty suffix.
+bool String::EndsWithCaseInsensitive(
+    const std::string& str, const std::string& suffix) {
+  const size_t str_len = str.length();
+  const size_t suffix_len = suffix.length();
+  return (str_len >= suffix_len) &&
+         CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len,
+                                      suffix.c_str());
+}
+
+// Formats an int value as "%02d".
+std::string String::FormatIntWidth2(int value) {
+  std::stringstream ss;
+  ss << std::setfill('0') << std::setw(2) << value;
+  return ss.str();
+}
+
+// Formats an int value as "%X".
+std::string String::FormatHexInt(int value) {
+  std::stringstream ss;
+  ss << std::hex << std::uppercase << value;
+  return ss.str();
+}
+
+// Formats a byte as "%02X".
+std::string String::FormatByte(unsigned char value) {
+  std::stringstream ss;
+  ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase
+     << static_cast<unsigned int>(value);
+  return ss.str();
+}
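+
+// Sample outputs of the three formatters above (for illustration):
+//   FormatIntWidth2(7)  -> "07"
+//   FormatHexInt(255)   -> "FF"
+//   FormatByte(0x0A)    -> "0A"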
+
+// Converts the buffer in a stringstream to an std::string, converting NUL
+// bytes to "\\0" along the way.
+std::string StringStreamToString(::std::stringstream* ss) {
+  const ::std::string& str = ss->str();
+  const char* const start = str.c_str();
+  const char* const end = start + str.length();
+
+  std::string result;
+  result.reserve(2 * (end - start));
+  for (const char* ch = start; ch != end; ++ch) {
+    if (*ch == '\0') {
+      result += "\\0";  // Replaces NUL with "\\0";
+    } else {
+      result += *ch;
+    }
+  }
+
+  return result;
+}
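+
+// For example, a stream whose buffer holds the bytes 'a', '\0', 'b' is
+// converted to the four-character string "a\0b", where the NUL byte has
+// become a literal backslash followed by '0'.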
+
+// Appends the user-supplied message to the Google-Test-generated message.
+std::string AppendUserMessage(const std::string& gtest_msg,
+                              const Message& user_msg) {
+  // Appends the user message if it's non-empty.
+  const std::string user_msg_string = user_msg.GetString();
+  if (user_msg_string.empty()) {
+    return gtest_msg;
+  }
+
+  return gtest_msg + "\n" + user_msg_string;
+}
+
+}  // namespace internal
+
+// class TestResult
+
+// Creates an empty TestResult.
+TestResult::TestResult()
+    : death_test_count_(0),
+      elapsed_time_(0) {
+}
+
+// D'tor.
+TestResult::~TestResult() {
+}
+
+// Returns the i-th test part result among all the results. i can
+// range from 0 to total_part_count() - 1. If i is not in that range,
+// aborts the program.
+const TestPartResult& TestResult::GetTestPartResult(int i) const {
+  if (i < 0 || i >= total_part_count())
+    internal::posix::Abort();
+  return test_part_results_.at(i);
+}
+
+// Returns the i-th test property. i can range from 0 to
+// test_property_count() - 1. If i is not in that range, aborts the
+// program.
+const TestProperty& TestResult::GetTestProperty(int i) const {
+  if (i < 0 || i >= test_property_count())
+    internal::posix::Abort();
+  return test_properties_.at(i);
+}
+
+// Clears the test part results.
+void TestResult::ClearTestPartResults() {
+  test_part_results_.clear();
+}
+
+// Adds a test part result to the list.
+void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
+  test_part_results_.push_back(test_part_result);
+}
+
+// Adds a test property to the list. If a property with the same key as the
+// supplied property is already represented, the value of this test_property
+// replaces the old value for that key.
+void TestResult::RecordProperty(const std::string& xml_element,
+                                const TestProperty& test_property) {
+  if (!ValidateTestProperty(xml_element, test_property)) {
+    return;
+  }
+  internal::MutexLock lock(&test_properites_mutex_);
+  const std::vector<TestProperty>::iterator property_with_matching_key =
+      std::find_if(test_properties_.begin(), test_properties_.end(),
+                   internal::TestPropertyKeyIs(test_property.key()));
+  if (property_with_matching_key == test_properties_.end()) {
+    test_properties_.push_back(test_property);
+    return;
+  }
+  property_with_matching_key->SetValue(test_property.value());
+}
+
+// The list of reserved attributes used in the <testsuites> element of XML
+// output.
+static const char* const kReservedTestSuitesAttributes[] = {
+  "disabled",
+  "errors",
+  "failures",
+  "name",
+  "random_seed",
+  "tests",
+  "time",
+  "timestamp"
+};
+
+// The list of reserved attributes used in the <testsuite> element of XML
+// output.
+static const char* const kReservedTestSuiteAttributes[] = {
+  "disabled",
+  "errors",
+  "failures",
+  "name",
+  "tests",
+  "time"
+};
+
+// The list of reserved attributes used in the <testcase> element of XML output.
+static const char* const kReservedTestCaseAttributes[] = {
+  "classname",
+  "name",
+  "status",
+  "time",
+  "type_param",
+  "value_param"
+};
+
+template <int kSize>
+std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
+  return std::vector<std::string>(array, array + kSize);
+}
+
+static std::vector<std::string> GetReservedAttributesForElement(
+    const std::string& xml_element) {
+  if (xml_element == "testsuites") {
+    return ArrayAsVector(kReservedTestSuitesAttributes);
+  } else if (xml_element == "testsuite") {
+    return ArrayAsVector(kReservedTestSuiteAttributes);
+  } else if (xml_element == "testcase") {
+    return ArrayAsVector(kReservedTestCaseAttributes);
+  } else {
+    GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
+  }
+  // This code is unreachable, but some compilers may not realize that.
+  return std::vector<std::string>();
+}
+
+static std::string FormatWordList(const std::vector<std::string>& words) {
+  Message word_list;
+  for (size_t i = 0; i < words.size(); ++i) {
+    if (i > 0 && words.size() > 2) {
+      word_list << ", ";
+    }
+    if (i == words.size() - 1) {
+      word_list << "and ";
+    }
+    word_list << "'" << words[i] << "'";
+  }
+  return word_list.GetString();
+}
+
+bool ValidateTestPropertyName(const std::string& property_name,
+                              const std::vector<std::string>& reserved_names) {
+  if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
+          reserved_names.end()) {
+    ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
+                  << " (" << FormatWordList(reserved_names)
+                  << " are reserved by " << GTEST_NAME_ << ")";
+    return false;
+  }
+  return true;
+}
+
+// Adds a failure if the key is a reserved attribute of the element named
+// xml_element.  Returns true if the property is valid.
+bool TestResult::ValidateTestProperty(const std::string& xml_element,
+                                      const TestProperty& test_property) {
+  return ValidateTestPropertyName(test_property.key(),
+                                  GetReservedAttributesForElement(xml_element));
+}
+
+// Clears the object.
+void TestResult::Clear() {
+  test_part_results_.clear();
+  test_properties_.clear();
+  death_test_count_ = 0;
+  elapsed_time_ = 0;
+}
+
+// Returns true iff the test failed.
+bool TestResult::Failed() const {
+  for (int i = 0; i < total_part_count(); ++i) {
+    if (GetTestPartResult(i).failed())
+      return true;
+  }
+  return false;
+}
+
+// Returns true iff the test part fatally failed.
+static bool TestPartFatallyFailed(const TestPartResult& result) {
+  return result.fatally_failed();
+}
+
+// Returns true iff the test fatally failed.
+bool TestResult::HasFatalFailure() const {
+  return CountIf(test_part_results_, TestPartFatallyFailed) > 0;
+}
+
+// Returns true iff the test part non-fatally failed.
+static bool TestPartNonfatallyFailed(const TestPartResult& result) {
+  return result.nonfatally_failed();
+}
+
+// Returns true iff the test has a non-fatal failure.
+bool TestResult::HasNonfatalFailure() const {
+  return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0;
+}
+
+// Gets the number of all test parts.  This is the sum of the number
+// of successful test parts and the number of failed test parts.
+int TestResult::total_part_count() const {
+  return static_cast<int>(test_part_results_.size());
+}
+
+// Returns the number of the test properties.
+int TestResult::test_property_count() const {
+  return static_cast<int>(test_properties_.size());
+}
+
+// class Test
+
+// Creates a Test object.
+
+// The c'tor saves the values of all Google Test flags.
+Test::Test()
+    : gtest_flag_saver_(new internal::GTestFlagSaver) {
+}
+
+// The d'tor restores the values of all Google Test flags.
+Test::~Test() {
+  delete gtest_flag_saver_;
+}
+
+// Sets up the test fixture.
+//
+// A sub-class may override this.
+void Test::SetUp() {
+}
+
+// Tears down the test fixture.
+//
+// A sub-class may override this.
+void Test::TearDown() {
+}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const std::string& key, const std::string& value) {
+  UnitTest::GetInstance()->RecordProperty(key, value);
+}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const std::string& key, int value) {
+  Message value_message;
+  value_message << value;
+  RecordProperty(key, value_message.GetString().c_str());
+}
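+
+// A usage sketch (the fixture, test, and key names here are made up):
+//
+//   TEST_F(MyFixture, RecordsPeakMemory) {
+//     ...
+//     RecordProperty("PeakMemoryKb", 2048);  // shows up in the XML report
+//   }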
+
+namespace internal {
+
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+                                    const std::string& message) {
+  // This function is a friend of UnitTest and as such has access to
+  // AddTestPartResult.
+  UnitTest::GetInstance()->AddTestPartResult(
+      result_type,
+      NULL,  // No info about the source file where the exception occurred.
+      -1,    // We have no info on which line caused the exception.
+      message,
+      "");   // No stack trace, either.
+}
+
+}  // namespace internal
+
+// Google Test requires all tests in the same test case to use the same test
+// fixture class.  This function checks if the current test has the
+// same fixture class as the first test in the current test case.  If
+// yes, it returns true; otherwise it generates a Google Test failure and
+// returns false.
+bool Test::HasSameFixtureClass() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  const TestCase* const test_case = impl->current_test_case();
+
+  // Info about the first test in the current test case.
+  const TestInfo* const first_test_info = test_case->test_info_list()[0];
+  const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
+  const char* const first_test_name = first_test_info->name();
+
+  // Info about the current test.
+  const TestInfo* const this_test_info = impl->current_test_info();
+  const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
+  const char* const this_test_name = this_test_info->name();
+
+  if (this_fixture_id != first_fixture_id) {
+    // Is the first test defined using TEST?
+    const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId();
+    // Is this test defined using TEST?
+    const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId();
+
+    if (first_is_TEST || this_is_TEST) {
+      // The user mixed TEST and TEST_F in this test case - we'll tell
+      // him/her how to fix it.
+
+      // Gets the name of the TEST and the name of the TEST_F.  Note
+      // that first_is_TEST and this_is_TEST cannot both be true, as
+      // the fixture IDs are different for the two tests.
+      const char* const TEST_name =
+          first_is_TEST ? first_test_name : this_test_name;
+      const char* const TEST_F_name =
+          first_is_TEST ? this_test_name : first_test_name;
+
+      ADD_FAILURE()
+          << "All tests in the same test case must use the same test fixture\n"
+          << "class, so mixing TEST_F and TEST in the same test case is\n"
+          << "illegal.  In test case " << this_test_info->test_case_name()
+          << ",\n"
+          << "test " << TEST_F_name << " is defined using TEST_F but\n"
+          << "test " << TEST_name << " is defined using TEST.  You probably\n"
+          << "want to change the TEST to TEST_F or move it to another test\n"
+          << "case.";
+    } else {
+      // The user defined two fixture classes with the same name in
+      // two namespaces - we'll tell him/her how to fix it.
+      ADD_FAILURE()
+          << "All tests in the same test case must use the same test fixture\n"
+          << "class.  However, in test case "
+          << this_test_info->test_case_name() << ",\n"
+          << "you defined test " << first_test_name
+          << " and test " << this_test_name << "\n"
+          << "using two different test fixture classes.  This can happen if\n"
+          << "the two classes are from different namespaces or translation\n"
+          << "units and have the same name.  You should probably rename one\n"
+          << "of the classes to put the tests into different test cases.";
+    }
+    return false;
+  }
+
+  return true;
+}
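+
+// As an illustration, the mix this check rejects looks like (hypothetical
+// test code):
+//
+//   TEST(FooTest, DoesA) { ... }    // uses the implicit ::testing::Test
+//   TEST_F(FooTest, DoesB) { ... }  // uses a user-defined FooTest fixture
+//
+// Both tests land in test case "FooTest" but with different fixture IDs,
+// so the first ADD_FAILURE() branch above fires.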
+
+#if GTEST_HAS_SEH
+
+// Adds an "exception thrown" fatal failure to the current test.  This
+// function returns its result via an output parameter pointer because VC++
+// prohibits creation of objects with destructors on stack in functions
+// using __try (see error C2712).
+static std::string* FormatSehExceptionMessage(DWORD exception_code,
+                                              const char* location) {
+  Message message;
+  message << "SEH exception with code 0x" << std::setbase(16) <<
+    exception_code << std::setbase(10) << " thrown in " << location << ".";
+
+  return new std::string(message.GetString());
+}
+
+#endif  // GTEST_HAS_SEH
+
+namespace internal {
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Adds an "exception thrown" fatal failure to the current test.
+static std::string FormatCxxExceptionMessage(const char* description,
+                                             const char* location) {
+  Message message;
+  if (description != NULL) {
+    message << "C++ exception with description \"" << description << "\"";
+  } else {
+    message << "Unknown C++ exception";
+  }
+  message << " thrown in " << location << ".";
+
+  return message.GetString();
+}
+
+static std::string PrintTestPartResultToString(
+    const TestPartResult& test_part_result);
+
+GoogleTestFailureException::GoogleTestFailureException(
+    const TestPartResult& failure)
+    : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// We put these helper functions in the internal namespace as IBM's xlC
+// compiler rejects the code if they are declared static.
+
+// Runs the given method and handles SEH exceptions it throws, when
+// SEH is supported; returns the 0-value for type Result in case of an
+// SEH exception.  (Microsoft compilers cannot handle SEH and C++
+// exceptions in the same function.  Therefore, we provide a separate
+// wrapper function for handling SEH exceptions.)
+template <class T, typename Result>
+Result HandleSehExceptionsInMethodIfSupported(
+    T* object, Result (T::*method)(), const char* location) {
+#if GTEST_HAS_SEH
+  __try {
+    return (object->*method)();
+  } __except (internal::UnitTestOptions::GTestShouldProcessSEH(  // NOLINT
+      GetExceptionCode())) {
+    // We create the exception message on the heap because VC++ prohibits
+    // creation of objects with destructors on stack in functions using __try
+    // (see error C2712).
+    std::string* exception_message = FormatSehExceptionMessage(
+        GetExceptionCode(), location);
+    internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
+                                             *exception_message);
+    delete exception_message;
+    return static_cast<Result>(0);
+  }
+#else
+  (void)location;
+  return (object->*method)();
+#endif  // GTEST_HAS_SEH
+}
+
+// Runs the given method and catches and reports C++ and/or SEH-style
+// exceptions, if they are supported; returns the 0-value for type
+// Result in case of an SEH exception.
+template <class T, typename Result>
+Result HandleExceptionsInMethodIfSupported(
+    T* object, Result (T::*method)(), const char* location) {
+  // NOTE: The user code can affect the way in which Google Test handles
+  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
+  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
+  // after the exception is caught and either report or re-throw the
+  // exception based on the flag's value:
+  //
+  // try {
+  //   // Perform the test method.
+  // } catch (...) {
+  //   if (GTEST_FLAG(catch_exceptions))
+  //     // Report the exception as failure.
+  //   else
+  //     throw;  // Re-throws the original exception.
+  // }
+  //
+  // However, the purpose of this flag is to allow the program to drop into
+  // the debugger when the exception is thrown. On most platforms, once the
+  // control enters the catch block, the exception origin information is
+  // lost and the debugger will stop the program at the point of the
+  // re-throw in this function -- instead of at the point of the original
+  // throw statement in the code under test.  For this reason, we perform
+  // the check early, sacrificing the ability to affect Google Test's
+  // exception handling in the method where the exception is thrown.
+  if (internal::GetUnitTestImpl()->catch_exceptions()) {
+#if GTEST_HAS_EXCEPTIONS
+    try {
+      return HandleSehExceptionsInMethodIfSupported(object, method, location);
+    } catch (const internal::GoogleTestFailureException&) {  // NOLINT
+      // This exception type can only be thrown by a failed Google
+      // Test assertion with the intention of letting another testing
+      // framework catch it.  Therefore we just re-throw it.
+      throw;
+    } catch (const std::exception& e) {  // NOLINT
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(e.what(), location));
+    } catch (...) {  // NOLINT
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(NULL, location));
+    }
+    return static_cast<Result>(0);
+#else
+    return HandleSehExceptionsInMethodIfSupported(object, method, location);
+#endif  // GTEST_HAS_EXCEPTIONS
+  } else {
+    return (object->*method)();
+  }
+}
+
+}  // namespace internal
+
+// Runs the test and updates the test result.
+void Test::Run() {
+  if (!HasSameFixtureClass()) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
+  // We will run the test only if SetUp() was successful.
+  if (!HasFatalFailure()) {
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(
+        this, &Test::TestBody, "the test body");
+  }
+
+  // However, we want to clean up as much as possible.  Hence we will
+  // always call TearDown(), even if SetUp() or the test body has
+  // failed.
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &Test::TearDown, "TearDown()");
+}
+
+// Returns true iff the current test has a fatal failure.
+bool Test::HasFatalFailure() {
+  return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure();
+}
+
+// Returns true iff the current test has a non-fatal failure.
+bool Test::HasNonfatalFailure() {
+  return internal::GetUnitTestImpl()->current_test_result()->
+      HasNonfatalFailure();
+}
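+
+// A typical usage sketch: after calling a helper that may trigger a fatal
+// failure, a test can bail out early, e.g.
+//
+//   SubroutineThatMayAssert();        // hypothetical helper
+//   if (HasFatalFailure()) return;    // stop before using invalid state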
+
+// class TestInfo
+
+// Constructs a TestInfo object. It assumes ownership of the test factory
+// object.
+TestInfo::TestInfo(const std::string& a_test_case_name,
+                   const std::string& a_name,
+                   const char* a_type_param,
+                   const char* a_value_param,
+                   internal::TypeId fixture_class_id,
+                   internal::TestFactoryBase* factory)
+    : test_case_name_(a_test_case_name),
+      name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
+      value_param_(a_value_param ? new std::string(a_value_param) : NULL),
+      fixture_class_id_(fixture_class_id),
+      should_run_(false),
+      is_disabled_(false),
+      matches_filter_(false),
+      factory_(factory),
+      result_() {}
+
+// Destructs a TestInfo object.
+TestInfo::~TestInfo() { delete factory_; }
+
+namespace internal {
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+//   test_case_name:   name of the test case
+//   name:             name of the test
+//   type_param:       the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param:      text representation of the test's value parameter,
+//                     or NULL if this is not a value-parameterized test.
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test case
+//   tear_down_tc:     pointer to the function that tears down the test case
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+TestInfo* MakeAndRegisterTestInfo(
+    const char* test_case_name,
+    const char* name,
+    const char* type_param,
+    const char* value_param,
+    TypeId fixture_class_id,
+    SetUpTestCaseFunc set_up_tc,
+    TearDownTestCaseFunc tear_down_tc,
+    TestFactoryBase* factory) {
+  TestInfo* const test_info =
+      new TestInfo(test_case_name, name, type_param, value_param,
+                   fixture_class_id, factory);
+  GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info);
+  return test_info;
+}
+
+#if GTEST_HAS_PARAM_TEST
+void ReportInvalidTestCaseType(const char* test_case_name,
+                               const char* file, int line) {
+  Message errors;
+  errors
+      << "Attempted redefinition of test case " << test_case_name << ".\n"
+      << "All tests in the same test case must use the same test fixture\n"
+      << "class.  However, in test case " << test_case_name << ", you tried\n"
+      << "to define a test using a fixture class different from the one\n"
+      << "used earlier. This can happen if the two fixture classes are\n"
+      << "from different namespaces and have the same name. You should\n"
+      << "probably rename one of the classes to put the tests into different\n"
+      << "test cases.";
+
+  fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
+          errors.GetString().c_str());
+}
+#endif  // GTEST_HAS_PARAM_TEST
+
+}  // namespace internal
+
+namespace {
+
+// A predicate that checks the test name of a TestInfo against a known
+// value.
+//
+// This is used for implementation of the TestCase class only.  We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestNameIs is copyable.
+
+// Commenting out this class since it's not used and therefore produces
+// warnings.
+// class TestNameIs {
+// public:
+//  // Constructor.
+//  //
+//  // TestNameIs has NO default constructor.
+//  explicit TestNameIs(const char* name)
+//      : name_(name) {}
+//
+//  // Returns true iff the test name of test_info matches name_.
+//  bool operator()(const TestInfo * test_info) const {
+//    return test_info && test_info->name() == name_;
+//  }
+//
+// private:
+//  std::string name_;
+//};
+
+}  // namespace
+
+namespace internal {
+
+// This method expands all parameterized tests registered with macros TEST_P
+// and INSTANTIATE_TEST_CASE_P into regular tests and registers those.
+// This will be done just once during the program runtime.
+void UnitTestImpl::RegisterParameterizedTests() {
+#if GTEST_HAS_PARAM_TEST
+  if (!parameterized_tests_registered_) {
+    parameterized_test_registry_.RegisterTests();
+    parameterized_tests_registered_ = true;
+  }
+#endif
+}
+
+}  // namespace internal
+
+// Creates the test object, runs it, records its result, and then
+// deletes it.
+void TestInfo::Run() {
+  if (!should_run_) return;
+
+  // Tells UnitTest where to store test result.
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Notifies the unit test event listeners that a test is about to start.
+  repeater->OnTestStart(*this);
+
+  const TimeInMillis start = internal::GetTimeInMillis();
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+
+  // Creates the test object.
+  Test* const test = internal::HandleExceptionsInMethodIfSupported(
+      factory_, &internal::TestFactoryBase::CreateTest,
+      "the test fixture's constructor");
+
+  // Runs the test only if the test object was created and its
+  // constructor didn't generate a fatal failure.
+  if ((test != NULL) && !Test::HasFatalFailure()) {
+    // This doesn't throw as all user code that can throw is wrapped into
+    // exception handling code.
+    test->Run();
+  }
+
+  // Deletes the test object.
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      test, &Test::DeleteSelf_, "the test fixture's destructor");
+
+  result_.set_elapsed_time(internal::GetTimeInMillis() - start);
+
+  // Notifies the unit test event listener that a test has just finished.
+  repeater->OnTestEnd(*this);
+
+  // Tells UnitTest to stop associating assertion results to this
+  // test.
+  impl->set_current_test_info(NULL);
+}
+
+// class TestCase
+
+// Gets the number of successful tests in this test case.
+int TestCase::successful_test_count() const {
+  return CountIf(test_info_list_, TestPassed);
+}
+
+// Gets the number of failed tests in this test case.
+int TestCase::failed_test_count() const {
+  return CountIf(test_info_list_, TestFailed);
+}
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int TestCase::reportable_disabled_test_count() const {
+  return CountIf(test_info_list_, TestReportableDisabled);
+}
+
+// Gets the number of disabled tests in this test case.
+int TestCase::disabled_test_count() const {
+  return CountIf(test_info_list_, TestDisabled);
+}
+
+// Gets the number of tests to be printed in the XML report.
+int TestCase::reportable_test_count() const {
+  return CountIf(test_info_list_, TestReportable);
+}
+
+// Get the number of tests in this test case that should run.
+int TestCase::test_to_run_count() const {
+  return CountIf(test_info_list_, ShouldRunTest);
+}
+
+// Gets the number of all tests.
+int TestCase::total_test_count() const {
+  return static_cast<int>(test_info_list_.size());
+}
+
+// Creates a TestCase with the given name.
+//
+// Arguments:
+//
+//   name:         name of the test case
+//   a_type_param: the name of the test case's type parameter, or NULL if
+//                 this is not a typed or a type-parameterized test case.
+//   set_up_tc:    pointer to the function that sets up the test case
+//   tear_down_tc: pointer to the function that tears down the test case
+TestCase::TestCase(const char* a_name, const char* a_type_param,
+                   Test::SetUpTestCaseFunc set_up_tc,
+                   Test::TearDownTestCaseFunc tear_down_tc)
+    : name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
+      set_up_tc_(set_up_tc),
+      tear_down_tc_(tear_down_tc),
+      should_run_(false),
+      elapsed_time_(0) {
+}
+
+// Destructor of TestCase.
+TestCase::~TestCase() {
+  // Deletes every Test in the collection.
+  ForEach(test_info_list_, internal::Delete<TestInfo>);
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+const TestInfo* TestCase::GetTestInfo(int i) const {
+  const int index = GetElementOr(test_indices_, i, -1);
+  return index < 0 ? NULL : test_info_list_[index];
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+TestInfo* TestCase::GetMutableTestInfo(int i) {
+  const int index = GetElementOr(test_indices_, i, -1);
+  return index < 0 ? NULL : test_info_list_[index];
+}
+
+// Adds a test to this test case.  Will delete the test upon
+// destruction of the TestCase object.
+void TestCase::AddTestInfo(TestInfo * test_info) {
+  test_info_list_.push_back(test_info);
+  test_indices_.push_back(static_cast<int>(test_indices_.size()));
+}
+
+// Runs every test in this TestCase.
+void TestCase::Run() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_case(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  repeater->OnTestCaseStart(*this);
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestCase::RunSetUpTestCase, "SetUpTestCase()");
+
+  const internal::TimeInMillis start = internal::GetTimeInMillis();
+  for (int i = 0; i < total_test_count(); i++) {
+    GetMutableTestInfo(i)->Run();
+  }
+  elapsed_time_ = internal::GetTimeInMillis() - start;
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestCase::RunTearDownTestCase, "TearDownTestCase()");
+
+  repeater->OnTestCaseEnd(*this);
+  impl->set_current_test_case(NULL);
+}
+
+// Clears the results of all tests in this test case.
+void TestCase::ClearResult() {
+  ad_hoc_test_result_.Clear();
+  ForEach(test_info_list_, TestInfo::ClearTestResult);
+}
+
+// Shuffles the tests in this test case.
+void TestCase::ShuffleTests(internal::Random* random) {
+  Shuffle(random, &test_indices_);
+}
+
+// Restores the test order to before the first shuffle.
+void TestCase::UnshuffleTests() {
+  for (size_t i = 0; i < test_indices_.size(); i++) {
+    test_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Formats a countable noun.  Depending on its quantity, either the
+// singular form or the plural form is used. e.g.
+//
+// FormatCountableNoun(1, "formula", "formuli") returns "1 formula".
+// FormatCountableNoun(5, "book", "books") returns "5 books".
+static std::string FormatCountableNoun(int count,
+                                       const char * singular_form,
+                                       const char * plural_form) {
+  return internal::StreamableToString(count) + " " +
+      (count == 1 ? singular_form : plural_form);
+}
+
+// Formats the count of tests.
+static std::string FormatTestCount(int test_count) {
+  return FormatCountableNoun(test_count, "test", "tests");
+}
+
+// Formats the count of test cases.
+static std::string FormatTestCaseCount(int test_case_count) {
+  return FormatCountableNoun(test_case_count, "test case", "test cases");
+}
+
+// Converts a TestPartResult::Type enum to a human-friendly string
+// representation.  Both kNonFatalFailure and kFatalFailure are translated
+// to "Failure", as the user usually doesn't care about the difference
+// between the two when viewing the test result.
+static const char * TestPartResultTypeToString(TestPartResult::Type type) {
+  switch (type) {
+    case TestPartResult::kSuccess:
+      return "Success";
+
+    case TestPartResult::kNonFatalFailure:
+    case TestPartResult::kFatalFailure:
+#ifdef _MSC_VER
+      return "error: ";
+#else
+      return "Failure\n";
+#endif
+    default:
+      return "Unknown result type";
+  }
+}
+
+namespace internal {
+
+// Prints a TestPartResult to an std::string.
+static std::string PrintTestPartResultToString(
+    const TestPartResult& test_part_result) {
+  return (Message()
+          << internal::FormatFileLocation(test_part_result.file_name(),
+                                          test_part_result.line_number())
+          << " " << TestPartResultTypeToString(test_part_result.type())
+          << test_part_result.message()).GetString();
+}
+
+// Prints a TestPartResult.
+static void PrintTestPartResult(const TestPartResult& test_part_result) {
+  const std::string& result =
+      PrintTestPartResultToString(test_part_result);
+  printf("%s\n", result.c_str());
+  fflush(stdout);
+  // If the test program runs in Visual Studio or a debugger, the
+  // following statements add the test part result message to the Output
+  // window such that the user can double-click on it to jump to the
+  // corresponding source code location; otherwise they do nothing.
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  // We don't call OutputDebugString*() on Windows Mobile, as printing
+  // to stdout is done by OutputDebugString() there already - we don't
+  // want the same message printed twice.
+  ::OutputDebugStringA(result.c_str());
+  ::OutputDebugStringA("\n");
+#endif
+}
+
+// class PrettyUnitTestResultPrinter
+
+enum GTestColor {
+  COLOR_DEFAULT,
+  COLOR_RED,
+  COLOR_GREEN,
+  COLOR_YELLOW
+};
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+
+// Returns the character attribute for the given color.
+WORD GetColorAttribute(GTestColor color) {
+  switch (color) {
+    case COLOR_RED:    return FOREGROUND_RED;
+    case COLOR_GREEN:  return FOREGROUND_GREEN;
+    case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
+    default:           return 0;
+  }
+}
+
+#else
+
+// Returns the ANSI color code for the given color.  COLOR_DEFAULT is
+// an invalid input.
+const char* GetAnsiColorCode(GTestColor color) {
+  switch (color) {
+    case COLOR_RED:     return "1";
+    case COLOR_GREEN:   return "2";
+    case COLOR_YELLOW:  return "3";
+    default:            return NULL;
+  };
+}
+
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+
+// Returns true iff Google Test should use colors in the output.
+bool ShouldUseColor(bool stdout_is_tty) {
+  const char* const gtest_color = GTEST_FLAG(color).c_str();
+
+  if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
+#if GTEST_OS_WINDOWS
+    // On Windows the TERM variable is usually not set, but the
+    // console there does support colors.
+    return stdout_is_tty;
+#else
+    // On non-Windows platforms, we rely on the TERM variable.
+    const char* const term = posix::GetEnv("TERM");
+    const bool term_supports_color =
+        String::CStringEquals(term, "xterm") ||
+        String::CStringEquals(term, "xterm-color") ||
+        String::CStringEquals(term, "xterm-256color") ||
+        String::CStringEquals(term, "screen") ||
+        String::CStringEquals(term, "screen-256color") ||
+        String::CStringEquals(term, "linux") ||
+        String::CStringEquals(term, "cygwin");
+    return stdout_is_tty && term_supports_color;
+#endif  // GTEST_OS_WINDOWS
+  }
+
+  return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
+      String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+      String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+      String::CStringEquals(gtest_color, "1");
+  // We take "yes", "true", "t", and "1" as meaning "yes".  If the
+  // value is neither one of these nor "auto", we treat it as "no" to
+  // be conservative.
+}
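+
+// For reference, the color flag consulted above is usually set on the
+// command line or via the environment, e.g. (binary name made up):
+//
+//   ./my_test --gtest_color=yes
+//   GTEST_COLOR=no ./my_test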
+
+// Helpers for printing colored strings to stdout. Note that on Windows, we
+// cannot simply emit special characters and have the terminal change colors.
+// This routine must actually emit the characters rather than return a string
+// that would be colored when printed, as can be done on Linux.
+void ColoredPrintf(GTestColor color, const char* fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || GTEST_OS_IOS
+  const bool use_color = false;
+#else
+  static const bool in_color_mode =
+      ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
+  const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
+#endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS
+  // The '!= 0' comparison is necessary to satisfy MSVC 7.1.
+
+  if (!use_color) {
+    vprintf(fmt, args);
+    va_end(args);
+    return;
+  }
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
+
+  // Gets the current text color.
+  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
+  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
+  const WORD old_color_attrs = buffer_info.wAttributes;
+
+  // We need to flush the stream buffers into the console before each
+  // SetConsoleTextAttribute call lest it affect the text that is already
+  // printed but has not yet reached the console.
+  fflush(stdout);
+  SetConsoleTextAttribute(stdout_handle,
+                          GetColorAttribute(color) | FOREGROUND_INTENSITY);
+  vprintf(fmt, args);
+
+  fflush(stdout);
+  // Restores the text color.
+  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+#else
+  printf("\033[0;3%sm", GetAnsiColorCode(color));
+  vprintf(fmt, args);
+  printf("\033[m");  // Resets the terminal to default.
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  va_end(args);
+}
+
+// Text printed in Google Test's text output and --gtest_list_tests
+// output to label the type parameter and value parameter for a test.
+static const char kTypeParamLabel[] = "TypeParam";
+static const char kValueParamLabel[] = "GetParam()";
+
+void PrintFullTestCommentIfPresent(const TestInfo& test_info) {
+  const char* const type_param = test_info.type_param();
+  const char* const value_param = test_info.value_param();
+
+  if (type_param != NULL || value_param != NULL) {
+    printf(", where ");
+    if (type_param != NULL) {
+      printf("%s = %s", kTypeParamLabel, type_param);
+      if (value_param != NULL)
+        printf(" and ");
+    }
+    if (value_param != NULL) {
+      printf("%s = %s", kValueParamLabel, value_param);
+    }
+  }
+}
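+
+// Example output: for a test instantiated with TypeParam = int and a value
+// parameter that prints as 4, this appends
+//   ", where TypeParam = int and GetParam() = 4"
+// to the test name.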
+
+// This class implements the TestEventListener interface.
+//
+// Class PrettyUnitTestResultPrinter is copyable.
+class PrettyUnitTestResultPrinter : public TestEventListener {
+ public:
+  PrettyUnitTestResultPrinter() {}
+  static void PrintTestName(const char * test_case, const char * test) {
+    printf("%s.%s", test_case, test);
+  }
+
+  // The following methods override what's in the TestEventListener class.
+  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestCaseStart(const TestCase& test_case);
+  virtual void OnTestStart(const TestInfo& test_info);
+  virtual void OnTestPartResult(const TestPartResult& result);
+  virtual void OnTestEnd(const TestInfo& test_info);
+  virtual void OnTestCaseEnd(const TestCase& test_case);
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+
+ private:
+  static void PrintFailedTests(const UnitTest& unit_test);
+};
+
+// Fired before each iteration of tests starts.
+void PrettyUnitTestResultPrinter::OnTestIterationStart(
+    const UnitTest& unit_test, int iteration) {
+  if (GTEST_FLAG(repeat) != 1)
+    printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
+
+  const char* const filter = GTEST_FLAG(filter).c_str();
+
+  // Prints the filter if it's not *.  This reminds the user that some
+  // tests may be skipped.
+  if (!String::CStringEquals(filter, kUniversalFilter)) {
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: %s filter = %s\n", GTEST_NAME_, filter);
+  }
+
+  if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
+    const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: This is test shard %d of %s.\n",
+                  static_cast<int>(shard_index) + 1,
+                  internal::posix::GetEnv(kTestTotalShards));
+  }
+
+  if (GTEST_FLAG(shuffle)) {
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: Randomizing tests' orders with a seed of %d .\n",
+                  unit_test.random_seed());
+  }
+
+  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  printf("Running %s from %s.\n",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+  printf("Global test environment set-up.\n");
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
+  const std::string counts =
+      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s", counts.c_str(), test_case.name());
+  if (test_case.type_param() == NULL) {
+    printf("\n");
+  } else {
+    printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param());
+  }
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
+  ColoredPrintf(COLOR_GREEN,  "[ RUN      ] ");
+  PrintTestName(test_info.test_case_name(), test_info.name());
+  printf("\n");
+  fflush(stdout);
+}
+
+// Called after an assertion failure.
+void PrettyUnitTestResultPrinter::OnTestPartResult(
+    const TestPartResult& result) {
+  // If the test part succeeded, we don't need to do anything.
+  if (result.type() == TestPartResult::kSuccess)
+    return;
+
+  // Print failure message from the assertion (e.g. expected this and got that).
+  PrintTestPartResult(result);
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
+  if (test_info.result()->Passed()) {
+    ColoredPrintf(COLOR_GREEN, "[       OK ] ");
+  } else {
+    ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+  }
+  PrintTestName(test_info.test_case_name(), test_info.name());
+  if (test_info.result()->Failed())
+    PrintFullTestCommentIfPresent(test_info);
+
+  if (GTEST_FLAG(print_time)) {
+    printf(" (%s ms)\n", internal::StreamableToString(
+           test_info.result()->elapsed_time()).c_str());
+  } else {
+    printf("\n");
+  }
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
+  if (!GTEST_FLAG(print_time)) return;
+
+  const std::string counts =
+      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s (%s ms total)\n\n",
+         counts.c_str(), test_case.name(),
+         internal::StreamableToString(test_case.elapsed_time()).c_str());
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+  printf("Global test environment tear-down\n");
+  fflush(stdout);
+}
+
+// Internal helper for printing the list of failed tests.
+void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
+  const int failed_test_count = unit_test.failed_test_count();
+  if (failed_test_count == 0) {
+    return;
+  }
+
+  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+    const TestCase& test_case = *unit_test.GetTestCase(i);
+    if (!test_case.should_run() || (test_case.failed_test_count() == 0)) {
+      continue;
+    }
+    for (int j = 0; j < test_case.total_test_count(); ++j) {
+      const TestInfo& test_info = *test_case.GetTestInfo(j);
+      if (!test_info.should_run() || test_info.result()->Passed()) {
+        continue;
+      }
+      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+      printf("%s.%s", test_case.name(), test_info.name());
+      PrintFullTestCommentIfPresent(test_info);
+      printf("\n");
+    }
+  }
+}
+
+void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                     int /*iteration*/) {
+  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  printf("%s from %s ran.",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
+  if (GTEST_FLAG(print_time)) {
+    printf(" (%s ms total)",
+           internal::StreamableToString(unit_test.elapsed_time()).c_str());
+  }
+  printf("\n");
+  ColoredPrintf(COLOR_GREEN,  "[  PASSED  ] ");
+  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+  int num_failures = unit_test.failed_test_count();
+  if (!unit_test.Passed()) {
+    const int failed_test_count = unit_test.failed_test_count();
+    ColoredPrintf(COLOR_RED,  "[  FAILED  ] ");
+    printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
+    PrintFailedTests(unit_test);
+    printf("\n%2d FAILED %s\n", num_failures,
+                        num_failures == 1 ? "TEST" : "TESTS");
+  }
+
+  int num_disabled = unit_test.reportable_disabled_test_count();
+  if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+    if (!num_failures) {
+      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
+    }
+    ColoredPrintf(COLOR_YELLOW,
+                  "  YOU HAVE %d DISABLED %s\n\n",
+                  num_disabled,
+                  num_disabled == 1 ? "TEST" : "TESTS");
+  }
+  // Ensure that Google Test output is printed before, e.g., heapchecker output.
+  fflush(stdout);
+}
+
+// End PrettyUnitTestResultPrinter
+
+// class TestEventRepeater
+//
+// This class forwards events to other event listeners.
+class TestEventRepeater : public TestEventListener {
+ public:
+  TestEventRepeater() : forwarding_enabled_(true) {}
+  virtual ~TestEventRepeater();
+  void Append(TestEventListener *listener);
+  TestEventListener* Release(TestEventListener* listener);
+
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled() const { return forwarding_enabled_; }
+  void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
+
+  virtual void OnTestProgramStart(const UnitTest& unit_test);
+  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test);
+  virtual void OnTestCaseStart(const TestCase& test_case);
+  virtual void OnTestStart(const TestInfo& test_info);
+  virtual void OnTestPartResult(const TestPartResult& result);
+  virtual void OnTestEnd(const TestInfo& test_info);
+  virtual void OnTestCaseEnd(const TestCase& test_case);
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test);
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  virtual void OnTestProgramEnd(const UnitTest& unit_test);
+
+ private:
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled_;
+  // The list of listeners that receive events.
+  std::vector<TestEventListener*> listeners_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+};
+
+TestEventRepeater::~TestEventRepeater() {
+  ForEach(listeners_, Delete<TestEventListener>);
+}
+
+void TestEventRepeater::Append(TestEventListener *listener) {
+  listeners_.push_back(listener);
+}
+
+// TODO(vladl@google.com): Factor the search functionality into Vector::Find.
+TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
+  for (size_t i = 0; i < listeners_.size(); ++i) {
+    if (listeners_[i] == listener) {
+      listeners_.erase(listeners_.begin() + i);
+      return listener;
+    }
+  }
+
+  return NULL;
+}
+
+// Since most methods are very similar, use macros to reduce boilerplate.
+// This defines a member that forwards the call to all listeners.
+#define GTEST_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (size_t i = 0; i < listeners_.size(); i++) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
+// This defines a member that forwards the call to all listeners in reverse
+// order.
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
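+// For instance (illustrative expansion), GTEST_REPEATER_METHOD_(OnTestStart,
+// TestInfo) defines TestEventRepeater::OnTestStart(const TestInfo&), which
+// forwards the call to every registered listener in order of registration,
+// while the REVERSE variant walks the listener list back to front.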
+
+GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase)
+GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
+GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
+
+#undef GTEST_REPEATER_METHOD_
+#undef GTEST_REVERSE_REPEATER_METHOD_
+
+void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
+                                             int iteration) {
+  if (forwarding_enabled_) {
+    for (size_t i = 0; i < listeners_.size(); i++) {
+      listeners_[i]->OnTestIterationStart(unit_test, iteration);
+    }
+  }
+}
+
+void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
+                                           int iteration) {
+  if (forwarding_enabled_) {
+    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) {
+      listeners_[i]->OnTestIterationEnd(unit_test, iteration);
+    }
+  }
+}
+
+// End TestEventRepeater
+
+// This class generates an XML output file.
+class XmlUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  explicit XmlUnitTestResultPrinter(const char* output_file);
+
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+
+ private:
+  // Is c a whitespace character that is normalized to a space character
+  // when it appears in an XML attribute value?
+  static bool IsNormalizableWhitespace(char c) {
+    return c == 0x9 || c == 0xA || c == 0xD;
+  }
+
+  // May c appear in a well-formed XML document?
+  static bool IsValidXmlCharacter(char c) {
+    return IsNormalizableWhitespace(c) || c >= 0x20;
+  }
+
+  // Returns an XML-escaped copy of the input string str.  If
+  // is_attribute is true, the text is meant to appear as an attribute
+  // value, and normalizable whitespace is preserved by replacing it
+  // with character references.
+  static std::string EscapeXml(const std::string& str, bool is_attribute);
+
+  // Returns the given string with all characters invalid in XML removed.
+  static std::string RemoveInvalidXmlCharacters(const std::string& str);
+
+  // Convenience wrapper around EscapeXml when str is an attribute value.
+  static std::string EscapeXmlAttribute(const std::string& str) {
+    return EscapeXml(str, true);
+  }
+
+  // Convenience wrapper around EscapeXml when str is not an attribute value.
+  static std::string EscapeXmlText(const char* str) {
+    return EscapeXml(str, false);
+  }
+
+  // Verifies that the given attribute belongs to the given element and
+  // streams the attribute as XML.
+  static void OutputXmlAttribute(std::ostream* stream,
+                                 const std::string& element_name,
+                                 const std::string& name,
+                                 const std::string& value);
+
+  // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+  static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
+
+  // Streams an XML representation of a TestInfo object.
+  static void OutputXmlTestInfo(::std::ostream* stream,
+                                const char* test_case_name,
+                                const TestInfo& test_info);
+
+  // Prints an XML representation of a TestCase object
+  static void PrintXmlTestCase(::std::ostream* stream,
+                               const TestCase& test_case);
+
+  // Prints an XML summary of unit_test to output stream out.
+  static void PrintXmlUnitTest(::std::ostream* stream,
+                               const UnitTest& unit_test);
+
+  // Produces a string representing the test properties in a result as space
+  // delimited XML attributes based on the property key="value" pairs.
+  // When the std::string is not empty, it includes a space at the beginning,
+  // to delimit this attribute from prior attributes.
+  static std::string TestPropertiesAsXmlAttributes(const TestResult& result);
+
+  // The output file.
+  const std::string output_file_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
+};
+
+// Creates a new XmlUnitTestResultPrinter.
+XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file)
+    : output_file_(output_file) {
+  if (output_file_.c_str() == NULL || output_file_.empty()) {
+    fprintf(stderr, "XML output file may not be null\n");
+    fflush(stderr);
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Called after the unit test ends.
+void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                  int /*iteration*/) {
+  FILE* xmlout = NULL;
+  FilePath output_file(output_file_);
+  FilePath output_dir(output_file.RemoveFileName());
+
+  if (output_dir.CreateDirectoriesRecursively()) {
+    xmlout = posix::FOpen(output_file_.c_str(), "w");
+  }
+  if (xmlout == NULL) {
+    // TODO(wan): report the reason of the failure.
+    //
+    // We don't do it for now as:
+    //
+    //   1. There is no urgent need for it.
+    //   2. It's a bit involved to make the errno variable thread-safe on
+    //      all three operating systems (Linux, Windows, and Mac OS).
+    //   3. To interpret the meaning of errno in a thread-safe way,
+    //      we need the strerror_r() function, which is not available on
+    //      Windows.
+    fprintf(stderr,
+            "Unable to open file \"%s\"\n",
+            output_file_.c_str());
+    fflush(stderr);
+    exit(EXIT_FAILURE);
+  }
+  std::stringstream stream;
+  PrintXmlUnitTest(&stream, unit_test);
+  fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
+  fclose(xmlout);
+}
+
+// Returns an XML-escaped copy of the input string str.  If is_attribute
+// is true, the text is meant to appear as an attribute value, and
+// normalizable whitespace is preserved by replacing it with character
+// references.
+//
+// Invalid XML characters in str, if any, are stripped from the output.
+// It is expected that most, if not all, of the text processed by this
+// module will consist of ordinary English text.
+// If this module is ever modified to produce version 1.1 XML output,
+// most invalid characters can be retained using character references.
+// TODO(wan): It might be nice to have a minimally invasive, human-readable
+// escaping scheme for invalid characters, rather than dropping them.
+std::string XmlUnitTestResultPrinter::EscapeXml(
+    const std::string& str, bool is_attribute) {
+  Message m;
+
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    switch (ch) {
+      case '<':
+        m << "&lt;";
+        break;
+      case '>':
+        m << "&gt;";
+        break;
+      case '&':
+        m << "&amp;";
+        break;
+      case '\'':
+        if (is_attribute)
+          m << "&apos;";
+        else
+          m << '\'';
+        break;
+      case '"':
+        if (is_attribute)
+          m << "&quot;";
+        else
+          m << '"';
+        break;
+      default:
+        if (IsValidXmlCharacter(ch)) {
+          if (is_attribute && IsNormalizableWhitespace(ch))
+            m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
+              << ";";
+          else
+            m << ch;
+        }
+        break;
+    }
+  }
+
+  return m.GetString();
+}
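+// For example, EscapeXml("a<b & \"c\"", true) would return
+// "a&lt;b &amp; &quot;c&quot;" (an illustrative case, not an exhaustive one).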
+
+// Returns the given string with all characters invalid in XML removed.
+// Currently invalid characters are dropped from the string. An
+// alternative is to replace them with certain characters such as . or ?.
+std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
+    const std::string& str) {
+  std::string output;
+  output.reserve(str.size());
+  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
+    if (IsValidXmlCharacter(*it))
+      output.push_back(*it);
+
+  return output;
+}
+
+// The following routines generate an XML representation of a UnitTest
+// object.
+//
+// This is how Google Test concepts map to the DTD:
+//
+// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
+//   <testsuite name="testcase-name">  <-- corresponds to a TestCase object
+//     <testcase name="test-name">     <-- corresponds to a TestInfo object
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//                                     <-- individual assertion failures
+//     </testcase>
+//   </testsuite>
+// </testsuites>
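+//
+// An illustrative (hand-written, abbreviated) document produced by this
+// printer might look like:
+//
+//   <?xml version="1.0" encoding="UTF-8"?>
+//   <testsuites tests="1" failures="0" disabled="0" errors="0"
+//               timestamp="2018-03-07T12:00:00" time="0.01" name="AllTests">
+//     <testsuite name="FooTest" tests="1" failures="0" disabled="0"
+//                errors="0" time="0.01">
+//       <testcase name="Bar" status="run" time="0.01" classname="FooTest" />
+//     </testsuite>
+//   </testsuites>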
+
+// Formats the given time in milliseconds as seconds.
+std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
+  ::std::stringstream ss;
+  ss << ms/1000.0;
+  return ss.str();
+}
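+// For example, FormatTimeInMillisAsSeconds(2500) returns "2.5" and
+// FormatTimeInMillisAsSeconds(3) returns "0.003".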
+
+// Converts the given epoch time in milliseconds to a date string in the ISO
+// 8601 format, without the timezone information.
+std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
+  // Using non-reentrant version as localtime_r is not portable.
+  time_t seconds = static_cast<time_t>(ms / 1000);
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4996)  // Temporarily disables warning 4996
+                                // (function or variable may be unsafe).
+  const struct tm* const time_struct = localtime(&seconds);  // NOLINT
+# pragma warning(pop)           // Restores the warning state again.
+#else
+  const struct tm* const time_struct = localtime(&seconds);  // NOLINT
+#endif
+  if (time_struct == NULL)
+    return "";  // Invalid ms value
+
+  // YYYY-MM-DDThh:mm:ss
+  return StreamableToString(time_struct->tm_year + 1900) + "-" +
+      String::FormatIntWidth2(time_struct->tm_mon + 1) + "-" +
+      String::FormatIntWidth2(time_struct->tm_mday) + "T" +
+      String::FormatIntWidth2(time_struct->tm_hour) + ":" +
+      String::FormatIntWidth2(time_struct->tm_min) + ":" +
+      String::FormatIntWidth2(time_struct->tm_sec);
+}
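+// For example, for a process whose local time zone is UTC, an input of
+// 1520424000000 (ms since the epoch) is formatted as "2018-03-07T12:00:00".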
+
+// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
+                                                     const char* data) {
+  const char* segment = data;
+  *stream << "<![CDATA[";
+  for (;;) {
+    const char* const next_segment = strstr(segment, "]]>");
+    if (next_segment != NULL) {
+      stream->write(
+          segment, static_cast<std::streamsize>(next_segment - segment));
+      *stream << "]]>]]&gt;<![CDATA[";
+      segment = next_segment + strlen("]]>");
+    } else {
+      *stream << segment;
+      break;
+    }
+  }
+  *stream << "]]>";
+}
+
+void XmlUnitTestResultPrinter::OutputXmlAttribute(
+    std::ostream* stream,
+    const std::string& element_name,
+    const std::string& name,
+    const std::string& value) {
+  const std::vector<std::string>& allowed_names =
+      GetReservedAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+                   allowed_names.end())
+      << "Attribute " << name << " is not allowed for element <" << element_name
+      << ">.";
+
+  *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
+}
+
+// Prints an XML representation of a TestInfo object.
+// TODO(wan): There is also value in printing properties with the plain printer.
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
+                                                 const char* test_case_name,
+                                                 const TestInfo& test_info) {
+  const TestResult& result = *test_info.result();
+  const std::string kTestcase = "testcase";
+
+  *stream << "    <testcase";
+  OutputXmlAttribute(stream, kTestcase, "name", test_info.name());
+
+  if (test_info.value_param() != NULL) {
+    OutputXmlAttribute(stream, kTestcase, "value_param",
+                       test_info.value_param());
+  }
+  if (test_info.type_param() != NULL) {
+    OutputXmlAttribute(stream, kTestcase, "type_param", test_info.type_param());
+  }
+
+  OutputXmlAttribute(stream, kTestcase, "status",
+                     test_info.should_run() ? "run" : "notrun");
+  OutputXmlAttribute(stream, kTestcase, "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(stream, kTestcase, "classname", test_case_name);
+  *stream << TestPropertiesAsXmlAttributes(result);
+
+  int failures = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult& part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      if (++failures == 1) {
+        *stream << ">\n";
+      }
+      const string location = internal::FormatCompilerIndependentFileLocation(
+          part.file_name(), part.line_number());
+      const string summary = location + "\n" + part.summary();
+      *stream << "      <failure message=\""
+              << EscapeXmlAttribute(summary.c_str())
+              << "\" type=\"\">";
+      const string detail = location + "\n" + part.message();
+      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+      *stream << "</failure>\n";
+    }
+  }
+
+  if (failures == 0)
+    *stream << " />\n";
+  else
+    *stream << "    </testcase>\n";
+}
+
+// Prints an XML representation of a TestCase object
+void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream,
+                                                const TestCase& test_case) {
+  const std::string kTestsuite = "testsuite";
+  *stream << "  <" << kTestsuite;
+  OutputXmlAttribute(stream, kTestsuite, "name", test_case.name());
+  OutputXmlAttribute(stream, kTestsuite, "tests",
+                     StreamableToString(test_case.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuite, "failures",
+                     StreamableToString(test_case.failed_test_count()));
+  OutputXmlAttribute(
+      stream, kTestsuite, "disabled",
+      StreamableToString(test_case.reportable_disabled_test_count()));
+  OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+  OutputXmlAttribute(stream, kTestsuite, "time",
+                     FormatTimeInMillisAsSeconds(test_case.elapsed_time()));
+  *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result())
+          << ">\n";
+
+  for (int i = 0; i < test_case.total_test_count(); ++i) {
+    if (test_case.GetTestInfo(i)->is_reportable())
+      OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i));
+  }
+  *stream << "  </" << kTestsuite << ">\n";
+}
+
+// Prints an XML summary of unit_test to output stream out.
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
+                                                const UnitTest& unit_test) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(unit_test.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "failures",
+                     StreamableToString(unit_test.failed_test_count()));
+  OutputXmlAttribute(
+      stream, kTestsuites, "disabled",
+      StreamableToString(unit_test.reportable_disabled_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "errors", "0");
+  OutputXmlAttribute(
+      stream, kTestsuites, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
+  OutputXmlAttribute(stream, kTestsuites, "time",
+                     FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
+
+  if (GTEST_FLAG(shuffle)) {
+    OutputXmlAttribute(stream, kTestsuites, "random_seed",
+                       StreamableToString(unit_test.random_seed()));
+  }
+
+  *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result());
+
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+    if (unit_test.GetTestCase(i)->reportable_test_count() > 0)
+      PrintXmlTestCase(stream, *unit_test.GetTestCase(i));
+  }
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+// Produces a string representing the test properties in a result as space
+// delimited XML attributes based on the property key="value" pairs.
+std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
+    const TestResult& result) {
+  Message attributes;
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty& property = result.GetTestProperty(i);
+    attributes << " " << property.key() << "="
+        << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
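+// For example, a TestResult holding the single (hypothetical) property
+// ("mem_kb", "512") would yield the string ' mem_kb="512"' (note the
+// leading space).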
+
+// End XmlUnitTestResultPrinter
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Checks if str contains '=', '&', '%' or '\n' characters. If yes,
+// replaces them by "%xx" where xx is their hexadecimal value. For
+// example, replaces "=" with "%3D".  This algorithm is O(strlen(str))
+// in both time and space -- important as the input str may contain an
+// arbitrarily long test failure message and stack trace.
+string StreamingListener::UrlEncode(const char* str) {
+  string result;
+  result.reserve(strlen(str) + 1);
+  for (char ch = *str; ch != '\0'; ch = *++str) {
+    switch (ch) {
+      case '%':
+      case '=':
+      case '&':
+      case '\n':
+        result.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
+        break;
+      default:
+        result.push_back(ch);
+        break;
+    }
+  }
+  return result;
+}
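+// For example, UrlEncode("a=b&c\n") would return "a%3Db%26c%0A".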
+
+void StreamingListener::SocketWriter::MakeConnection() {
+  GTEST_CHECK_(sockfd_ == -1)
+      << "MakeConnection() can't be called when there is already a connection.";
+
+  addrinfo hints;
+  memset(&hints, 0, sizeof(hints));
+  hints.ai_family = AF_UNSPEC;    // To allow both IPv4 and IPv6 addresses.
+  hints.ai_socktype = SOCK_STREAM;
+  addrinfo* servinfo = NULL;
+
+  // Use getaddrinfo() to get a linked list of IP addresses for
+  // the given host name.
+  const int error_num = getaddrinfo(
+      host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+  if (error_num != 0) {
+    GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
+                        << gai_strerror(error_num);
+  }
+
+  // Loop through all the results and connect to the first we can.
+  for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL;
+       cur_addr = cur_addr->ai_next) {
+    sockfd_ = socket(
+        cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
+    if (sockfd_ != -1) {
+      // Connect the client socket to the server socket.
+      if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
+        close(sockfd_);
+        sockfd_ = -1;
+      }
+    }
+  }
+
+  freeaddrinfo(servinfo);  // all done with this structure
+
+  if (sockfd_ == -1) {
+    GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to "
+                        << host_name_ << ":" << port_num_;
+  }
+}
+
+// End of class StreamingListener
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+// Class ScopedTrace
+
+// Pushes the given source file location and message onto a per-thread
+// trace stack maintained by Google Test.
+ScopedTrace::ScopedTrace(const char* file, int line, const Message& message)
+    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+  TraceInfo trace;
+  trace.file = file;
+  trace.line = line;
+  trace.message = message.GetString();
+
+  UnitTest::GetInstance()->PushGTestTrace(trace);
+}
+
+// Pops the info pushed by the c'tor.
+ScopedTrace::~ScopedTrace()
+    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+  UnitTest::GetInstance()->PopGTestTrace();
+}
+
+
+// class OsStackTraceGetter
+
+// Returns the current OS stack trace as an std::string.  Parameters:
+//
+//   max_depth  - the maximum number of stack frames to be included
+//                in the trace.
+//   skip_count - the number of top frames to be skipped; doesn't count
+//                against max_depth.
+//
+string OsStackTraceGetter::CurrentStackTrace(int /* max_depth */,
+                                             int /* skip_count */)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  return "";
+}
+
+void OsStackTraceGetter::UponLeavingGTest()
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+}
+
+const char* const
+OsStackTraceGetter::kElidedFramesMarker =
+    "... " GTEST_NAME_ " internal frames ...";
+
+// A helper class that creates the premature-exit file in its
+// constructor and deletes the file in its destructor.
+class ScopedPrematureExitFile {
+ public:
+  explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
+      : premature_exit_filepath_(premature_exit_filepath) {
+    // If a path to the premature-exit file is specified...
+    if (premature_exit_filepath != NULL && *premature_exit_filepath != '\0') {
+      // create the file with a single "0" character in it.  I/O
+      // errors are ignored as there's nothing better we can do and we
+      // don't want to fail the test because of this.
+      FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
+      if (pfile != NULL) {  // A failed open is one of the ignored I/O errors.
+        fwrite("0", 1, 1, pfile);
+        fclose(pfile);
+      }
+    }
+  }
+
+  ~ScopedPrematureExitFile() {
+    if (premature_exit_filepath_ != NULL && *premature_exit_filepath_ != '\0') {
+      remove(premature_exit_filepath_);
+    }
+  }
+
+ private:
+  const char* const premature_exit_filepath_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile);
+};
+
+}  // namespace internal
+
+// class TestEventListeners
+
+TestEventListeners::TestEventListeners()
+    : repeater_(new internal::TestEventRepeater()),
+      default_result_printer_(NULL),
+      default_xml_generator_(NULL) {
+}
+
+TestEventListeners::~TestEventListeners() { delete repeater_; }
+
+// Adds a listener to the end of the list.  Google Test takes ownership of
+// the listener (i.e. it will delete the listener when the test program
+// finishes), unless the listener is later removed with Release.
+void TestEventListeners::Append(TestEventListener* listener) {
+  repeater_->Append(listener);
+}
+
+// Removes the given event listener from the list and returns it.  It then
+// becomes the caller's responsibility to delete the listener. Returns
+// NULL if the listener is not found in the list.
+TestEventListener* TestEventListeners::Release(TestEventListener* listener) {
+  if (listener == default_result_printer_)
+    default_result_printer_ = NULL;
+  else if (listener == default_xml_generator_)
+    default_xml_generator_ = NULL;
+  return repeater_->Release(listener);
+}
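+// A typical usage sketch (illustrative; MyCustomPrinter stands for a
+// hypothetical TestEventListener subclass):
+//
+//   TestEventListeners& listeners = UnitTest::GetInstance()->listeners();
+//   delete listeners.Release(listeners.default_result_printer());
+//   listeners.Append(new MyCustomPrinter);
+//
+// After Release, deleting the returned listener is the caller's
+// responsibility, as described above.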
+
+// Returns repeater that broadcasts the TestEventListener events to all
+// subscribers.
+TestEventListener* TestEventListeners::repeater() { return repeater_; }
+
+// Sets the default_result_printer attribute to the provided listener.
+// The listener is also added to the listener list and previous
+// default_result_printer is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) {
+  if (default_result_printer_ != listener) {
+    // It is an error to pass this method a listener that is already in the
+    // list.
+    delete Release(default_result_printer_);
+    default_result_printer_ = listener;
+    if (listener != NULL)
+      Append(listener);
+  }
+}
+
+// Sets the default_xml_generator attribute to the provided listener.  The
+// listener is also added to the listener list and previous
+// default_xml_generator is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) {
+  if (default_xml_generator_ != listener) {
+    // It is an error to pass this method a listener that is already in the
+    // list.
+    delete Release(default_xml_generator_);
+    default_xml_generator_ = listener;
+    if (listener != NULL)
+      Append(listener);
+  }
+}
+
+// Controls whether events will be forwarded by the repeater to the
+// listeners in the list.
+bool TestEventListeners::EventForwardingEnabled() const {
+  return repeater_->forwarding_enabled();
+}
+
+void TestEventListeners::SuppressEventForwarding() {
+  repeater_->set_forwarding_enabled(false);
+}
+
+// class UnitTest
+
+// Gets the singleton UnitTest object.  The first time this method is
+// called, a UnitTest object is constructed and returned.  Consecutive
+// calls will return the same object.
+//
+// We don't protect this under mutex_ as a user is not supposed to
+// call this before main() starts, from which point on the return
+// value will never change.
+UnitTest* UnitTest::GetInstance() {
+  // When compiled with MSVC 7.1 in optimized mode, destroying the
+  // UnitTest object upon exiting the program messes up the exit code,
+  // causing successful tests to appear failed.  We have to use a
+  // different implementation in this case to bypass the compiler bug.
+  // This implementation makes the compiler happy, at the cost of
+  // leaking the UnitTest object.
+
+  // CodeGear C++Builder insists on a public destructor for the
+  // default implementation.  Use this implementation to keep good OO
+  // design with private destructor.
+
+#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
+  static UnitTest* const instance = new UnitTest;
+  return instance;
+#else
+  static UnitTest instance;
+  return &instance;
+#endif  // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
+}
+
+// Gets the number of successful test cases.
+int UnitTest::successful_test_case_count() const {
+  return impl()->successful_test_case_count();
+}
+
+// Gets the number of failed test cases.
+int UnitTest::failed_test_case_count() const {
+  return impl()->failed_test_case_count();
+}
+
+// Gets the number of all test cases.
+int UnitTest::total_test_case_count() const {
+  return impl()->total_test_case_count();
+}
+
+// Gets the number of all test cases that contain at least one test
+// that should run.
+int UnitTest::test_case_to_run_count() const {
+  return impl()->test_case_to_run_count();
+}
+
+// Gets the number of successful tests.
+int UnitTest::successful_test_count() const {
+  return impl()->successful_test_count();
+}
+
+// Gets the number of failed tests.
+int UnitTest::failed_test_count() const { return impl()->failed_test_count(); }
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int UnitTest::reportable_disabled_test_count() const {
+  return impl()->reportable_disabled_test_count();
+}
+
+// Gets the number of disabled tests.
+int UnitTest::disabled_test_count() const {
+  return impl()->disabled_test_count();
+}
+
+// Gets the number of tests to be printed in the XML report.
+int UnitTest::reportable_test_count() const {
+  return impl()->reportable_test_count();
+}
+
+// Gets the number of all tests.
+int UnitTest::total_test_count() const { return impl()->total_test_count(); }
+
+// Gets the number of tests that should run.
+int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
+
+// Gets the time of the test program start, in ms from the start of the
+// UNIX epoch.
+internal::TimeInMillis UnitTest::start_timestamp() const {
+  return impl()->start_timestamp();
+}
+
+// Gets the elapsed time, in milliseconds.
+internal::TimeInMillis UnitTest::elapsed_time() const {
+  return impl()->elapsed_time();
+}
+
+// Returns true iff the unit test passed (i.e. all test cases passed).
+bool UnitTest::Passed() const { return impl()->Passed(); }
+
+// Returns true iff the unit test failed (i.e. some test case failed
+// or something outside of all tests failed).
+bool UnitTest::Failed() const { return impl()->Failed(); }
+
+// Gets the i-th test case among all the test cases. i can range from 0 to
+// total_test_case_count() - 1. If i is not in that range, returns NULL.
+const TestCase* UnitTest::GetTestCase(int i) const {
+  return impl()->GetTestCase(i);
+}
+
+// Returns the TestResult containing information on test failures and
+// properties logged outside of individual test cases.
+const TestResult& UnitTest::ad_hoc_test_result() const {
+  return *impl()->ad_hoc_test_result();
+}
+
+// Gets the i-th test case among all the test cases. i can range from 0 to
+// total_test_case_count() - 1. If i is not in that range, returns NULL.
+TestCase* UnitTest::GetMutableTestCase(int i) {
+  return impl()->GetMutableTestCase(i);
+}
+
+// Returns the list of event listeners that can be used to track events
+// inside Google Test.
+TestEventListeners& UnitTest::listeners() {
+  return *impl()->listeners();
+}
+
+// Registers and returns a global test environment.  When a test
+// program is run, all global test environments will be set-up in the
+// order they were registered.  After all tests in the program have
+// finished, all global test environments will be torn-down in the
+// *reverse* order they were registered.
+//
+// The UnitTest object takes ownership of the given environment.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+Environment* UnitTest::AddEnvironment(Environment* env) {
+  if (env == NULL) {
+    return NULL;
+  }
+
+  impl_->environments().push_back(env);
+  return env;
+}
+
+// Adds a TestPartResult to the current TestResult object.  All Google Test
+// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
+// this to report their results.  The user code should use the
+// assertion macros instead of calling this directly.
+void UnitTest::AddTestPartResult(
+    TestPartResult::Type result_type,
+    const char* file_name,
+    int line_number,
+    const std::string& message,
+    const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
+  Message msg;
+  msg << message;
+
+  internal::MutexLock lock(&mutex_);
+  if (impl_->gtest_trace_stack().size() > 0) {
+    msg << "\n" << GTEST_NAME_ << " trace:";
+
+    for (int i = static_cast<int>(impl_->gtest_trace_stack().size());
+         i > 0; --i) {
+      const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
+      msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
+          << " " << trace.message;
+    }
+  }
+
+  if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) {
+    msg << internal::kStackTraceMarker << os_stack_trace;
+  }
+
+  const TestPartResult result =
+    TestPartResult(result_type, file_name, line_number,
+                   msg.GetString().c_str());
+  impl_->GetTestPartResultReporterForCurrentThread()->
+      ReportTestPartResult(result);
+
+  if (result_type != TestPartResult::kSuccess) {
+    // gtest_break_on_failure takes precedence over
+    // gtest_throw_on_failure.  This allows a user to set the latter
+    // in the code (perhaps in order to use Google Test assertions
+    // with another testing framework) and specify the former on the
+    // command line for debugging.
+    if (GTEST_FLAG(break_on_failure)) {
+#if GTEST_OS_WINDOWS
+      // Using DebugBreak on Windows allows gtest to still break into a debugger
+      // when a failure happens and both the --gtest_break_on_failure and
+      // the --gtest_catch_exceptions flags are specified.
+      DebugBreak();
+#else
+      // Dereference NULL through a volatile pointer to prevent the compiler
+      // from removing it.  We use this rather than abort() or __builtin_trap() for
+      // portability: Symbian doesn't implement abort() well, and some debuggers
+      // don't correctly trap abort().
+      *static_cast<volatile int*>(NULL) = 1;
+#endif  // GTEST_OS_WINDOWS
+    } else if (GTEST_FLAG(throw_on_failure)) {
+#if GTEST_HAS_EXCEPTIONS
+      throw internal::GoogleTestFailureException(result);
+#else
+      // We cannot call abort() as it generates a pop-up in debug mode
+      // that cannot be suppressed in VC 7.1 or below.
+      exit(1);
+#endif
+    }
+  }
+}
+
+// Adds a TestProperty to the current TestResult object when invoked from
+// inside a test, to current TestCase's ad_hoc_test_result_ when invoked
+// from SetUpTestCase or TearDownTestCase, or to the global property set
+// when invoked elsewhere.  If the result already contains a property with
+// the same key, the value will be updated.
+void UnitTest::RecordProperty(const std::string& key,
+                              const std::string& value) {
+  impl_->RecordProperty(TestProperty(key, value));
+}
+
+// Runs all tests in this UnitTest object and prints the result.
+// Returns 0 if successful, or 1 otherwise.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+int UnitTest::Run() {
+  const bool in_death_test_child_process =
+      internal::GTEST_FLAG(internal_run_death_test).length() > 0;
+
+  // Google Test implements this protocol for catching that a test
+  // program exits before returning control to Google Test:
+  //
+  //   1. Upon start, Google Test creates a file whose absolute path
+  //      is specified by the environment variable
+  //      TEST_PREMATURE_EXIT_FILE.
+  //   2. When Google Test has finished its work, it deletes the file.
+  //
+  // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before
+  // running a Google-Test-based test program and check the existence
+  // of the file at the end of the test execution to see if it has
+  // exited prematurely.
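+  //
+  // For instance (illustrative), a runner could export
+  // TEST_PREMATURE_EXIT_FILE=/tmp/premature_exit before launching the test
+  // binary and treat the continued existence of that file after the binary
+  // exits as a premature termination.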
+
+  // If we are in the child process of a death test, don't
+  // create/delete the premature exit file, as doing so is unnecessary
+  // and will confuse the parent process.  Otherwise, create/delete
+  // the file upon entering/leaving this function.  If the program
+  // somehow exits before this function has a chance to return, the
+  // premature-exit file will be left undeleted, causing a test runner
+  // that understands the premature-exit-file protocol to report the
+  // test as having failed.
+  const internal::ScopedPrematureExitFile premature_exit_file(
+      in_death_test_child_process ?
+      NULL : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE"));
+
+  // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
+  // used for the duration of the program.
+  impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
+
+#if GTEST_HAS_SEH
+  // Either the user wants Google Test to catch exceptions thrown by the
+  // tests or this is executing in the context of death test child
+  // process. In either case the user does not want to see pop-up dialogs
+  // about crashes - they are expected.
+  if (impl()->catch_exceptions() || in_death_test_child_process) {
+# if !GTEST_OS_WINDOWS_MOBILE
+    // SetErrorMode doesn't exist on CE.
+    SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
+                 SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
+# endif  // !GTEST_OS_WINDOWS_MOBILE
+
+# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+    // Death test children can be terminated with _abort().  On Windows,
+    // _abort() can show a dialog with a warning message.  This forces the
+    // abort message to go to stderr instead.
+    _set_error_mode(_OUT_TO_STDERR);
+# endif
+
+# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+    // In the debug version, Visual Studio pops up a separate dialog
+    // offering a choice to debug the aborted program. We need to suppress
+    // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
+    // executed. Google Test will notify the user of any unexpected
+    // failure via stderr.
+    //
+    // VC++ doesn't define _set_abort_behavior() prior to the version 8.0.
+    // Users of prior VC versions shall suffer the agony and pain of
+    // clicking through the countless debug dialogs.
+    // TODO(vladl@google.com): find a way to suppress the abort dialog in the
+    // debug mode when compiled with VC 7.1 or lower.
+    if (!GTEST_FLAG(break_on_failure))
+      _set_abort_behavior(
+          0x0,                                    // Clear the following flags:
+          _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
+# endif
+  }
+#endif  // GTEST_HAS_SEH
+
+  return internal::HandleExceptionsInMethodIfSupported(
+      impl(),
+      &internal::UnitTestImpl::RunAllTests,
+      "auxiliary test code (environments or event listeners)") ? 0 : 1;
+}
+
+// Returns the working directory when the first TEST() or TEST_F() was
+// executed.
+const char* UnitTest::original_working_dir() const {
+  return impl_->original_working_dir_.c_str();
+}
+
+// Returns the TestCase object for the test that's currently running,
+// or NULL if no test is running.
+const TestCase* UnitTest::current_test_case() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_case();
+}
+
+// Returns the TestInfo object for the test that's currently running,
+// or NULL if no test is running.
+const TestInfo* UnitTest::current_test_info() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_info();
+}
+
+// Returns the random seed used at the start of the current test run.
+int UnitTest::random_seed() const { return impl_->random_seed(); }
+
+#if GTEST_HAS_PARAM_TEST
+// Returns ParameterizedTestCaseRegistry object used to keep track of
+// value-parameterized tests and instantiate and register them.
+internal::ParameterizedTestCaseRegistry&
+    UnitTest::parameterized_test_registry()
+        GTEST_LOCK_EXCLUDED_(mutex_) {
+  return impl_->parameterized_test_registry();
+}
+#endif  // GTEST_HAS_PARAM_TEST
+
+// Creates an empty UnitTest.
+UnitTest::UnitTest() {
+  impl_ = new internal::UnitTestImpl(this);
+}
+
+// Destructor of UnitTest.
+UnitTest::~UnitTest() {
+  delete impl_;
+}
+
+// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+// Google Test trace stack.
+void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().push_back(trace);
+}
+
+// Pops a trace from the per-thread Google Test trace stack.
+void UnitTest::PopGTestTrace()
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().pop_back();
+}
+
+namespace internal {
+
+UnitTestImpl::UnitTestImpl(UnitTest* parent)
+    : parent_(parent),
+#ifdef _MSC_VER
+# pragma warning(push)                    // Saves the current warning state.
+# pragma warning(disable:4355)            // Temporarily disables warning 4355
+                                         // (using this in initializer).
+      default_global_test_part_result_reporter_(this),
+      default_per_thread_test_part_result_reporter_(this),
+# pragma warning(pop)                     // Restores the warning state again.
+#else
+      default_global_test_part_result_reporter_(this),
+      default_per_thread_test_part_result_reporter_(this),
+#endif  // _MSC_VER
+      global_test_part_result_repoter_(
+          &default_global_test_part_result_reporter_),
+      per_thread_test_part_result_reporter_(
+          &default_per_thread_test_part_result_reporter_),
+#if GTEST_HAS_PARAM_TEST
+      parameterized_test_registry_(),
+      parameterized_tests_registered_(false),
+#endif  // GTEST_HAS_PARAM_TEST
+      last_death_test_case_(-1),
+      current_test_case_(NULL),
+      current_test_info_(NULL),
+      ad_hoc_test_result_(),
+      os_stack_trace_getter_(NULL),
+      post_flag_parse_init_performed_(false),
+      random_seed_(0),  // Will be overridden by the flag before first use.
+      random_(0),  // Will be reseeded before first use.
+      start_timestamp_(0),
+      elapsed_time_(0),
+#if GTEST_HAS_DEATH_TEST
+      death_test_factory_(new DefaultDeathTestFactory),
+#endif
+      // Will be overridden by the flag before first use.
+      catch_exceptions_(false) {
+  listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
+}
+
+UnitTestImpl::~UnitTestImpl() {
+  // Deletes every TestCase.
+  ForEach(test_cases_, internal::Delete<TestCase>);
+
+  // Deletes every Environment.
+  ForEach(environments_, internal::Delete<Environment>);
+
+  delete os_stack_trace_getter_;
+}
+
+// Adds a TestProperty to the current TestResult object when invoked in the
+// context of a test, to the current test case's ad_hoc_test_result when
+// invoked from SetUpTestCase/TearDownTestCase, or to the global property set
+// otherwise.  If the result already contains a property with the same key,
+// the value will be updated.
+void UnitTestImpl::RecordProperty(const TestProperty& test_property) {
+  std::string xml_element;
+  TestResult* test_result;  // TestResult appropriate for property recording.
+
+  if (current_test_info_ != NULL) {
+    xml_element = "testcase";
+    test_result = &(current_test_info_->result_);
+  } else if (current_test_case_ != NULL) {
+    xml_element = "testsuite";
+    test_result = &(current_test_case_->ad_hoc_test_result_);
+  } else {
+    xml_element = "testsuites";
+    test_result = &ad_hoc_test_result_;
+  }
+  test_result->RecordProperty(xml_element, test_property);
+}
+
+#if GTEST_HAS_DEATH_TEST
+// Disables event forwarding if the control is currently in a death test
+// subprocess. Must not be called before InitGoogleTest.
+void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
+  if (internal_run_death_test_flag_.get() != NULL)
+    listeners()->SuppressEventForwarding();
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Initializes event listeners performing XML output as specified by
+// UnitTestOptions. Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureXmlOutput() {
+  const std::string& output_format = UnitTestOptions::GetOutputFormat();
+  if (output_format == "xml") {
+    listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
+  } else if (output_format != "") {
+    printf("WARNING: unrecognized output format \"%s\" ignored.\n",
+           output_format.c_str());
+    fflush(stdout);
+  }
+}
+
+#if GTEST_CAN_STREAM_RESULTS_
+// Initializes event listeners for streaming test results in string form.
+// Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureStreamingOutput() {
+  const std::string& target = GTEST_FLAG(stream_result_to);
+  if (!target.empty()) {
+    const size_t pos = target.find(':');
+    if (pos != std::string::npos) {
+      listeners()->Append(new StreamingListener(target.substr(0, pos),
+                                                target.substr(pos+1)));
+    } else {
+      printf("WARNING: unrecognized streaming target \"%s\" ignored.\n",
+             target.c_str());
+      fflush(stdout);
+    }
+  }
+}
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+// Performs initialization dependent upon flag values obtained in
+// ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+// ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+// this function is also called from RunAllTests.  Since this function can be
+// called more than once, it has to be idempotent.
+void UnitTestImpl::PostFlagParsingInit() {
+  // Ensures that this function does not execute more than once.
+  if (!post_flag_parse_init_performed_) {
+    post_flag_parse_init_performed_ = true;
+
+#if GTEST_HAS_DEATH_TEST
+    InitDeathTestSubprocessControlInfo();
+    SuppressTestEventsIfInSubprocess();
+#endif  // GTEST_HAS_DEATH_TEST
+
+    // Registers parameterized tests. This makes parameterized tests
+    // available to the UnitTest reflection API without running
+    // RUN_ALL_TESTS.
+    RegisterParameterizedTests();
+
+    // Configures listeners for XML output. This makes it possible for users
+    // to shut down the default XML output before invoking RUN_ALL_TESTS.
+    ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+    // Configures listeners for streaming test results to the specified server.
+    ConfigureStreamingOutput();
+#endif  // GTEST_CAN_STREAM_RESULTS_
+  }
+}
+
+// A predicate that checks the name of a TestCase against a known
+// value.
+//
+// This is used for implementation of the UnitTest class only.  We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestCaseNameIs is copyable.
+class TestCaseNameIs {
+ public:
+  // Constructor.
+  explicit TestCaseNameIs(const std::string& name)
+      : name_(name) {}
+
+  // Returns true iff the name of test_case matches name_.
+  bool operator()(const TestCase* test_case) const {
+    return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0;
+  }
+
+ private:
+  std::string name_;
+};
+
+// Finds and returns a TestCase with the given name.  If one doesn't
+// exist, creates one and returns it.  It's the CALLER'S
+// RESPONSIBILITY to ensure that this function is only called WHEN THE
+// TESTS ARE NOT SHUFFLED.
+//
+// Arguments:
+//
+//   test_case_name: name of the test case
+//   type_param:     the name of the test case's type parameter, or NULL if
+//                   this is not a typed or a type-parameterized test case.
+//   set_up_tc:      pointer to the function that sets up the test case
+//   tear_down_tc:   pointer to the function that tears down the test case
+TestCase* UnitTestImpl::GetTestCase(const char* test_case_name,
+                                    const char* type_param,
+                                    Test::SetUpTestCaseFunc set_up_tc,
+                                    Test::TearDownTestCaseFunc tear_down_tc) {
+  // Can we find a TestCase with the given name?
+  const std::vector<TestCase*>::const_iterator test_case =
+      std::find_if(test_cases_.begin(), test_cases_.end(),
+                   TestCaseNameIs(test_case_name));
+
+  if (test_case != test_cases_.end())
+    return *test_case;
+
+  // No.  Let's create one.
+  TestCase* const new_test_case =
+      new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc);
+
+  // Is this a death test case?
+  if (internal::UnitTestOptions::MatchesFilter(test_case_name,
+                                               kDeathTestCaseFilter)) {
+    // Yes.  Inserts the test case after the last death test case
+    // defined so far.  This only works when the test cases haven't
+    // been shuffled.  Otherwise we may end up running a death test
+    // after a non-death test.
+    ++last_death_test_case_;
+    test_cases_.insert(test_cases_.begin() + last_death_test_case_,
+                       new_test_case);
+  } else {
+    // No.  Appends to the end of the list.
+    test_cases_.push_back(new_test_case);
+  }
+
+  test_case_indices_.push_back(static_cast<int>(test_case_indices_.size()));
+  return new_test_case;
+}
+
+// Helpers for setting up / tearing down the given environment.  They
+// are for use in the ForEach() function.
+static void SetUpEnvironment(Environment* env) { env->SetUp(); }
+static void TearDownEnvironment(Environment* env) { env->TearDown(); }
+
+// Runs all tests in this UnitTest object, prints the result, and
+// returns true if all tests are successful.  If any exception is
+// thrown during a test, the test is considered to be failed, but the
+// rest of the tests will still be run.
+//
+// When parameterized tests are enabled, it expands and registers
+// parameterized tests first in RegisterParameterizedTests().
+// All other functions called from RunAllTests() may safely assume that
+// parameterized tests are ready to be counted and run.
+bool UnitTestImpl::RunAllTests() {
+  // Makes sure InitGoogleTest() was called.
+  if (!GTestIsInitialized()) {
+    printf("%s",
+           "\nThis test program did NOT call ::testing::InitGoogleTest "
+           "before calling RUN_ALL_TESTS().  Please fix it.\n");
+    return false;
+  }
+
+  // Do not run any test if the --help flag was specified.
+  if (g_help_flag)
+    return true;
+
+  // Repeats the call to the post-flag parsing initialization in case the
+  // user didn't call InitGoogleTest.
+  PostFlagParsingInit();
+
+  // Even if sharding is not on, test runners may want to use the
+  // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding
+  // protocol.
+  internal::WriteToShardStatusFileIfNeeded();
+
+  // True iff we are in a subprocess for running a thread-safe-style
+  // death test.
+  bool in_subprocess_for_death_test = false;
+
+#if GTEST_HAS_DEATH_TEST
+  in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL);
+#endif  // GTEST_HAS_DEATH_TEST
+
+  const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
+                                        in_subprocess_for_death_test);
+
+  // Compares the full test names with the filter to decide which
+  // tests to run.
+  const bool has_tests_to_run = FilterTests(should_shard
+                                              ? HONOR_SHARDING_PROTOCOL
+                                              : IGNORE_SHARDING_PROTOCOL) > 0;
+
+  // Lists the tests and exits if the --gtest_list_tests flag was specified.
+  if (GTEST_FLAG(list_tests)) {
+    // This must be called *after* FilterTests() has been called.
+    ListTestsMatchingFilter();
+    return true;
+  }
+
+  random_seed_ = GTEST_FLAG(shuffle) ?
+      GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
+
+  // True iff at least one test has failed.
+  bool failed = false;
+
+  TestEventListener* repeater = listeners()->repeater();
+
+  start_timestamp_ = GetTimeInMillis();
+  repeater->OnTestProgramStart(*parent_);
+
+  // How many times to repeat the tests?  We don't want to repeat them
+  // when we are inside the subprocess of a death test.
+  const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
+  // Repeats forever if the repeat count is negative.
+  const bool forever = repeat < 0;
+  for (int i = 0; forever || i != repeat; i++) {
+    // We want to preserve failures generated by ad-hoc test
+    // assertions executed before RUN_ALL_TESTS().
+    ClearNonAdHocTestResult();
+
+    const TimeInMillis start = GetTimeInMillis();
+
+    // Shuffles test cases and tests if requested.
+    if (has_tests_to_run && GTEST_FLAG(shuffle)) {
+      random()->Reseed(random_seed_);
+      // This should be done before calling OnTestIterationStart(),
+      // such that a test event listener can see the actual test order
+      // in the event.
+      ShuffleTests();
+    }
+
+    // Tells the unit test event listeners that the tests are about to start.
+    repeater->OnTestIterationStart(*parent_, i);
+
+    // Runs each test case if there is at least one test to run.
+    if (has_tests_to_run) {
+      // Sets up all environments beforehand.
+      repeater->OnEnvironmentsSetUpStart(*parent_);
+      ForEach(environments_, SetUpEnvironment);
+      repeater->OnEnvironmentsSetUpEnd(*parent_);
+
+      // Runs the tests only if there was no fatal failure during global
+      // set-up.
+      if (!Test::HasFatalFailure()) {
+        for (int test_index = 0; test_index < total_test_case_count();
+             test_index++) {
+          GetMutableTestCase(test_index)->Run();
+        }
+      }
+
+      // Tears down all environments in reverse order afterwards.
+      repeater->OnEnvironmentsTearDownStart(*parent_);
+      std::for_each(environments_.rbegin(), environments_.rend(),
+                    TearDownEnvironment);
+      repeater->OnEnvironmentsTearDownEnd(*parent_);
+    }
+
+    elapsed_time_ = GetTimeInMillis() - start;
+
+    // Tells the unit test event listener that the tests have just finished.
+    repeater->OnTestIterationEnd(*parent_, i);
+
+    // Gets the result and clears it.
+    if (!Passed()) {
+      failed = true;
+    }
+
+    // Restores the original test order after the iteration.  This
+    // allows the user to quickly repro a failure that happens in the
+    // N-th iteration without repeating the first (N - 1) iterations.
+    // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in
+    // case the user somehow changes the value of the flag somewhere
+    // (it's always safe to unshuffle the tests).
+    UnshuffleTests();
+
+    if (GTEST_FLAG(shuffle)) {
+      // Picks a new random seed for each iteration.
+      random_seed_ = GetNextRandomSeed(random_seed_);
+    }
+  }
+
+  repeater->OnTestProgramEnd(*parent_);
+
+  return !failed;
+}
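+
+// Illustrative usage note (not part of Google Test): the repeat/shuffle
+// behavior above is driven entirely by flags, e.g.
+//
+//   ./my_test --gtest_repeat=3 --gtest_shuffle --gtest_random_seed=7
+//
+// runs the whole suite three times, reshuffling the test order before each
+// iteration (a negative --gtest_repeat count repeats forever).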
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded() {
+  const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile);
+  if (test_shard_file != NULL) {
+    FILE* const file = posix::FOpen(test_shard_file, "w");
+    if (file == NULL) {
+      ColoredPrintf(COLOR_RED,
+                    "Could not write to the test shard status file \"%s\" "
+                    "specified by the %s environment variable.\n",
+                    test_shard_file, kTestShardStatusFile);
+      fflush(stdout);
+      exit(EXIT_FAILURE);
+    }
+    fclose(file);
+  }
+}
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (i.e., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+bool ShouldShard(const char* total_shards_env,
+                 const char* shard_index_env,
+                 bool in_subprocess_for_death_test) {
+  if (in_subprocess_for_death_test) {
+    return false;
+  }
+
+  const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1);
+  const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1);
+
+  if (total_shards == -1 && shard_index == -1) {
+    return false;
+  } else if (total_shards == -1 && shard_index != -1) {
+    const Message msg = Message()
+      << "Invalid environment variables: you have "
+      << kTestShardIndex << " = " << shard_index
+      << ", but have left " << kTestTotalShards << " unset.\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (total_shards != -1 && shard_index == -1) {
+    const Message msg = Message()
+      << "Invalid environment variables: you have "
+      << kTestTotalShards << " = " << total_shards
+      << ", but have left " << kTestShardIndex << " unset.\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (shard_index < 0 || shard_index >= total_shards) {
+    const Message msg = Message()
+      << "Invalid environment variables: we require 0 <= "
+      << kTestShardIndex << " < " << kTestTotalShards
+      << ", but you have " << kTestShardIndex << "=" << shard_index
+      << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  }
+
+  return total_shards > 1;
+}
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error
+// and aborts.
+Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) {
+  const char* str_val = posix::GetEnv(var);
+  if (str_val == NULL) {
+    return default_val;
+  }
+
+  Int32 result;
+  if (!ParseInt32(Message() << "The value of environment variable " << var,
+                  str_val, &result)) {
+    exit(EXIT_FAILURE);
+  }
+  return result;
+}
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
+  return (test_id % total_shards) == shard_index;
+}
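+
+// Illustrative sketch (not part of Google Test): with GTEST_TOTAL_SHARDS=3
+// and GTEST_SHARD_INDEX=1, tests are distributed round-robin by their
+// running id, so this shard runs test ids 1, 4, 7, ...:
+//
+//   ShouldRunTestOnShard(3, 1, 0);  // false, 0 % 3 == 0
+//   ShouldRunTestOnShard(3, 1, 1);  // true,  1 % 3 == 1
+//   ShouldRunTestOnShard(3, 1, 4);  // true,  4 % 3 == 1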
+
+// Compares the name of each test with the user-specified filter to
+// decide whether the test should be run, then records the result in
+// each TestCase and TestInfo object.
+// If shard_tests == true, further filters tests based on sharding
+// variables in the environment - see
+// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide.
+// Returns the number of tests that should run.
+int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
+  const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
+      Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
+  const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
+      Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
+
+  // num_runnable_tests is the number of tests that will
+  // run across all shards (i.e., match filter and are not disabled).
+  // num_selected_tests is the number of tests to be run on
+  // this shard.
+  int num_runnable_tests = 0;
+  int num_selected_tests = 0;
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    TestCase* const test_case = test_cases_[i];
+    const std::string &test_case_name = test_case->name();
+    test_case->set_should_run(false);
+
+    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
+      TestInfo* const test_info = test_case->test_info_list()[j];
+      const std::string test_name(test_info->name());
+      // A test is disabled if test case name or test name matches
+      // kDisableTestFilter.
+      const bool is_disabled =
+          internal::UnitTestOptions::MatchesFilter(test_case_name,
+                                                   kDisableTestFilter) ||
+          internal::UnitTestOptions::MatchesFilter(test_name,
+                                                   kDisableTestFilter);
+      test_info->is_disabled_ = is_disabled;
+
+      const bool matches_filter =
+          internal::UnitTestOptions::FilterMatchesTest(test_case_name,
+                                                       test_name);
+      test_info->matches_filter_ = matches_filter;
+
+      const bool is_runnable =
+          (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
+          matches_filter;
+
+      const bool is_selected = is_runnable &&
+          (shard_tests == IGNORE_SHARDING_PROTOCOL ||
+           ShouldRunTestOnShard(total_shards, shard_index,
+                                num_runnable_tests));
+
+      num_runnable_tests += is_runnable;
+      num_selected_tests += is_selected;
+
+      test_info->should_run_ = is_selected;
+      test_case->set_should_run(test_case->should_run() || is_selected);
+    }
+  }
+  return num_selected_tests;
+}
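+
+// Illustrative example (not from the Google Test sources): with
+// --gtest_filter=FooTest.*-FooTest.Bar every test in FooTest except
+// FooTest.Bar matches the filter; DISABLED_ tests among them are still
+// skipped unless --gtest_also_run_disabled_tests is also given.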
+
+// Prints the given C-string on a single line by replacing all '\n'
+// characters with string "\\n".  If the output takes more than
+// max_length characters, only prints the first max_length characters
+// and "...".
+static void PrintOnOneLine(const char* str, int max_length) {
+  if (str != NULL) {
+    for (int i = 0; *str != '\0'; ++str) {
+      if (i >= max_length) {
+        printf("...");
+        break;
+      }
+      if (*str == '\n') {
+        printf("\\n");
+        i += 2;
+      } else {
+        printf("%c", *str);
+        ++i;
+      }
+    }
+  }
+}
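+
+// For example (illustrative), PrintOnOneLine("ab\ncd", 100) prints "ab\\ncd"
+// on one line, while PrintOnOneLine("abcdef", 3) prints "abc...".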
+
+// Prints the names of the tests matching the user-specified filter flag.
+void UnitTestImpl::ListTestsMatchingFilter() {
+  // Print at most this many characters for each type/value parameter.
+  const int kMaxParamLength = 250;
+
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    const TestCase* const test_case = test_cases_[i];
+    bool printed_test_case_name = false;
+
+    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
+      const TestInfo* const test_info =
+          test_case->test_info_list()[j];
+      if (test_info->matches_filter_) {
+        if (!printed_test_case_name) {
+          printed_test_case_name = true;
+          printf("%s.", test_case->name());
+          if (test_case->type_param() != NULL) {
+            printf("  # %s = ", kTypeParamLabel);
+            // We print the type parameter on a single line to make
+            // the output easy to parse by a program.
+            PrintOnOneLine(test_case->type_param(), kMaxParamLength);
+          }
+          printf("\n");
+        }
+        printf("  %s", test_info->name());
+        if (test_info->value_param() != NULL) {
+          printf("  # %s = ", kValueParamLabel);
+          // We print the value parameter on a single line to make the
+          // output easy to parse by a program.
+          PrintOnOneLine(test_info->value_param(), kMaxParamLength);
+        }
+        printf("\n");
+      }
+    }
+  }
+  fflush(stdout);
+}
+
+// Sets the OS stack trace getter.
+//
+// Does nothing if the input and the current OS stack trace getter are
+// the same; otherwise, deletes the old getter and makes the input the
+// current getter.
+void UnitTestImpl::set_os_stack_trace_getter(
+    OsStackTraceGetterInterface* getter) {
+  if (os_stack_trace_getter_ != getter) {
+    delete os_stack_trace_getter_;
+    os_stack_trace_getter_ = getter;
+  }
+}
+
+// Returns the current OS stack trace getter if it is not NULL;
+// otherwise, creates an OsStackTraceGetter, makes it the current
+// getter, and returns it.
+OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
+  if (os_stack_trace_getter_ == NULL) {
+    os_stack_trace_getter_ = new OsStackTraceGetter;
+  }
+
+  return os_stack_trace_getter_;
+}
+
+// Returns the TestResult for the test that's currently running, or
+// the TestResult for the ad hoc test if no test is running.
+TestResult* UnitTestImpl::current_test_result() {
+  return current_test_info_ ?
+      &(current_test_info_->result_) : &ad_hoc_test_result_;
+}
+
+// Shuffles all test cases, and the tests within each test case,
+// making sure that death tests are still run first.
+void UnitTestImpl::ShuffleTests() {
+  // Shuffles the death test cases.
+  ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_);
+
+  // Shuffles the non-death test cases.
+  ShuffleRange(random(), last_death_test_case_ + 1,
+               static_cast<int>(test_cases_.size()), &test_case_indices_);
+
+  // Shuffles the tests inside each test case.
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    test_cases_[i]->ShuffleTests(random());
+  }
+}
+
+// Restores the test cases and tests to their order before the first shuffle.
+void UnitTestImpl::UnshuffleTests() {
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    // Unshuffles the tests in each test case.
+    test_cases_[i]->UnshuffleTests();
+    // Resets the index of each test case.
+    test_case_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
+                                            int skip_count) {
+  // We pass skip_count + 1 to skip this wrapper function in addition
+  // to what the user really wants to skip.
+  return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
+}
+
+// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to
+// suppress unreachable code warnings.
+namespace {
+class ClassUniqueToAlwaysTrue {};
+}
+
+bool IsTrue(bool condition) { return condition; }
+
+bool AlwaysTrue() {
+#if GTEST_HAS_EXCEPTIONS
+  // This condition is always false so AlwaysTrue() never actually throws,
+  // but it makes the compiler think that it may throw.
+  if (IsTrue(false))
+    throw ClassUniqueToAlwaysTrue();
+#endif  // GTEST_HAS_EXCEPTIONS
+  return true;
+}
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false.  None of pstr, *pstr, and prefix can be NULL.
+bool SkipPrefix(const char* prefix, const char** pstr) {
+  const size_t prefix_len = strlen(prefix);
+  if (strncmp(*pstr, prefix, prefix_len) == 0) {
+    *pstr += prefix_len;
+    return true;
+  }
+  return false;
+}
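+
+// Example (illustrative, not part of Google Test):
+//
+//   const char* s = "--gtest_color=yes";
+//   SkipPrefix("--", &s);      // returns true; s now points at "gtest_color=yes"
+//   SkipPrefix("gtest_", &s);  // returns true; s now points at "color=yes"
+//   SkipPrefix("xyz", &s);     // returns false; s is unchanged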
+
+// Parses a string as a command line flag.  The string should have
+// the format "--flag=value".  When def_optional is true, the "=value"
+// part can be omitted.
+//
+// Returns the value of the flag, or NULL if the parsing failed.
+const char* ParseFlagValue(const char* str,
+                           const char* flag,
+                           bool def_optional) {
+  // str and flag must not be NULL.
+  if (str == NULL || flag == NULL) return NULL;
+
+  // The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
+  const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag;
+  const size_t flag_len = flag_str.length();
+  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL;
+
+  // Skips the flag name.
+  const char* flag_end = str + flag_len;
+
+  // When def_optional is true, it's OK to not have a "=value" part.
+  if (def_optional && (flag_end[0] == '\0')) {
+    return flag_end;
+  }
+
+  // If def_optional is true and there are more characters after the
+  // flag name, or if def_optional is false, there must be a '=' after
+  // the flag name.
+  if (flag_end[0] != '=') return NULL;
+
+  // Returns the string after "=".
+  return flag_end + 1;
+}
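+
+// Example (illustrative), assuming GTEST_FLAG_PREFIX_ expands to "gtest_":
+//
+//   ParseFlagValue("--gtest_repeat=3", "repeat", false);      // returns "3"
+//   ParseFlagValue("--gtest_list_tests", "list_tests", true); // returns ""
+//   ParseFlagValue("--gtest_repeat", "repeat", false);        // returns NULL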
+
+// Parses a string for a bool flag, in the form of either
+// "--flag=value" or "--flag".
+//
+// In the former case, the value is taken as true as long as it does
+// not start with '0', 'f', or 'F'.
+//
+// In the latter case, the value is taken as true.
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, true);
+
+  // Aborts if the parsing failed.
+  if (value_str == NULL) return false;
+
+  // Converts the string value to a bool.
+  *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
+  return true;
+}
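+
+// Example (illustrative): ParseBoolFlag("--gtest_shuffle", "shuffle", &b)
+// sets b to true, as does "--gtest_shuffle=1" or "=yes"; "--gtest_shuffle=0",
+// "=f", and "=False" all set b to false.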
+
+// Parses a string for an Int32 flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseInt32Flag(const char* str, const char* flag, Int32* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == NULL) return false;
+
+  // Sets *value to the value of the flag.
+  return ParseInt32(Message() << "The value of flag --" << flag,
+                    value_str, value);
+}
+
+// Parses a string for a string flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == NULL) return false;
+
+  // Sets *value to the value of the flag.
+  *value = value_str;
+  return true;
+}
+
+// Determines whether a string has a prefix that Google Test uses for its
+// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_.
+// If Google Test detects that a command line flag has its prefix but is not
+// recognized, it will print its help message. Flags starting with
+// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
+// internal flags and do not trigger the help message.
+static bool HasGoogleTestFlagPrefix(const char* str) {
+  return (SkipPrefix("--", &str) ||
+          SkipPrefix("-", &str) ||
+          SkipPrefix("/", &str)) &&
+         !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
+         (SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
+          SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str));
+}
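+
+// Example (illustrative): "--gtest_foo", "-gtest_foo" and "/gtest_foo" all
+// carry the Google Test prefix (and, being unrecognized, would trigger the
+// help message below), while "--gtest_internal_foo" and "--other_flag" do not.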
+
+// Prints a string containing code-encoded text.  The following escape
+// sequences can be used in the string to control the text color:
+//
+//   @@    prints a single '@' character.
+//   @R    changes the color to red.
+//   @G    changes the color to green.
+//   @Y    changes the color to yellow.
+//   @D    changes to the default terminal text color.
+//
+// TODO(wan@google.com): Write tests for this once we add stdout
+// capturing to Google Test.
+static void PrintColorEncoded(const char* str) {
+  GTestColor color = COLOR_DEFAULT;  // The current color.
+
+  // Conceptually, we split the string into segments divided by escape
+  // sequences.  Then we print one segment at a time.  At the end of
+  // each iteration, the str pointer advances to the beginning of the
+  // next segment.
+  for (;;) {
+    const char* p = strchr(str, '@');
+    if (p == NULL) {
+      ColoredPrintf(color, "%s", str);
+      return;
+    }
+
+    ColoredPrintf(color, "%s", std::string(str, p).c_str());
+
+    const char ch = p[1];
+    str = p + 2;
+    if (ch == '@') {
+      ColoredPrintf(color, "@");
+    } else if (ch == 'D') {
+      color = COLOR_DEFAULT;
+    } else if (ch == 'R') {
+      color = COLOR_RED;
+    } else if (ch == 'G') {
+      color = COLOR_GREEN;
+    } else if (ch == 'Y') {
+      color = COLOR_YELLOW;
+    } else {
+      --str;
+    }
+  }
+}
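+
+// For example (illustrative), PrintColorEncoded("@Gok@D - @Rfail@D\n")
+// prints "ok" in green, " - " in the default color, and "fail" in red,
+// followed by a default-colored newline.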
+
+static const char kColorEncodedHelpMessage[] =
+"This program contains tests written using " GTEST_NAME_ ". You can use the\n"
+"following command line flags to control its behavior:\n"
+"\n"
+"Test Selection:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n"
+"      List the names of all tests instead of running them. The name of\n"
+"      TEST(Foo, Bar) is \"Foo.Bar\".\n"
+"  @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS"
+    "[@G-@YNEGATIVE_PATTERNS]@D\n"
+"      Run only the tests whose name matches one of the positive patterns but\n"
+"      none of the negative patterns. '?' matches any single character; '*'\n"
+"      matches any substring; ':' separates two patterns.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n"
+"      Run all disabled tests too.\n"
+"\n"
+"Test Execution:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n"
+"      Run the tests repeatedly; use a negative count to repeat forever.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n"
+"      Randomize tests' orders on every iteration.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n"
+"      Random number seed to use for shuffling test orders (between 1 and\n"
+"      99999, or 0 to use a seed based on the current time).\n"
+"\n"
+"Test Output:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
+"      Enable/disable colored output. The default is @Gauto@D.\n"
+"  -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n"
+"      Don't print the elapsed time of each test.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G"
+    GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n"
+"      Generate an XML report in the given directory or with the given file\n"
+"      name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n"
+#if GTEST_CAN_STREAM_RESULTS_
+"  @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n"
+"      Stream test results to the given server.\n"
+#endif  // GTEST_CAN_STREAM_RESULTS_
+"\n"
+"Assertion Behavior:\n"
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
+"      Set the default death test style.\n"
+#endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n"
+"      Turn assertion failures into debugger break-points.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n"
+"      Turn assertion failures into C++ exceptions.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n"
+"      Do not report exceptions as test failures. Instead, allow them\n"
+"      to crash the program or throw a pop-up (on Windows).\n"
+"\n"
+"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set "
+    "the corresponding\n"
+"environment variable of a flag (all letters in upper-case). For example, to\n"
+"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_
+    "color=no@D or set\n"
+"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n"
+"\n"
+"For more information, please read the " GTEST_NAME_ " documentation at\n"
+"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
+"(not one in your own code or tests), please report it to\n"
+"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.  The type parameter CharType can be
+// instantiated to either char or wchar_t.
+template <typename CharType>
+void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
+  for (int i = 1; i < *argc; i++) {
+    const std::string arg_string = StreamableToString(argv[i]);
+    const char* const arg = arg_string.c_str();
+
+    using internal::ParseBoolFlag;
+    using internal::ParseInt32Flag;
+    using internal::ParseStringFlag;
+
+    // Do we see a Google Test flag?
+    if (ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
+                      &GTEST_FLAG(also_run_disabled_tests)) ||
+        ParseBoolFlag(arg, kBreakOnFailureFlag,
+                      &GTEST_FLAG(break_on_failure)) ||
+        ParseBoolFlag(arg, kCatchExceptionsFlag,
+                      &GTEST_FLAG(catch_exceptions)) ||
+        ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
+        ParseStringFlag(arg, kDeathTestStyleFlag,
+                        &GTEST_FLAG(death_test_style)) ||
+        ParseBoolFlag(arg, kDeathTestUseFork,
+                      &GTEST_FLAG(death_test_use_fork)) ||
+        ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
+        ParseStringFlag(arg, kInternalRunDeathTestFlag,
+                        &GTEST_FLAG(internal_run_death_test)) ||
+        ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
+        ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
+        ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
+        ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
+        ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
+        ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
+        ParseInt32Flag(arg, kStackTraceDepthFlag,
+                       &GTEST_FLAG(stack_trace_depth)) ||
+        ParseStringFlag(arg, kStreamResultToFlag,
+                        &GTEST_FLAG(stream_result_to)) ||
+        ParseBoolFlag(arg, kThrowOnFailureFlag,
+                      &GTEST_FLAG(throw_on_failure))
+        ) {
+      // Yes.  Shift the remainder of the argv list left by one.  Note
+      // that argv has (*argc + 1) elements, the last one always being
+      // NULL.  The following loop moves the trailing NULL element as
+      // well.
+      for (int j = i; j != *argc; j++) {
+        argv[j] = argv[j + 1];
+      }
+
+      // Decrements the argument count.
+      (*argc)--;
+
+      // We also need to decrement the iterator as we just removed
+      // an element.
+      i--;
+    } else if (arg_string == "--help" || arg_string == "-h" ||
+               arg_string == "-?" || arg_string == "/?" ||
+               HasGoogleTestFlagPrefix(arg)) {
+      // Both the help flag and unrecognized Google Test flags (excluding
+      // internal ones) trigger the help display.
+      g_help_flag = true;
+    }
+  }
+
+  if (g_help_flag) {
+    // We print the help here instead of in RUN_ALL_TESTS(), as the
+    // latter may not be called at all if the user is using Google
+    // Test with another testing framework.
+    PrintColorEncoded(kColorEncodedHelpMessage);
+  }
+}
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+}
+void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+}
+
+// The internal implementation of InitGoogleTest().
+//
+// The type parameter CharType can be instantiated to either char or
+// wchar_t.
+template <typename CharType>
+void InitGoogleTestImpl(int* argc, CharType** argv) {
+  g_init_gtest_count++;
+
+  // We don't want to run the initialization code twice.
+  if (g_init_gtest_count != 1) return;
+
+  if (*argc <= 0) return;
+
+  internal::g_executable_path = internal::StreamableToString(argv[0]);
+
+#if GTEST_HAS_DEATH_TEST
+
+  g_argvs.clear();
+  for (int i = 0; i != *argc; i++) {
+    g_argvs.push_back(StreamableToString(argv[i]));
+  }
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+  ParseGoogleTestFlagsOnly(argc, argv);
+  GetUnitTestImpl()->PostFlagParsingInit();
+}
+
+}  // namespace internal
+
+// Initializes Google Test.  This must be called before calling
+// RUN_ALL_TESTS().  In particular, it parses a command line for the
+// flags that Google Test recognizes.  Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+void InitGoogleTest(int* argc, char** argv) {
+  internal::InitGoogleTestImpl(argc, argv);
+}
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+void InitGoogleTest(int* argc, wchar_t** argv) {
+  internal::InitGoogleTestImpl(argc, argv);
+}
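+
+// Typical usage (illustrative): a test program calls InitGoogleTest() from
+// main() before RUN_ALL_TESTS() so that Google Test flags are consumed and
+// stripped from argv:
+//
+//   int main(int argc, char** argv) {
+//     ::testing::InitGoogleTest(&argc, argv);
+//     return RUN_ALL_TESTS();
+//   }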
+
+}  // namespace testing
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev)
+//
+// This file implements death tests.
+
+
+#if GTEST_HAS_DEATH_TEST
+
+# if GTEST_OS_MAC
+#  include <crt_externs.h>
+# endif  // GTEST_OS_MAC
+
+# include <errno.h>
+# include <fcntl.h>
+# include <limits.h>
+
+# if GTEST_OS_LINUX
+#  include <signal.h>
+# endif  // GTEST_OS_LINUX
+
+# include <stdarg.h>
+
+# if GTEST_OS_WINDOWS
+#  include <windows.h>
+# else
+#  include <sys/mman.h>
+#  include <sys/wait.h>
+# endif  // GTEST_OS_WINDOWS
+
+# if GTEST_OS_QNX
+#  include <spawn.h>
+# endif  // GTEST_OS_QNX
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+
+// Constants.
+
+// The default death test style.
+static const char kDefaultDeathTestStyle[] = "fast";
+
+GTEST_DEFINE_string_(
+    death_test_style,
+    internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
+    "Indicates how to run a death test in a forked child process: "
+    "\"threadsafe\" (child process re-executes the test binary "
+    "from the beginning, running only the specific death test) or "
+    "\"fast\" (child process runs the death test immediately "
+    "after forking).");
+
+GTEST_DEFINE_bool_(
+    death_test_use_fork,
+    internal::BoolFromGTestEnv("death_test_use_fork", false),
+    "Instructs to use fork()/_exit() instead of clone() in death tests. "
+    "Ignored and always uses fork() on POSIX systems where clone() is not "
+    "implemented. Useful when running under valgrind or similar tools if "
+    "those do not support clone(). Valgrind 3.3.1 will just fail if "
+    "it sees an unsupported combination of clone() flags. "
+    "It is not recommended to use this flag w/o valgrind though it will "
+    "work in 99% of the cases. Once valgrind is fixed, this flag will "
+    "most likely be removed.");
+
+namespace internal {
+GTEST_DEFINE_string_(
+    internal_run_death_test, "",
+    "Indicates the file, line number, temporal index of "
+    "the single death test to run, and a file descriptor to "
+    "which a success code may be sent, all separated by "
+    "the '|' characters.  This flag is specified if and only if the current "
+    "process is a sub-process launched for running a thread-safe "
+    "death test.  FOR INTERNAL USE ONLY.");
+}  // namespace internal
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Valid only for fast death tests. Indicates the code is running in the
+// child process of a fast style death test.
+static bool g_in_fast_death_test_child = false;
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process.  Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests.  IMPORTANT: This is an internal utility.  Using it may break the
+// implementation of death tests.  User code MUST NOT use it.
+bool InDeathTestChild() {
+# if GTEST_OS_WINDOWS
+
+  // On Windows, death tests are thread-safe regardless of the value of the
+  // death_test_style flag.
+  return !GTEST_FLAG(internal_run_death_test).empty();
+
+# else
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe")
+    return !GTEST_FLAG(internal_run_death_test).empty();
+  else
+    return g_in_fast_death_test_child;
+# endif  // GTEST_OS_WINDOWS
+}
+
+}  // namespace internal
+
+// ExitedWithCode constructor.
+ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {
+}
+
+// ExitedWithCode function-call operator.
+bool ExitedWithCode::operator()(int exit_status) const {
+# if GTEST_OS_WINDOWS
+
+  return exit_status == exit_code_;
+
+# else
+
+  return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_;
+
+# endif  // GTEST_OS_WINDOWS
+}
+
+# if !GTEST_OS_WINDOWS
+// KilledBySignal constructor.
+KilledBySignal::KilledBySignal(int signum) : signum_(signum) {
+}
+
+// KilledBySignal function-call operator.
+bool KilledBySignal::operator()(int exit_status) const {
+  return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_;
+}
+# endif  // !GTEST_OS_WINDOWS
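+
+// Illustrative usage: these predicates are part of the public API and are
+// normally passed to the death test macros (KilledBySignal is POSIX-only),
+// e.g.
+//
+//   EXPECT_EXIT(exit(0), ::testing::ExitedWithCode(0), "");
+//   EXPECT_EXIT(raise(SIGSEGV), ::testing::KilledBySignal(SIGSEGV), "");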
+
+namespace internal {
+
+// Utilities needed for death tests.
+
+// Generates a textual description of a given exit code, in the format
+// specified by wait(2).
+static std::string ExitSummary(int exit_code) {
+  Message m;
+
+# if GTEST_OS_WINDOWS
+
+  m << "Exited with exit status " << exit_code;
+
+# else
+
+  if (WIFEXITED(exit_code)) {
+    m << "Exited with exit status " << WEXITSTATUS(exit_code);
+  } else if (WIFSIGNALED(exit_code)) {
+    m << "Terminated by signal " << WTERMSIG(exit_code);
+  }
+#  ifdef WCOREDUMP
+  if (WCOREDUMP(exit_code)) {
+    m << " (core dumped)";
+  }
+#  endif
+# endif  // GTEST_OS_WINDOWS
+
+  return m.GetString();
+}
+
+// Returns true if exit_status describes a process that was terminated
+// by a signal, or exited normally with a nonzero exit code.
+bool ExitedUnsuccessfully(int exit_status) {
+  return !ExitedWithCode(0)(exit_status);
+}
+
+# if !GTEST_OS_WINDOWS
+// Generates a textual failure message when a death test finds more than
+// one thread running, or cannot determine the number of threads, prior
+// to executing the given statement.  It is the responsibility of the
+// caller not to pass a thread_count of 1.
+static std::string DeathTestThreadWarning(size_t thread_count) {
+  Message msg;
+  msg << "Death tests use fork(), which is unsafe particularly"
+      << " in a threaded context. For this test, " << GTEST_NAME_ << " ";
+  if (thread_count == 0)
+    msg << "couldn't detect the number of threads.";
+  else
+    msg << "detected " << thread_count << " threads.";
+  return msg.GetString();
+}
+# endif  // !GTEST_OS_WINDOWS
+
+// Flag characters for reporting a death test that did not die.
+static const char kDeathTestLived = 'L';
+static const char kDeathTestReturned = 'R';
+static const char kDeathTestThrew = 'T';
+static const char kDeathTestInternalError = 'I';
+
+// An enumeration describing all of the possible ways that a death test can
+// conclude.  DIED means that the process died while executing the test
+// code; LIVED means that process lived beyond the end of the test code;
+// RETURNED means that the test statement attempted to execute a return
+// statement, which is not allowed; THREW means that the test statement
+// returned control by throwing an exception.  IN_PROGRESS means the test
+// has not yet concluded.
+// TODO(vladl@google.com): Unify names and possibly values for
+// AbortReason, DeathTestOutcome, and flag characters above.
+enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW };
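+
+// For example (illustrative): a child whose statement returns control
+// normally writes kDeathTestReturned ('R') to its pipe; the parent maps that
+// byte to the RETURNED outcome in ReadAndInterpretStatusByte() below and
+// fails the test, because a death test statement is required to die.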
+
+// Routine for aborting the program which is safe to call from an
+// exec-style death test child process, in which case the error
+// message is propagated back to the parent process.  Otherwise, the
+// message is simply printed to stderr.  In either case, the program
+// then exits with status 1.
+void DeathTestAbort(const std::string& message) {
+  // On a POSIX system, this function may be called from a threadsafe-style
+  // death test child process, which operates on a very small stack.  Use
+  // the heap for any additional non-minuscule memory requirements.
+  const InternalRunDeathTestFlag* const flag =
+      GetUnitTestImpl()->internal_run_death_test_flag();
+  if (flag != NULL) {
+    FILE* parent = posix::FDOpen(flag->write_fd(), "w");
+    fputc(kDeathTestInternalError, parent);
+    fprintf(parent, "%s", message.c_str());
+    fflush(parent);
+    _exit(1);
+  } else {
+    fprintf(stderr, "%s", message.c_str());
+    fflush(stderr);
+    posix::Abort();
+  }
+}
+
+// A replacement for CHECK that calls DeathTestAbort if the assertion
+// fails.
+# define GTEST_DEATH_TEST_CHECK_(expression) \
+  do { \
+    if (!::testing::internal::IsTrue(expression)) { \
+      DeathTestAbort( \
+          ::std::string("CHECK failed: File ") + __FILE__ +  ", line " \
+          + ::testing::internal::StreamableToString(__LINE__) + ": " \
+          + #expression); \
+    } \
+  } while (::testing::internal::AlwaysFalse())
+
+// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
+// evaluating any system call that fulfills two conditions: it must return
+// -1 on failure, and set errno to EINTR when it is interrupted and
+// should be tried again.  The macro expands to a loop that repeatedly
+// evaluates the expression as long as it evaluates to -1 and sets
+// errno to EINTR.  If the expression evaluates to -1 but errno is
+// something other than EINTR, DeathTestAbort is called.
+# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
+  do { \
+    int gtest_retval; \
+    do { \
+      gtest_retval = (expression); \
+    } while (gtest_retval == -1 && errno == EINTR); \
+    if (gtest_retval == -1) { \
+      DeathTestAbort( \
+          ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
+          + ::testing::internal::StreamableToString(__LINE__) + ": " \
+          + #expression + " != -1"); \
+    } \
+  } while (::testing::internal::AlwaysFalse())
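+
+// Illustrative usage (not from the Google Test sources): a write that may be
+// interrupted by a signal is retried until it succeeds or fails with an
+// errno other than EINTR:
+//
+//   GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(fd, "x", 1));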
+
+// Returns the message describing the last system error in errno.
+std::string GetLastErrnoDescription() {
+  return errno == 0 ? "" : posix::StrError(errno);
+}
+
+// This is called from a death test parent process to read a failure
+// message from the death test child process and log it with the FATAL
+// severity. On Windows, the message is read from a pipe handle. On other
+// platforms, it is read from a file descriptor.
+static void FailFromInternalError(int fd) {
+  Message error;
+  char buffer[256];
+  int num_read;
+
+  do {
+    while ((num_read = posix::Read(fd, buffer, 255)) > 0) {
+      buffer[num_read] = '\0';
+      error << buffer;
+    }
+  } while (num_read == -1 && errno == EINTR);
+
+  if (num_read == 0) {
+    GTEST_LOG_(FATAL) << error.GetString();
+  } else {
+    const int last_error = errno;
+    GTEST_LOG_(FATAL) << "Error while reading death test internal: "
+                      << GetLastErrnoDescription() << " [" << last_error << "]";
+  }
+}
+
+// Death test constructor.  Increments the running death test count
+// for the current test.
+DeathTest::DeathTest() {
+  TestInfo* const info = GetUnitTestImpl()->current_test_info();
+  if (info == NULL) {
+    DeathTestAbort("Cannot run a death test outside of a TEST or "
+                   "TEST_F construct");
+  }
+}
+
+// Creates and returns a death test by dispatching to the current
+// death test factory.
+bool DeathTest::Create(const char* statement, const RE* regex,
+                       const char* file, int line, DeathTest** test) {
+  return GetUnitTestImpl()->death_test_factory()->Create(
+      statement, regex, file, line, test);
+}
+
+const char* DeathTest::LastMessage() {
+  return last_death_test_message_.c_str();
+}
+
+void DeathTest::set_last_death_test_message(const std::string& message) {
+  last_death_test_message_ = message;
+}
+
+std::string DeathTest::last_death_test_message_;
+
+// Provides cross platform implementation for some death functionality.
+class DeathTestImpl : public DeathTest {
+ protected:
+  DeathTestImpl(const char* a_statement, const RE* a_regex)
+      : statement_(a_statement),
+        regex_(a_regex),
+        spawned_(false),
+        status_(-1),
+        outcome_(IN_PROGRESS),
+        read_fd_(-1),
+        write_fd_(-1) {}
+
+  // read_fd_ is expected to be closed and cleared by a derived class.
+  ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }
+
+  void Abort(AbortReason reason);
+  virtual bool Passed(bool status_ok);
+
+  const char* statement() const { return statement_; }
+  const RE* regex() const { return regex_; }
+  bool spawned() const { return spawned_; }
+  void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
+  int status() const { return status_; }
+  void set_status(int a_status) { status_ = a_status; }
+  DeathTestOutcome outcome() const { return outcome_; }
+  void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; }
+  int read_fd() const { return read_fd_; }
+  void set_read_fd(int fd) { read_fd_ = fd; }
+  int write_fd() const { return write_fd_; }
+  void set_write_fd(int fd) { write_fd_ = fd; }
+
+  // Called in the parent process only. Reads the result code of the death
+  // test child process via a pipe, interprets it to set the outcome_
+  // member, and closes read_fd_.  Outputs diagnostics and terminates in
+  // case of unexpected codes.
+  void ReadAndInterpretStatusByte();
+
+ private:
+  // The textual content of the code this object is testing.  This class
+  // doesn't own this string and should not attempt to delete it.
+  const char* const statement_;
+  // The regular expression which test output must match.  DeathTestImpl
+  // doesn't own this object and should not attempt to delete it.
+  const RE* const regex_;
+  // True if the death test child process has been successfully spawned.
+  bool spawned_;
+  // The exit status of the child process.
+  int status_;
+  // How the death test concluded.
+  DeathTestOutcome outcome_;
+  // Descriptor to the read end of the pipe to the child process.  It is
+  // always -1 in the child process.  The child keeps its write end of the
+  // pipe in write_fd_.
+  int read_fd_;
+  // Descriptor to the child's write end of the pipe to the parent process.
+  // It is always -1 in the parent process.  The parent keeps its end of the
+  // pipe in read_fd_.
+  int write_fd_;
+};
+
+// Called in the parent process only. Reads the result code of the death
+// test child process via a pipe, interprets it to set the outcome_
+// member, and closes read_fd_.  Outputs diagnostics and terminates in
+// case of unexpected codes.
+void DeathTestImpl::ReadAndInterpretStatusByte() {
+  char flag;
+  int bytes_read;
+
+  // The read() here blocks until data is available (signifying the
+  // failure of the death test) or until the pipe is closed (signifying
+  // its success), so it's okay to call this in the parent before
+  // the child process has exited.
+  do {
+    bytes_read = posix::Read(read_fd(), &flag, 1);
+  } while (bytes_read == -1 && errno == EINTR);
+
+  if (bytes_read == 0) {
+    set_outcome(DIED);
+  } else if (bytes_read == 1) {
+    switch (flag) {
+      case kDeathTestReturned:
+        set_outcome(RETURNED);
+        break;
+      case kDeathTestThrew:
+        set_outcome(THREW);
+        break;
+      case kDeathTestLived:
+        set_outcome(LIVED);
+        break;
+      case kDeathTestInternalError:
+        FailFromInternalError(read_fd());  // Does not return.
+        break;
+      default:
+        GTEST_LOG_(FATAL) << "Death test child process reported "
+                          << "unexpected status byte ("
+                          << static_cast<unsigned int>(flag) << ")";
+    }
+  } else {
+    GTEST_LOG_(FATAL) << "Read from death test child process failed: "
+                      << GetLastErrnoDescription();
+  }
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd()));
+  set_read_fd(-1);
+}
+
+// Signals that the death test code which should have exited, didn't.
+// Should be called only in a death test child process.
+// Writes a status byte to the child's status file descriptor, then
+// calls _exit(1).
+void DeathTestImpl::Abort(AbortReason reason) {
+  // The parent process considers the death test to be a failure if
+  // it finds any data in our pipe.  So, here we write a single flag byte
+  // to the pipe, then exit.
+  const char status_ch =
+      reason == TEST_DID_NOT_DIE ? kDeathTestLived :
+      reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned;
+
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
+  // We are leaking the descriptor here because on some platforms (e.g.,
+  // when built as a Windows DLL), destructors of global objects will still
+  // run after calling _exit(). On such systems, write_fd_ will be
+  // indirectly closed from the destructor of UnitTestImpl, causing double
+  // close if it is also closed here. On debug configurations, double close
+  // may assert. As there are no in-process buffers to flush here, we are
+  // relying on the OS to close the descriptor after the process terminates
+  // when the destructors are not run.
+  _exit(1);  // Exits without any normal exit hooks (we were supposed to crash)
+}
+
+// Returns an indented copy of stderr output for a death test.
+// This makes distinguishing death test output lines from regular log lines
+// much easier.
+static ::std::string FormatDeathTestOutput(const ::std::string& output) {
+  ::std::string ret;
+  for (size_t at = 0; ; ) {
+    const size_t line_end = output.find('\n', at);
+    ret += "[  DEATH   ] ";
+    if (line_end == ::std::string::npos) {
+      ret += output.substr(at);
+      break;
+    }
+    ret += output.substr(at, line_end + 1 - at);
+    at = line_end + 1;
+  }
+  return ret;
+}
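+
+// For example (illustrative), FormatDeathTestOutput("first\nsecond\n")
+// returns
+//
+//   "[  DEATH   ] first\n[  DEATH   ] second\n[  DEATH   ] "
+//
+// so every line of the child's captured stderr is prefixed and easy to tell
+// apart from the parent's own output.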
+
+// Assesses the success or failure of a death test, using both private
+// members which have previously been set, and one argument:
+//
+// Private data members:
+//   outcome:  An enumeration describing how the death test
+//             concluded: DIED, LIVED, THREW, or RETURNED.  The death test
+//             fails in the latter three cases.
+//   status:   The exit status of the child process. On *nix, it is in the
+//             format specified by wait(2). On Windows, this is the
+//             value supplied to the ExitProcess() API or a numeric code
+//             of the exception that terminated the program.
+//   regex:    A regular expression object to be applied to
+//             the test's captured standard error output; the death test
+//             fails if it does not match.
+//
+// Argument:
+//   status_ok: true if exit_status is acceptable in the context of
+//              this particular death test, which fails if it is false
+//
+// Returns true iff all of the above conditions are met.  Otherwise, the
+// first failing condition, in the order given above, is the one that is
+// reported. Also sets the last death test message string.
+bool DeathTestImpl::Passed(bool status_ok) {
+  if (!spawned())
+    return false;
+
+  const std::string error_message = GetCapturedStderr();
+
+  bool success = false;
+  Message buffer;
+
+  buffer << "Death test: " << statement() << "\n";
+  switch (outcome()) {
+    case LIVED:
+      buffer << "    Result: failed to die.\n"
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+      break;
+    case THREW:
+      buffer << "    Result: threw an exception.\n"
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+      break;
+    case RETURNED:
+      buffer << "    Result: illegal return in test statement.\n"
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+      break;
+    case DIED:
+      if (status_ok) {
+        const bool matched = RE::PartialMatch(error_message.c_str(), *regex());
+        if (matched) {
+          success = true;
+        } else {
+          buffer << "    Result: died but not with expected error.\n"
+                 << "  Expected: " << regex()->pattern() << "\n"
+                 << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+        }
+      } else {
+        buffer << "    Result: died but not with expected exit code:\n"
+               << "            " << ExitSummary(status()) << "\n"
+               << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+      }
+      break;
+    case IN_PROGRESS:
+    default:
+      GTEST_LOG_(FATAL)
+          << "DeathTest::Passed somehow called before conclusion of test";
+  }
+
+  DeathTest::set_last_death_test_message(buffer.GetString());
+  return success;
+}
+
+# if GTEST_OS_WINDOWS
+// WindowsDeathTest implements death tests on Windows. Due to the
+// specifics of starting new processes on Windows, death tests there are
+// always threadsafe, and Google Test considers the
+// --gtest_death_test_style=fast setting to be equivalent to
+// --gtest_death_test_style=threadsafe there.
+//
+// A few implementation notes:  Like the Linux version, the Windows
+// implementation uses pipes for child-to-parent communication. But due to
+// the specifics of pipes on Windows, some extra steps are required:
+//
+// 1. The parent creates a communication pipe and stores handles to both
+//    ends of it.
+// 2. The parent starts the child and provides it with the information
+//    necessary to acquire the handle to the write end of the pipe.
+// 3. The child acquires the write end of the pipe and signals the parent
+//    using a Windows event.
+// 4. Now the parent can release the write end of the pipe on its side. If
+//    this is done before step 3, the object's reference count goes down to
+//    0 and it is destroyed, preventing the child from acquiring it. The
+//    parent now has to release it, or read operations on the read end of
+//    the pipe will not return when the child terminates.
+// 5. The parent reads the child's output (outcome code and any possible
+//    error messages) from the pipe, captures the child's stderr, and then
+//    determines whether to fail the test.
+//
+// Note: to distinguish Win32 API calls from the local method and function
+// calls, the former are explicitly resolved in the global namespace.
+//
+class WindowsDeathTest : public DeathTestImpl {
+ public:
+  WindowsDeathTest(const char* a_statement,
+                   const RE* a_regex,
+                   const char* file,
+                   int line)
+      : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {}
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+  virtual TestRole AssumeRole();
+
+ private:
+  // The name of the file in which the death test is located.
+  const char* const file_;
+  // The line number on which the death test is located.
+  const int line_;
+  // Handle to the write end of the pipe to the child process.
+  AutoHandle write_handle_;
+  // Child process handle.
+  AutoHandle child_handle_;
+  // Event the child process uses to signal the parent that it has
+  // acquired the handle to the write end of the pipe. After seeing this
+  // event the parent can release its own handles to make sure its
+  // ReadFile() calls return when the child terminates.
+  AutoHandle event_handle_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists.  As a side effect, sets the
+// outcome data member.
+int WindowsDeathTest::Wait() {
+  if (!spawned())
+    return 0;
+
+  // Wait until the child either signals that it has acquired the write end
+  // of the pipe or it dies.
+  const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
+  switch (::WaitForMultipleObjects(2,
+                                   wait_handles,
+                                   FALSE,  // Waits for any of the handles.
+                                   INFINITE)) {
+    case WAIT_OBJECT_0:
+    case WAIT_OBJECT_0 + 1:
+      break;
+    default:
+      GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
+  }
+
+  // The child has acquired the write end of the pipe or exited.
+  // We release the handle on our side and continue.
+  write_handle_.Reset();
+  event_handle_.Reset();
+
+  ReadAndInterpretStatusByte();
+
+  // Waits for the child process to exit if it hasn't already. This
+  // returns immediately if the child has already exited, regardless of
+  // whether previous calls to WaitForMultipleObjects synchronized on this
+  // handle or not.
+  GTEST_DEATH_TEST_CHECK_(
+      WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
+                                             INFINITE));
+  DWORD status_code;
+  GTEST_DEATH_TEST_CHECK_(
+      ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
+  child_handle_.Reset();
+  set_status(static_cast<int>(status_code));
+  return status();
+}
+
+// The AssumeRole process for a Windows death test.  It creates a child
+// process with the same executable as the current process to run the
+// death test.  The child process is given the --gtest_filter and
+// --gtest_internal_run_death_test flags such that it knows to run the
+// current death test only.
+DeathTest::TestRole WindowsDeathTest::AssumeRole() {
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != NULL) {
+    // ParseInternalRunDeathTestFlag() has performed all the necessary
+    // processing.
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  // WindowsDeathTest uses an anonymous pipe to communicate results of
+  // a death test.
+  SECURITY_ATTRIBUTES handles_are_inheritable = {
+    sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };
+  HANDLE read_handle, write_handle;
+  GTEST_DEATH_TEST_CHECK_(
+      ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
+                   0)  // Default buffer size.
+      != FALSE);
+  set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
+                                O_RDONLY));
+  write_handle_.Reset(write_handle);
+  event_handle_.Reset(::CreateEvent(
+      &handles_are_inheritable,
+      TRUE,    // The event is manual-reset (it does not reset automatically).
+      FALSE,   // The initial state is non-signaled.
+      NULL));  // The event is unnamed.
+  GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL);
+  const std::string filter_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" +
+      info->test_case_name() + "." + info->name();
+  const std::string internal_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag +
+      "=" + file_ + "|" + StreamableToString(line_) + "|" +
+      StreamableToString(death_test_index) + "|" +
+      StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
+      // size_t has the same width as pointers on both 32-bit and 64-bit
+      // Windows platforms.
+      // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
+      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) +
+      "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+
+  char executable_path[_MAX_PATH + 1];  // NOLINT
+  GTEST_DEATH_TEST_CHECK_(
+      _MAX_PATH + 1 != ::GetModuleFileNameA(NULL,
+                                            executable_path,
+                                            _MAX_PATH));
+
+  std::string command_line =
+      std::string(::GetCommandLineA()) + " " + filter_flag + " \"" +
+      internal_flag + "\"";
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // Flush the log buffers since the log streams are shared with the child.
+  FlushInfoLog();
+
+  // The child process will share the standard handles with the parent.
+  STARTUPINFOA startup_info;
+  memset(&startup_info, 0, sizeof(STARTUPINFO));
+  startup_info.dwFlags = STARTF_USESTDHANDLES;
+  startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE);
+  startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE);
+  startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE);
+
+  PROCESS_INFORMATION process_info;
+  GTEST_DEATH_TEST_CHECK_(::CreateProcessA(
+      executable_path,
+      const_cast<char*>(command_line.c_str()),
+      NULL,   // Returned process handle is not inheritable.
+      NULL,   // Returned thread handle is not inheritable.
+      TRUE,   // Child inherits all inheritable handles (for write_handle_).
+      0x0,    // Default creation flags.
+      NULL,   // Inherit the parent's environment.
+      UnitTest::GetInstance()->original_working_dir(),
+      &startup_info,
+      &process_info) != FALSE);
+  child_handle_.Reset(process_info.hProcess);
+  ::CloseHandle(process_info.hThread);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+# else  // We are not on Windows.
+
+// ForkingDeathTest provides implementations for most of the abstract
+// methods of the DeathTest interface.  Only the AssumeRole method is
+// left undefined.
+class ForkingDeathTest : public DeathTestImpl {
+ public:
+  ForkingDeathTest(const char* statement, const RE* regex);
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+
+ protected:
+  void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; }
+
+ private:
+  // PID of child process during death test; 0 in the child process itself.
+  pid_t child_pid_;
+};
+
+// Constructs a ForkingDeathTest.
+ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex)
+    : DeathTestImpl(a_statement, a_regex),
+      child_pid_(-1) {}
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists.  As a side effect, sets the
+// outcome data member.
+int ForkingDeathTest::Wait() {
+  if (!spawned())
+    return 0;
+
+  ReadAndInterpretStatusByte();
+
+  int status_value;
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0));
+  set_status(status_value);
+  return status_value;
+}
+
+// A concrete death test class that forks, then immediately runs the test
+// in the child process.
+class NoExecDeathTest : public ForkingDeathTest {
+ public:
+  NoExecDeathTest(const char* a_statement, const RE* a_regex) :
+      ForkingDeathTest(a_statement, a_regex) { }
+  virtual TestRole AssumeRole();
+};
+
+// The AssumeRole process for a fork-and-run death test.  It implements a
+// straightforward fork, with a simple pipe to transmit the status byte.
+DeathTest::TestRole NoExecDeathTest::AssumeRole() {
+  const size_t thread_count = GetThreadCount();
+  if (thread_count != 1) {
+    GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count);
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+
+  DeathTest::set_last_death_test_message("");
+  CaptureStderr();
+  // When we fork the process below, the log file buffers are copied, but the
+  // file descriptors are shared.  We flush all log files here so that closing
+  // the file descriptors in the child process doesn't throw off the
+  // synchronization between descriptors and buffers in the parent process.
+  // This is as close to the fork as possible to avoid a race condition in case
+  // there are multiple threads running before the death test, and another
+  // thread writes to the log file.
+  FlushInfoLog();
+
+  const pid_t child_pid = fork();
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  set_child_pid(child_pid);
+  if (child_pid == 0) {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0]));
+    set_write_fd(pipe_fd[1]);
+    // Redirects all logging to stderr in the child process to prevent
+    // concurrent writes to the log files.  We capture stderr in the parent
+    // process and append the child process' output to a log.
+    LogToStderr();
+    // Event forwarding to the listeners of the event listener API must be
+    // shut down in death test subprocesses.
+    GetUnitTestImpl()->listeners()->SuppressEventForwarding();
+    g_in_fast_death_test_child = true;
+    return EXECUTE_TEST;
+  } else {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+    set_read_fd(pipe_fd[0]);
+    set_spawned(true);
+    return OVERSEE_TEST;
+  }
+}
+
+// A concrete death test class that forks and re-executes the main
+// program from the beginning, with command-line flags set that cause
+// only this specific death test to be run.
+class ExecDeathTest : public ForkingDeathTest {
+ public:
+  ExecDeathTest(const char* a_statement, const RE* a_regex,
+                const char* file, int line) :
+      ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { }
+  virtual TestRole AssumeRole();
+ private:
+  static ::std::vector<testing::internal::string>
+  GetArgvsForDeathTestChildProcess() {
+    ::std::vector<testing::internal::string> args = GetInjectableArgvs();
+    return args;
+  }
+  // The name of the file in which the death test is located.
+  const char* const file_;
+  // The line number on which the death test is located.
+  const int line_;
+};
+
+// Utility class for accumulating command-line arguments.
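+// For example, after AddArgument("--foo") and AddArgument("--bar"), Argv()
+// returns a NULL-terminated array holding {"--foo", "--bar", NULL}.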
+class Arguments {
+ public:
+  Arguments() {
+    args_.push_back(NULL);
+  }
+
+  ~Arguments() {
+    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
+         ++i) {
+      free(*i);
+    }
+  }
+  void AddArgument(const char* argument) {
+    args_.insert(args_.end() - 1, posix::StrDup(argument));
+  }
+
+  template <typename Str>
+  void AddArguments(const ::std::vector<Str>& arguments) {
+    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
+         i != arguments.end();
+         ++i) {
+      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
+    }
+  }
+  char* const* Argv() {
+    return &args_[0];
+  }
+
+ private:
+  std::vector<char*> args_;
+};
+
+// A struct that encompasses the arguments to the child process of a
+// threadsafe-style death test process.
+struct ExecDeathTestArgs {
+  char* const* argv;  // Command-line arguments for the child's call to exec
+  int close_fd;       // File descriptor to close; the read end of a pipe
+};
+
+#  if GTEST_OS_MAC
+inline char** GetEnviron() {
+  // When Google Test is built as a framework on MacOS X, the environ variable
+  // is unavailable. Apple's documentation (man environ) recommends using
+  // _NSGetEnviron() instead.
+  return *_NSGetEnviron();
+}
+#  else
+// Some POSIX platforms expect you to declare environ. extern "C" makes
+// it reside in the global namespace.
+extern "C" char** environ;
+inline char** GetEnviron() { return environ; }
+#  endif  // GTEST_OS_MAC
+
+#  if !GTEST_OS_QNX
+// The main function for a threadsafe-style death test child process.
+// This function is called in a clone()-ed process and thus must avoid
+// any potentially unsafe operations like malloc or libc functions.
+static int ExecDeathTestChildMain(void* child_arg) {
+  ExecDeathTestArgs* const args = static_cast<ExecDeathTestArgs*>(child_arg);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
+
+  // We need to execute the test program in the same environment where
+  // it was originally invoked.  Therefore we change to the original
+  // working directory first.
+  const char* const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
+                   GetLastErrnoDescription());
+    return EXIT_FAILURE;
+  }
+
+  // We can safely call execve() as it's a direct system call.  We
+  // cannot use execvp() as it's a libc function and thus potentially
+  // unsafe.  Since execve() doesn't search the PATH, the user must
+  // invoke the test program via a valid path that contains at least
+  // one path separator.
+  execve(args->argv[0], args->argv, GetEnviron());
+  DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
+                 original_dir + " failed: " +
+                 GetLastErrnoDescription());
+  return EXIT_FAILURE;
+}
+#  endif  // !GTEST_OS_QNX
+
+// Two utility routines that together determine the direction the stack
+// grows.
+// This could be accomplished more elegantly by a single recursive
+// function, but we want to guard against the unlikely possibility of
+// a smart compiler optimizing the recursion away.
+//
+// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
+// StackLowerThanAddress into StackGrowsDown, which then doesn't give the
+// correct answer.
+void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_;
+void StackLowerThanAddress(const void* ptr, bool* result) {
+  int dummy;
+  *result = (&dummy < ptr);
+}
+
+bool StackGrowsDown() {
+  int dummy;
+  bool result;
+  StackLowerThanAddress(&dummy, &result);
+  return result;
+}
+
+// Spawns a child process with the same executable as the current process in
+// a thread-safe manner and instructs it to run the death test.  The
+// implementation uses fork(2) + exec.  On systems where clone(2) is
+// available, it is used instead, being slightly more thread-safe.  On QNX,
+// fork supports only single-threaded environments, so this function uses
+// spawn(2) there instead.  The function dies with an error message if
+// anything goes wrong.
+static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
+  ExecDeathTestArgs args = { argv, close_fd };
+  pid_t child_pid = -1;
+
+#  if GTEST_OS_QNX
+  // Obtains the current directory and sets it to be closed in the child
+  // process.
+  const int cwd_fd = open(".", O_RDONLY);
+  GTEST_DEATH_TEST_CHECK_(cwd_fd != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC));
+  // We need to execute the test program in the same environment where
+  // it was originally invoked.  Therefore we change to the original
+  // working directory first.
+  const char* const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
+                   GetLastErrnoDescription());
+    return EXIT_FAILURE;
+  }
+
+  int fd_flags;
+  // Set close_fd to be closed after spawn.
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD,
+                                        fd_flags | FD_CLOEXEC));
+  struct inheritance inherit = {0};
+  // spawn is a system call.
+  child_pid = spawn(args.argv[0], 0, NULL, &inherit, args.argv, GetEnviron());
+  // Restores the current working directory.
+  GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
+
+#  else   // GTEST_OS_QNX
+#   if GTEST_OS_LINUX
+  // When a SIGPROF signal is received while fork() or clone() are executing,
+  // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
+  // it after the call to fork()/clone() is complete.
+  struct sigaction saved_sigprof_action;
+  struct sigaction ignore_sigprof_action;
+  memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
+  sigemptyset(&ignore_sigprof_action.sa_mask);
+  ignore_sigprof_action.sa_handler = SIG_IGN;
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction(
+      SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+#   endif  // GTEST_OS_LINUX
+
+#   if GTEST_HAS_CLONE
+  const bool use_fork = GTEST_FLAG(death_test_use_fork);
+
+  if (!use_fork) {
+    static const bool stack_grows_down = StackGrowsDown();
+    const size_t stack_size = getpagesize();
+    // MAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
+    void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
+                             MAP_ANON | MAP_PRIVATE, -1, 0);
+    GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
+
+    // Maximum stack alignment in bytes:  For a downward-growing stack, this
+    // amount is subtracted from the size of the stack space to get an address
+    // that is within the stack space and is aligned on all systems we care
+    // about.  As far as I know there is no ABI with stack alignment greater
+    // than 64.  We assume stack and stack_size already have alignment of
+    // kMaxStackAlignment.
+    const size_t kMaxStackAlignment = 64;
+    void* const stack_top =
+        static_cast<char*>(stack) +
+            (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+    GTEST_DEATH_TEST_CHECK_(stack_size > kMaxStackAlignment &&
+        reinterpret_cast<intptr_t>(stack_top) % kMaxStackAlignment == 0);
+
+    child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);
+
+    GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
+  }
+#   else
+  const bool use_fork = true;
+#   endif  // GTEST_HAS_CLONE
+
+  if (use_fork && (child_pid = fork()) == 0) {
+      ExecDeathTestChildMain(&args);
+      _exit(0);
+  }
+#  endif  // GTEST_OS_QNX
+#  if GTEST_OS_LINUX
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      sigaction(SIGPROF, &saved_sigprof_action, NULL));
+#  endif  // GTEST_OS_LINUX
+
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  return child_pid;
+}
+
+// The AssumeRole process for a fork-and-exec death test.  It re-executes the
+// main program from the beginning, setting the --gtest_filter
+// and --gtest_internal_run_death_test flags to cause only the current
+// death test to be re-run.
+DeathTest::TestRole ExecDeathTest::AssumeRole() {
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != NULL) {
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+  // Clear the close-on-exec flag on the write end of the pipe, lest
+  // it be closed when the child process does an exec:
+  GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
+
+  const std::string filter_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "="
+      + info->test_case_name() + "." + info->name();
+  const std::string internal_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
+      + file_ + "|" + StreamableToString(line_) + "|"
+      + StreamableToString(death_test_index) + "|"
+      + StreamableToString(pipe_fd[1]);
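+  // For illustration only (hypothetical values), the resulting flag looks
+  // like:
+  //   --gtest_internal_run_death_test=foo_test.cc|42|1|5
+  // i.e. file|line|death-test index|write end of the status pipe.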
+  Arguments args;
+  args.AddArguments(GetArgvsForDeathTestChildProcess());
+  args.AddArgument(filter_flag.c_str());
+  args.AddArgument(internal_flag.c_str());
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // See the comment in NoExecDeathTest::AssumeRole for why the next line
+  // is necessary.
+  FlushInfoLog();
+
+  const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+  set_child_pid(child_pid);
+  set_read_fd(pipe_fd[0]);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+# endif  // !GTEST_OS_WINDOWS
+
+// Creates a concrete DeathTest-derived class that depends on the
+// --gtest_death_test_style flag, and sets the pointer pointed to
+// by the "test" argument to its address.  If the test should be
+// skipped, sets that pointer to NULL.  Returns true, unless the
+// flag is set to an invalid value.
+bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex,
+                                     const char* file, int line,
+                                     DeathTest** test) {
+  UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const int death_test_index = impl->current_test_info()
+      ->increment_death_test_count();
+
+  if (flag != NULL) {
+    if (death_test_index > flag->index()) {
+      DeathTest::set_last_death_test_message(
+          "Death test count (" + StreamableToString(death_test_index)
+          + ") somehow exceeded expected maximum ("
+          + StreamableToString(flag->index()) + ")");
+      return false;
+    }
+
+    if (!(flag->file() == file && flag->line() == line &&
+          flag->index() == death_test_index)) {
+      *test = NULL;
+      return true;
+    }
+  }
+
+# if GTEST_OS_WINDOWS
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
+      GTEST_FLAG(death_test_style) == "fast") {
+    *test = new WindowsDeathTest(statement, regex, file, line);
+  }
+
+# else
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe") {
+    *test = new ExecDeathTest(statement, regex, file, line);
+  } else if (GTEST_FLAG(death_test_style) == "fast") {
+    *test = new NoExecDeathTest(statement, regex);
+  }
+
+# endif  // GTEST_OS_WINDOWS
+
+  else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
+    DeathTest::set_last_death_test_message(
+        "Unknown death test style \"" + GTEST_FLAG(death_test_style)
+        + "\" encountered");
+    return false;
+  }
+
+  return true;
+}
+
+// Splits a given string on a given delimiter, populating a given
+// vector with the fields.  GTEST_HAS_DEATH_TEST implies that we have
+// ::std::string, so we can use it here.
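+// For example, SplitString("a|b|c", '|', &dest) leaves {"a", "b", "c"}
+// in *dest.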
+static void SplitString(const ::std::string& str, char delimiter,
+                        ::std::vector< ::std::string>* dest) {
+  ::std::vector< ::std::string> parsed;
+  ::std::string::size_type pos = 0;
+  while (::testing::internal::AlwaysTrue()) {
+    const ::std::string::size_type colon = str.find(delimiter, pos);
+    if (colon == ::std::string::npos) {
+      parsed.push_back(str.substr(pos));
+      break;
+    } else {
+      parsed.push_back(str.substr(pos, colon - pos));
+      pos = colon + 1;
+    }
+  }
+  dest->swap(parsed);
+}
+
+# if GTEST_OS_WINDOWS
+// Recreates the pipe and event handles from the provided parameters,
+// signals the event, and returns a file descriptor wrapped around the pipe
+// handle. This function is called in the child process only.
+int GetStatusFileDescriptor(unsigned int parent_process_id,
+                            size_t write_handle_as_size_t,
+                            size_t event_handle_as_size_t) {
+  AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
+                                                   FALSE,  // Non-inheritable.
+                                                   parent_process_id));
+  if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
+    DeathTestAbort("Unable to open parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  // TODO(vladl@google.com): Replace the following check with a
+  // compile-time assertion when available.
+  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
+
+  const HANDLE write_handle =
+      reinterpret_cast<HANDLE>(write_handle_as_size_t);
+  HANDLE dup_write_handle;
+
+  // The newly initialized handle is accessible only in the parent
+  // process. To obtain one accessible within the child, we need to use
+  // DuplicateHandle.
+  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
+                         ::GetCurrentProcess(), &dup_write_handle,
+                         0x0,    // Requested privileges ignored since
+                                 // DUPLICATE_SAME_ACCESS is used.
+                         FALSE,  // Request non-inheritable handle.
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const HANDLE event_handle = reinterpret_cast<HANDLE>(event_handle_as_size_t);
+  HANDLE dup_event_handle;
+
+  if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
+                         ::GetCurrentProcess(), &dup_event_handle,
+                         0x0,
+                         FALSE,
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the event handle " +
+                   StreamableToString(event_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const int write_fd =
+      ::_open_osfhandle(reinterpret_cast<intptr_t>(dup_write_handle), O_APPEND);
+  if (write_fd == -1) {
+    DeathTestAbort("Unable to convert pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " to a file descriptor");
+  }
+
+  // Signals the parent that the write end of the pipe has been acquired
+  // so the parent can release its own write end.
+  ::SetEvent(dup_event_handle);
+
+  return write_fd;
+}
+# endif  // GTEST_OS_WINDOWS
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
+  if (GTEST_FLAG(internal_run_death_test) == "") return NULL;
+
+  // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
+  // can use it here.
+  int line = -1;
+  int index = -1;
+  ::std::vector< ::std::string> fields;
+  SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
+  int write_fd = -1;
+
+# if GTEST_OS_WINDOWS
+
+  unsigned int parent_process_id = 0;
+  size_t write_handle_as_size_t = 0;
+  size_t event_handle_as_size_t = 0;
+
+  if (fields.size() != 6
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)
+      || !ParseNaturalNumber(fields[3], &parent_process_id)
+      || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
+      || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG(internal_run_death_test));
+  }
+  write_fd = GetStatusFileDescriptor(parent_process_id,
+                                     write_handle_as_size_t,
+                                     event_handle_as_size_t);
+# else
+
+  if (fields.size() != 4
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)
+      || !ParseNaturalNumber(fields[3], &write_fd)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
+        + GTEST_FLAG(internal_run_death_test));
+  }
+
+# endif  // GTEST_OS_WINDOWS
+
+  return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
+}
+
+}  // namespace internal
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace testing
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: keith.ray@gmail.com (Keith Ray)
+
+
+#include <stdlib.h>
+
+#if GTEST_OS_WINDOWS_MOBILE
+# include <windows.h>
+#elif GTEST_OS_WINDOWS
+# include <direct.h>
+# include <io.h>
+#elif GTEST_OS_SYMBIAN
+// Symbian OpenC has PATH_MAX in sys/syslimits.h
+# include <sys/syslimits.h>
+#else
+# include <limits.h>
+# include <climits>  // Some Linux distributions define PATH_MAX here.
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+#if GTEST_OS_WINDOWS
+# define GTEST_PATH_MAX_ _MAX_PATH
+#elif defined(PATH_MAX)
+# define GTEST_PATH_MAX_ PATH_MAX
+#elif defined(_XOPEN_PATH_MAX)
+# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+#else
+# define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+#endif  // GTEST_OS_WINDOWS
+
+
+namespace testing {
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+// On Windows, '\\' is the standard path separator, but many tools and the
+// Windows API also accept '/' as an alternate path separator. Unless otherwise
+// noted, a file path can contain either kind of path separators, or a mixture
+// of them.
+const char kPathSeparator = '\\';
+const char kAlternatePathSeparator = '/';
+//const char kPathSeparatorString[] = "\\";
+const char kAlternatePathSeparatorString[] = "/";
+# if GTEST_OS_WINDOWS_MOBILE
+// Windows CE doesn't have a current directory. You should not use
+// the current directory in tests on Windows CE, but this at least
+// provides a reasonable fallback.
+const char kCurrentDirectoryString[] = "\\";
+// Windows CE doesn't define INVALID_FILE_ATTRIBUTES
+const DWORD kInvalidFileAttributes = 0xffffffff;
+# else
+const char kCurrentDirectoryString[] = ".\\";
+# endif  // GTEST_OS_WINDOWS_MOBILE
+#else
+const char kPathSeparator = '/';
+//const char kPathSeparatorString[] = "/";
+const char kCurrentDirectoryString[] = "./";
+#endif  // GTEST_OS_WINDOWS
+
+// Returns whether the given character is a valid path separator.
+static bool IsPathSeparator(char c) {
+#if GTEST_HAS_ALT_PATH_SEP_
+  return (c == kPathSeparator) || (c == kAlternatePathSeparator);
+#else
+  return c == kPathSeparator;
+#endif
+}
+
+// Returns the current working directory, or "" if unsuccessful.
+FilePath FilePath::GetCurrentDir() {
+#if GTEST_OS_WINDOWS_MOBILE
+  // Windows CE doesn't have a current directory, so we just return
+  // something reasonable.
+  return FilePath(kCurrentDirectoryString);
+#elif GTEST_OS_WINDOWS
+  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+  return FilePath(_getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd);
+#else
+  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+  return FilePath(getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns a copy of the FilePath with the case-insensitive extension removed.
+// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
+// FilePath("dir/file"). If a case-insensitive extension is not
+// found, returns a copy of the original FilePath.
+FilePath FilePath::RemoveExtension(const char* extension) const {
+  const std::string dot_extension = std::string(".") + extension;
+  if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) {
+    return FilePath(pathname_.substr(
+        0, pathname_.length() - dot_extension.length()));
+  }
+  return *this;
+}
+
+// Returns a pointer to the last occurrence of a valid path separator in
+// the FilePath. On Windows, for example, both '/' and '\' are valid path
+// separators. Returns NULL if no path separator was found.
+const char* FilePath::FindLastPathSeparator() const {
+  const char* const last_sep = strrchr(c_str(), kPathSeparator);
+#if GTEST_HAS_ALT_PATH_SEP_
+  const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator);
+  // Comparing two pointers of which only one is NULL is undefined.
+  if (last_alt_sep != NULL &&
+      (last_sep == NULL || last_alt_sep > last_sep)) {
+    return last_alt_sep;
+  }
+#endif
+  return last_sep;
+}
+
+// Returns a copy of the FilePath with the directory part removed.
+// Example: FilePath("path/to/file").RemoveDirectoryName() returns
+// FilePath("file"). If there is no directory part ("just_a_file"), it returns
+// the FilePath unmodified. If there is no file part ("just_a_dir/") it
+// returns an empty FilePath ("").
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveDirectoryName() const {
+  const char* const last_sep = FindLastPathSeparator();
+  return last_sep ? FilePath(last_sep + 1) : *this;
+}
+
+// RemoveFileName returns the directory path with the filename removed.
+// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+// If the FilePath is "a_file" or "/a_file", RemoveFileName returns
+// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does
+// not have a file, like "just/a/dir/", it returns the FilePath unmodified.
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveFileName() const {
+  const char* const last_sep = FindLastPathSeparator();
+  std::string dir;
+  if (last_sep) {
+    dir = std::string(c_str(), last_sep + 1 - c_str());
+  } else {
+    dir = kCurrentDirectoryString;
+  }
+  return FilePath(dir);
+}
+
+// Helper functions for naming files in a directory for xml output.
+
+// Given directory = "dir", base_name = "test", number = 0,
+// extension = "xml", returns "dir/test.xml". If number is greater
+// than zero (e.g., 12), returns "dir/test_12.xml".
+// On Windows platform, uses \ as the separator rather than /.
+FilePath FilePath::MakeFileName(const FilePath& directory,
+                                const FilePath& base_name,
+                                int number,
+                                const char* extension) {
+  std::string file;
+  if (number == 0) {
+    file = base_name.string() + "." + extension;
+  } else {
+    file = base_name.string() + "_" + StreamableToString(number)
+        + "." + extension;
+  }
+  return ConcatPaths(directory, FilePath(file));
+}
+
+// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml".
+// On Windows, uses \ as the separator rather than /.
+FilePath FilePath::ConcatPaths(const FilePath& directory,
+                               const FilePath& relative_path) {
+  if (directory.IsEmpty())
+    return relative_path;
+  const FilePath dir(directory.RemoveTrailingPathSeparator());
+  return FilePath(dir.string() + kPathSeparator + relative_path.string());
+}
+
+// Returns true if pathname describes something findable in the file-system,
+// either a file, directory, or whatever.
+bool FilePath::FileOrDirectoryExists() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
+  const DWORD attributes = GetFileAttributes(unicode);
+  delete [] unicode;
+  return attributes != kInvalidFileAttributes;
+#else
+  posix::StatStruct file_stat;
+  return posix::Stat(pathname_.c_str(), &file_stat) == 0;
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns true if pathname describes a directory in the file-system
+// that exists.
+bool FilePath::DirectoryExists() const {
+  bool result = false;
+#if GTEST_OS_WINDOWS
+  // Don't strip off trailing separator if path is a root directory on
+  // Windows (like "C:\\").
+  const FilePath& path(IsRootDirectory() ? *this :
+                                           RemoveTrailingPathSeparator());
+#else
+  const FilePath& path(*this);
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+  LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
+  const DWORD attributes = GetFileAttributes(unicode);
+  delete [] unicode;
+  if ((attributes != kInvalidFileAttributes) &&
+      (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+    result = true;
+  }
+#else
+  posix::StatStruct file_stat;
+  result = posix::Stat(path.c_str(), &file_stat) == 0 &&
+      posix::IsDir(file_stat);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  return result;
+}
+
+// Returns true if pathname describes a root directory. (Windows has one
+// root directory per disk drive.)
+bool FilePath::IsRootDirectory() const {
+#if GTEST_OS_WINDOWS
+  // TODO(wan@google.com): on Windows a network share like
+  // \\server\share can be a root directory, although it cannot be the
+  // current directory.  Handle this properly.
+  return pathname_.length() == 3 && IsAbsolutePath();
+#else
+  return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]);
+#endif
+}
+
+// Returns true if pathname describes an absolute path.
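+// For example, "C:\\dir\\file" is absolute on Windows, while "/usr/local"
+// is absolute elsewhere.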
+bool FilePath::IsAbsolutePath() const {
+  const char* const name = pathname_.c_str();
+#if GTEST_OS_WINDOWS
+  return pathname_.length() >= 3 &&
+     ((name[0] >= 'a' && name[0] <= 'z') ||
+      (name[0] >= 'A' && name[0] <= 'Z')) &&
+     name[1] == ':' &&
+     IsPathSeparator(name[2]);
+#else
+  return IsPathSeparator(name[0]);
+#endif
+}
+
+// Returns a pathname for a file that does not currently exist. The pathname
+// will be directory/base_name.extension or
+// directory/base_name_<number>.extension if directory/base_name.extension
+// already exists. The number will be incremented until a pathname is found
+// that does not already exist.
+// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+// There could be a race condition if two or more processes are calling this
+// function at the same time -- they could both pick the same filename.
+FilePath FilePath::GenerateUniqueFileName(const FilePath& directory,
+                                          const FilePath& base_name,
+                                          const char* extension) {
+  FilePath full_pathname;
+  int number = 0;
+  do {
+    full_pathname.Set(MakeFileName(directory, base_name, number++, extension));
+  } while (full_pathname.FileOrDirectoryExists());
+  return full_pathname;
+}
+
+// Returns true if FilePath ends with a path separator, which indicates that
+// it is intended to represent a directory. Returns false otherwise.
+// This does NOT check that a directory (or file) actually exists.
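+// For example, "foo/bar/" is treated as a directory name, while "foo/bar"
+// is not.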
+bool FilePath::IsDirectory() const {
+  return !pathname_.empty() &&
+         IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]);
+}
+
+// Create directories so that path exists. Returns true if successful or if
+// the directories already exist; returns false if unable to create directories
+// for any reason.
+bool FilePath::CreateDirectoriesRecursively() const {
+  if (!this->IsDirectory()) {
+    return false;
+  }
+
+  if (pathname_.length() == 0 || this->DirectoryExists()) {
+    return true;
+  }
+
+  const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName());
+  return parent.CreateDirectoriesRecursively() && this->CreateFolder();
+}
+
+// Create the directory so that path exists. Returns true if successful or
+// if the directory already exists; returns false if unable to create the
+// directory for any reason, including if the parent directory does not
+// exist. Not named "CreateDirectory" because that's a macro on Windows.
+bool FilePath::CreateFolder() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  FilePath removed_sep(this->RemoveTrailingPathSeparator());
+  LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
+  int result = CreateDirectory(unicode, NULL) ? 0 : -1;
+  delete [] unicode;
+#elif GTEST_OS_WINDOWS
+  int result = _mkdir(pathname_.c_str());
+#else
+  int result = mkdir(pathname_.c_str(), 0777);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  if (result == -1) {
+    return this->DirectoryExists();  // An error is OK if the directory exists.
+  }
+  return true;  // No error.
+}
+
+// If input name has a trailing separator character, remove it and return the
+// name, otherwise return the name string unmodified.
+// On Windows platform, uses \ as the separator, other platforms use /.
+FilePath FilePath::RemoveTrailingPathSeparator() const {
+  return IsDirectory()
+      ? FilePath(pathname_.substr(0, pathname_.length() - 1))
+      : *this;
+}
+
+// Removes any redundant separators that might be in the pathname.
+// For example, "bar///foo" becomes "bar/foo". Does not eliminate other
+// redundancies that might be in a pathname involving "." or "..".
+// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share).
+void FilePath::Normalize() {
+  if (pathname_.c_str() == NULL) {
+    pathname_ = "";
+    return;
+  }
+  const char* src = pathname_.c_str();
+  char* const dest = new char[pathname_.length() + 1];
+  char* dest_ptr = dest;
+  memset(dest_ptr, 0, pathname_.length() + 1);
+
+  while (*src != '\0') {
+    *dest_ptr = *src;
+    if (!IsPathSeparator(*src)) {
+      src++;
+    } else {
+#if GTEST_HAS_ALT_PATH_SEP_
+      if (*dest_ptr == kAlternatePathSeparator) {
+        *dest_ptr = kPathSeparator;
+      }
+#endif
+      while (IsPathSeparator(*src))
+        src++;
+    }
+    dest_ptr++;
+  }
+  *dest_ptr = '\0';
+  pathname_ = dest;
+  delete[] dest;
+}
+
+}  // namespace internal
+}  // namespace testing
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#if GTEST_OS_WINDOWS_MOBILE
+# include <windows.h>  // For TerminateProcess()
+#elif GTEST_OS_WINDOWS
+# include <io.h>
+# include <sys/stat.h>
+#else
+# include <unistd.h>
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+#if GTEST_OS_MAC
+# include <mach/mach_init.h>
+# include <mach/task.h>
+# include <mach/vm_map.h>
+#endif  // GTEST_OS_MAC
+
+#if GTEST_OS_QNX
+# include <devctl.h>
+# include <sys/procfs.h>
+#endif  // GTEST_OS_QNX
+
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+namespace internal {
+
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif  // _MSC_VER
+
+#if GTEST_OS_MAC
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  const task_t task = mach_task_self();
+  mach_msg_type_number_t thread_count;
+  thread_act_array_t thread_list;
+  const kern_return_t status = task_threads(task, &thread_list, &thread_count);
+  if (status == KERN_SUCCESS) {
+    // task_threads allocates resources in thread_list and we need to free them
+    // to avoid leaks.
+    vm_deallocate(task,
+                  reinterpret_cast<vm_address_t>(thread_list),
+                  sizeof(thread_t) * thread_count);
+    return static_cast<size_t>(thread_count);
+  } else {
+    return 0;
+  }
+}
+
+#elif GTEST_OS_QNX
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  const int fd = open("/proc/self/as", O_RDONLY);
+  if (fd < 0) {
+    return 0;
+  }
+  procfs_info process_info;
+  const int status =
+      devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), NULL);
+  close(fd);
+  if (status == EOK) {
+    return static_cast<size_t>(process_info.num_threads);
+  } else {
+    return 0;
+  }
+}
+
+#else
+
+size_t GetThreadCount() {
+  // There's no portable way to detect the number of threads, so we just
+  // return 0 to indicate that we cannot detect it.
+  return 0;
+}
+
+#endif  // GTEST_OS_MAC
+
+#if GTEST_USES_POSIX_RE
+
+// Implements RE.  Currently only needed for death tests.
+
+RE::~RE() {
+  if (is_valid_) {
+    // regfree'ing an invalid regex might crash because the content
+    // of the regex is undefined. Since the regexes are essentially
+    // the same, one cannot be valid (or invalid) without the other
+    // being so too.
+    regfree(&partial_regex_);
+    regfree(&full_regex_);
+  }
+  free(const_cast<char*>(pattern_));
+}
+
+// Returns true iff regular expression re matches the entire str.
+bool RE::FullMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+
+  regmatch_t match;
+  return regexec(&re.full_regex_, str, 1, &match, 0) == 0;
+}
+
+// Returns true iff regular expression re matches a substring of str
+// (including str itself).
+bool RE::PartialMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+
+  regmatch_t match;
+  return regexec(&re.partial_regex_, str, 1, &match, 0) == 0;
+}
+
+// Initializes an RE from its string representation.
+void RE::Init(const char* regex) {
+  pattern_ = posix::StrDup(regex);
+
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match.
+  const size_t full_regex_len = strlen(regex) + 10;
+  char* const full_pattern = new char[full_regex_len];
+
+  snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
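+  // For example, given the regex "a.*b", full_pattern becomes "^(a.*b)$".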
+  is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
+  // We want to call regcomp(&partial_regex_, ...) even if the
+  // previous expression returns false.  Otherwise partial_regex_ may
+  // not be properly initialized and may cause trouble when it's
+  // freed.
+  //
+  // Some implementations of POSIX regex (e.g. on at least some
+  // versions of Cygwin) don't accept the empty string as a valid
+  // regex.  We change it to an equivalent form "()" to be safe.
+  if (is_valid_) {
+    const char* const partial_regex = (*regex == '\0') ? "()" : regex;
+    is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
+  }
+  EXPECT_TRUE(is_valid_)
+      << "Regular expression \"" << regex
+      << "\" is not a valid POSIX Extended regular expression.";
+
+  delete[] full_pattern;
+}
+
+#elif GTEST_USES_SIMPLE_RE
+
+// Returns true iff ch appears anywhere in str (excluding the
+// terminating '\0' character).
+bool IsInSet(char ch, const char* str) {
+  return ch != '\0' && strchr(str, ch) != NULL;
+}
+
+// Returns true iff ch belongs to the given classification.  Unlike
+// similar functions in <ctype.h>, these aren't affected by the
+// current locale.
+bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; }
+bool IsAsciiPunct(char ch) {
+  return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~");
+}
+bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
+bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
+bool IsAsciiWordChar(char ch) {
+  return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
+      ('0' <= ch && ch <= '9') || ch == '_';
+}
+
+// Returns true iff "\\c" is a supported escape sequence.
+bool IsValidEscape(char c) {
+  return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW"));
+}
+
+// Returns true iff the given atom (specified by escaped and pattern)
+// matches ch.  The result is undefined if the atom is invalid.
+bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
+  if (escaped) {  // "\\p" where p is pattern_char.
+    switch (pattern_char) {
+      case 'd': return IsAsciiDigit(ch);
+      case 'D': return !IsAsciiDigit(ch);
+      case 'f': return ch == '\f';
+      case 'n': return ch == '\n';
+      case 'r': return ch == '\r';
+      case 's': return IsAsciiWhiteSpace(ch);
+      case 'S': return !IsAsciiWhiteSpace(ch);
+      case 't': return ch == '\t';
+      case 'v': return ch == '\v';
+      case 'w': return IsAsciiWordChar(ch);
+      case 'W': return !IsAsciiWordChar(ch);
+    }
+    return IsAsciiPunct(pattern_char) && pattern_char == ch;
+  }
+
+  return (pattern_char == '.' && ch != '\n') || pattern_char == ch;
+}
+
+// Helper function used by ValidateRegex() to format error messages.
+std::string FormatRegexSyntaxError(const char* regex, int index) {
+  return (Message() << "Syntax error at index " << index
+          << " in simple regular expression \"" << regex << "\": ").GetString();
+}
+
+// Generates non-fatal failures and returns false if regex is invalid;
+// otherwise returns true.
+bool ValidateRegex(const char* regex) {
+  if (regex == NULL) {
+    // TODO(wan@google.com): fix the source file location in the
+    // assertion failures to match where the regex is used in user
+    // code.
+    ADD_FAILURE() << "NULL is not a valid simple regular expression.";
+    return false;
+  }
+
+  bool is_valid = true;
+
+  // True iff ?, *, or + can follow the previous atom.
+  bool prev_repeatable = false;
+  for (int i = 0; regex[i]; i++) {
+    if (regex[i] == '\\') {  // An escape sequence
+      i++;
+      if (regex[i] == '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "'\\' cannot appear at the end.";
+        return false;
+      }
+
+      if (!IsValidEscape(regex[i])) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "invalid escape sequence \"\\" << regex[i] << "\".";
+        is_valid = false;
+      }
+      prev_repeatable = true;
+    } else {  // Not an escape sequence.
+      const char ch = regex[i];
+
+      if (ch == '^' && i > 0) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'^' can only appear at the beginning.";
+        is_valid = false;
+      } else if (ch == '$' && regex[i + 1] != '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'$' can only appear at the end.";
+        is_valid = false;
+      } else if (IsInSet(ch, "()[]{}|")) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'" << ch << "' is unsupported.";
+        is_valid = false;
+      } else if (IsRepeat(ch) && !prev_repeatable) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'" << ch << "' can only follow a repeatable token.";
+        is_valid = false;
+      }
+
+      prev_repeatable = !IsInSet(ch, "^$?*+");
+    }
+  }
+
+  return is_valid;
+}
+
+// Matches a repeated regex atom followed by a valid simple regular
+// expression.  The regex atom is defined as c if escaped is false,
+// or \c otherwise.  repeat is the repetition meta character (?, *,
+// or +).  The behavior is undefined if str contains too many
+// characters to be indexable by size_t, in which case the test will
+// probably time out anyway.  We are fine with this limitation as
+// std::string has it too.
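+// For example, matching the atom 'a' with the repeat character '+' and the
+// trailing regex "b" against "aaab" succeeds once the three leading 'a'
+// characters have been consumed.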
+bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char c, char repeat, const char* regex,
+    const char* str) {
+  const size_t min_count = (repeat == '+') ? 1 : 0;
+  const size_t max_count = (repeat == '?') ? 1 :
+      static_cast<size_t>(-1) - 1;
+  // We cannot call numeric_limits::max() as it conflicts with the
+  // max() macro on Windows.
+
+  for (size_t i = 0; i <= max_count; ++i) {
+    // We know that the atom matches each of the first i characters in str.
+    if (i >= min_count && MatchRegexAtHead(regex, str + i)) {
+      // We have enough matches at the head, and the tail matches too.
+      // Since we only care about *whether* the pattern matches str
+      // (as opposed to *how* it matches), there is no need to find a
+      // greedy match.
+      return true;
+    }
+    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
+      return false;
+  }
+  return false;
+}
+
+// Returns true iff regex matches a prefix of str.  regex must be a
+// valid simple regular expression and not start with "^", or the
+// result is undefined.
+bool MatchRegexAtHead(const char* regex, const char* str) {
+  if (*regex == '\0')  // An empty regex matches a prefix of anything.
+    return true;
+
+  // "$" only matches the end of a string.  Note that regex being
+  // valid guarantees that there's nothing after "$" in it.
+  if (*regex == '$')
+    return *str == '\0';
+
+  // Is the first thing in regex an escape sequence?
+  const bool escaped = *regex == '\\';
+  if (escaped)
+    ++regex;
+  if (IsRepeat(regex[1])) {
+    // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
+    // here's an indirect recursion.  It terminates as the regex gets
+    // shorter in each recursion.
+    return MatchRepetitionAndRegexAtHead(
+        escaped, regex[0], regex[1], regex + 2, str);
+  } else {
+    // regex isn't empty, isn't "$", and doesn't start with a
+    // repetition.  We match the first atom of regex with the first
+    // character of str and recurse.
+    return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
+        MatchRegexAtHead(regex + 1, str + 1);
+  }
+}
+
+// Returns true iff regex matches any substring of str.  regex must be
+// a valid simple regular expression, or the result is undefined.
+//
+// The algorithm is recursive, but the recursion depth doesn't exceed
+// the regex length, so we won't need to worry about running out of
+// stack space normally.  In rare cases the time complexity can be
+// exponential with respect to the regex length + the string length,
+// but usually it's much faster (often close to linear).
+bool MatchRegexAnywhere(const char* regex, const char* str) {
+  if (regex == NULL || str == NULL)
+    return false;
+
+  if (*regex == '^')
+    return MatchRegexAtHead(regex + 1, str);
+
+  // A successful match can be anywhere in str.
+  do {
+    if (MatchRegexAtHead(regex, str))
+      return true;
+  } while (*str++ != '\0');
+  return false;
+}
+
+// Implements the RE class.
+
+RE::~RE() {
+  free(const_cast<char*>(pattern_));
+  free(const_cast<char*>(full_pattern_));
+}
+
+// Returns true iff regular expression re matches the entire str.
+bool RE::FullMatch(const char* str, const RE& re) {
+  return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
+}
+
+// Returns true iff regular expression re matches a substring of str
+// (including str itself).
+bool RE::PartialMatch(const char* str, const RE& re) {
+  return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
+}
+
+// Initializes an RE from its string representation.
+void RE::Init(const char* regex) {
+  pattern_ = full_pattern_ = NULL;
+  if (regex != NULL) {
+    pattern_ = posix::StrDup(regex);
+  }
+
+  is_valid_ = ValidateRegex(regex);
+  if (!is_valid_) {
+    // No need to calculate the full pattern when the regex is invalid.
+    return;
+  }
+
+  const size_t len = strlen(regex);
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match: we need space to prepend a '^', append a '$', and
+  // terminate the string with '\0'.
+  char* buffer = static_cast<char*>(malloc(len + 3));
+  full_pattern_ = buffer;
+
+  if (*regex != '^')
+    *buffer++ = '^';  // Makes sure full_pattern_ starts with '^'.
+
+  // We don't use snprintf or strncpy, as they trigger a warning when
+  // compiled with VC++ 8.0.
+  memcpy(buffer, regex, len);
+  buffer += len;
+
+  if (len == 0 || regex[len - 1] != '$')
+    *buffer++ = '$';  // Makes sure full_pattern_ ends with '$'.
+
+  *buffer = '\0';
+}
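+
+// For instance, Init("abc") would leave pattern_ == "abc" and
+// full_pattern_ == "^abc$", while Init("^abc$") would keep full_pattern_ as
+// "^abc$" since the anchors are already present.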
+
+#endif  // GTEST_USES_POSIX_RE
+
+const char kUnknownFile[] = "unknown file";
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
+  const std::string file_name(file == NULL ? kUnknownFile : file);
+
+  if (line < 0) {
+    return file_name + ":";
+  }
+#ifdef _MSC_VER
+  return file_name + "(" + StreamableToString(line) + "):";
+#else
+  return file_name + ":" + StreamableToString(line) + ":";
+#endif  // _MSC_VER
+}
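+
+// For example, FormatFileLocation("foo.cc", 42) would produce "foo.cc(42):"
+// when compiled with MSVC and "foo.cc:42:" elsewhere; a NULL file name is
+// reported as "unknown file", and a negative line number drops the line part
+// ("foo.cc:").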
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+// Note that FormatCompilerIndependentFileLocation() does NOT append colon
+// to the file location it produces, unlike FormatFileLocation().
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
+    const char* file, int line) {
+  const std::string file_name(file == NULL ? kUnknownFile : file);
+
+  if (line < 0)
+    return file_name;
+  else
+    return file_name + ":" + StreamableToString(line);
+}
+
+
+GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
+    : severity_(severity) {
+  const char* const marker =
+      severity == GTEST_INFO ?    "[  INFO ]" :
+      severity == GTEST_WARNING ? "[WARNING]" :
+      severity == GTEST_ERROR ?   "[ ERROR ]" : "[ FATAL ]";
+  GetStream() << ::std::endl << marker << " "
+              << FormatFileLocation(file, line).c_str() << ": ";
+}
+
+// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+GTestLog::~GTestLog() {
+  GetStream() << ::std::endl;
+  if (severity_ == GTEST_FATAL) {
+    fflush(stderr);
+    posix::Abort();
+  }
+}
+// Disable Microsoft deprecation warnings for POSIX functions called from
+// this class (creat, dup, dup2, and close)
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable: 4996)
+#endif  // _MSC_VER
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Object that captures an output stream (stdout/stderr).
+class CapturedStream {
+ public:
+  // The ctor redirects the stream to a temporary file.
+  explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
+# if GTEST_OS_WINDOWS
+    char temp_dir_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+    char temp_file_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+
+    ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
+    const UINT success = ::GetTempFileNameA(temp_dir_path,
+                                            "gtest_redir",
+                                            0,  // Generate unique file name.
+                                            temp_file_path);
+    GTEST_CHECK_(success != 0)
+        << "Unable to create a temporary file in " << temp_dir_path;
+    const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
+    GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
+                                    << temp_file_path;
+    filename_ = temp_file_path;
+# else
+    // There's no guarantee that a test has write access to the current
+    // directory, so we create the temporary file in the /tmp directory
+    // instead. We use /tmp on most systems, and /sdcard on Android.
+    // That's because Android doesn't have /tmp.
+#  if GTEST_OS_LINUX_ANDROID
+    // Note: Android applications are expected to call the framework's
+    // Context.getExternalStorageDirectory() method through JNI to get
+    // the location of the world-writable SD Card directory. However,
+    // this requires a Context handle, which cannot be retrieved
+    // globally from native code. Doing so also precludes running the
+    // code as part of a regular standalone executable, which doesn't
+    // run in a Dalvik process (e.g. when running it through 'adb shell').
+    //
+    // The location /sdcard is directly accessible from native code
+    // and is the only location (unofficially) supported by the Android
+    // team. It's generally a symlink to the real SD Card mount point
+    // which can be /mnt/sdcard, /mnt/sdcard0, /system/media/sdcard, or
+    // other OEM-customized locations. Never rely on these, and always
+    // use /sdcard.
+    char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX";
+#  else
+    char name_template[] = "/tmp/captured_stream.XXXXXX";
+#  endif  // GTEST_OS_LINUX_ANDROID
+    const int captured_fd = mkstemp(name_template);
+    filename_ = name_template;
+# endif  // GTEST_OS_WINDOWS
+    fflush(NULL);
+    dup2(captured_fd, fd_);
+    close(captured_fd);
+  }
+
+  ~CapturedStream() {
+    remove(filename_.c_str());
+  }
+
+  std::string GetCapturedString() {
+    if (uncaptured_fd_ != -1) {
+      // Restores the original stream.
+      fflush(NULL);
+      dup2(uncaptured_fd_, fd_);
+      close(uncaptured_fd_);
+      uncaptured_fd_ = -1;
+    }
+
+    FILE* const file = posix::FOpen(filename_.c_str(), "r");
+    const std::string content = ReadEntireFile(file);
+    posix::FClose(file);
+    return content;
+  }
+
+ private:
+  // Reads the entire content of a file as an std::string.
+  static std::string ReadEntireFile(FILE* file);
+
+  // Returns the size (in bytes) of a file.
+  static size_t GetFileSize(FILE* file);
+
+  const int fd_;  // A stream to capture.
+  int uncaptured_fd_;
+  // Name of the temporary file holding the stderr output.
+  ::std::string filename_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
+};
+
+// Returns the size (in bytes) of a file.
+size_t CapturedStream::GetFileSize(FILE* file) {
+  fseek(file, 0, SEEK_END);
+  return static_cast<size_t>(ftell(file));
+}
+
+// Reads the entire content of a file as a string.
+std::string CapturedStream::ReadEntireFile(FILE* file) {
+  const size_t file_size = GetFileSize(file);
+  char* const buffer = new char[file_size];
+
+  size_t bytes_last_read = 0;  // # of bytes read in the last fread()
+  size_t bytes_read = 0;       // # of bytes read so far
+
+  fseek(file, 0, SEEK_SET);
+
+  // Keeps reading the file until we cannot read further or the
+  // pre-determined file size is reached.
+  do {
+    bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file);
+    bytes_read += bytes_last_read;
+  } while (bytes_last_read > 0 && bytes_read < file_size);
+
+  const std::string content(buffer, bytes_read);
+  delete[] buffer;
+
+  return content;
+}
+
+# ifdef _MSC_VER
+#  pragma warning(pop)
+# endif  // _MSC_VER
+
+static CapturedStream* g_captured_stderr = NULL;
+static CapturedStream* g_captured_stdout = NULL;
+
+// Starts capturing an output stream (stdout/stderr).
+void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) {
+  if (*stream != NULL) {
+    GTEST_LOG_(FATAL) << "Only one " << stream_name
+                      << " capturer can exist at a time.";
+  }
+  *stream = new CapturedStream(fd);
+}
+
+// Stops capturing the output stream and returns the captured string.
+std::string GetCapturedStream(CapturedStream** captured_stream) {
+  const std::string content = (*captured_stream)->GetCapturedString();
+
+  delete *captured_stream;
+  *captured_stream = NULL;
+
+  return content;
+}
+
+// Starts capturing stdout.
+void CaptureStdout() {
+  CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
+}
+
+// Starts capturing stderr.
+void CaptureStderr() {
+  CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr);
+}
+
+// Stops capturing stdout and returns the captured string.
+std::string GetCapturedStdout() {
+  return GetCapturedStream(&g_captured_stdout);
+}
+
+// Stops capturing stderr and returns the captured string.
+std::string GetCapturedStderr() {
+  return GetCapturedStream(&g_captured_stderr);
+}
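+
+// Typical usage of the capture API above (sketch):
+//   CaptureStdout();
+//   printf("hello\n");
+//   const std::string out = GetCapturedStdout();  // out == "hello\n"
+// Captures of the same stream cannot be nested: starting a second capture
+// before stopping the first one triggers the fatal log in CaptureStream().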
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+#if GTEST_HAS_DEATH_TEST
+
+// A copy of all command line arguments.  Set by InitGoogleTest().
+::std::vector<testing::internal::string> g_argvs;
+
+static const ::std::vector<testing::internal::string>* g_injected_test_argvs =
+                                        NULL;  // Owned.
+
+void SetInjectableArgvs(const ::std::vector<testing::internal::string>* argvs) {
+  if (g_injected_test_argvs != argvs)
+    delete g_injected_test_argvs;
+  g_injected_test_argvs = argvs;
+}
+
+const ::std::vector<testing::internal::string>& GetInjectableArgvs() {
+  if (g_injected_test_argvs != NULL) {
+    return *g_injected_test_argvs;
+  }
+  return g_argvs;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+#if GTEST_OS_WINDOWS_MOBILE
+namespace posix {
+void Abort() {
+  DebugBreak();
+  TerminateProcess(GetCurrentProcess(), 1);
+}
+}  // namespace posix
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Returns the name of the environment variable corresponding to the
+// given flag.  For example, FlagToEnvVar("foo") will return
+// "GTEST_FOO" in the open-source version.
+static std::string FlagToEnvVar(const char* flag) {
+  const std::string full_flag =
+      (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
+
+  Message env_var;
+  for (size_t i = 0; i != full_flag.length(); i++) {
+    env_var << ToUpper(full_flag.c_str()[i]);
+  }
+
+  return env_var.GetString();
+}
+
+// Parses 'str' for a 32-bit signed integer.  If successful, writes
+// the result to *value and returns true; otherwise leaves *value
+// unchanged and returns false.
+bool ParseInt32(const Message& src_text, const char* str, Int32* value) {
+  // Parses the environment variable as a decimal integer.
+  char* end = NULL;
+  const long long_value = strtol(str, &end, 10);  // NOLINT
+
+  // Has strtol() consumed all characters in the string?
+  if (*end != '\0') {
+    // No - an invalid character was encountered.
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value \"" << str << "\".\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  // Is the parsed value in the range of an Int32?
+  const Int32 result = static_cast<Int32>(long_value);
+  if (long_value == LONG_MAX || long_value == LONG_MIN ||
+      // The parsed value overflows as a long.  (strtol() returns
+      // LONG_MAX or LONG_MIN when the input overflows.)
+      result != long_value
+      // The parsed value overflows as an Int32.
+      ) {
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value " << str << ", which overflows.\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  *value = result;
+  return true;
+}
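+
+// A few illustrative calls (sketch; src_text only appears in the warning):
+//   Int32 v = 0;
+//   ParseInt32(Message() << "flag foo", "123", &v);         // true, v == 123
+//   ParseInt32(Message() << "flag foo", "12abc", &v);       // false (junk)
+//   ParseInt32(Message() << "flag foo", "9999999999", &v);  // false (overflow)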
+
+// Reads and returns the Boolean environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+//
+// The value is considered true iff it's not "0".
+bool BoolFromGTestEnv(const char* flag, bool default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const string_value = posix::GetEnv(env_var.c_str());
+  return string_value == NULL ?
+      default_value : strcmp(string_value, "0") != 0;
+}
+
+// Reads and returns a 32-bit integer stored in the environment
+// variable corresponding to the given flag; if it isn't set or
+// doesn't represent a valid 32-bit integer, returns default_value.
+Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const string_value = posix::GetEnv(env_var.c_str());
+  if (string_value == NULL) {
+    // The environment variable is not set.
+    return default_value;
+  }
+
+  Int32 result = default_value;
+  if (!ParseInt32(Message() << "Environment variable " << env_var,
+                  string_value, &result)) {
+    printf("The default value %s is used.\n",
+           (Message() << default_value).GetString().c_str());
+    fflush(stdout);
+    return default_value;
+  }
+
+  return result;
+}
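+
+// Putting the helpers above together (sketch, reusing the "foo" example from
+// FlagToEnvVar): BoolFromGTestEnv("foo", true) consults GTEST_FOO and treats
+// any value other than "0" as true, while Int32FromGTestEnv("foo", 1) falls
+// back to the default (and prints a note) when GTEST_FOO is unset or is not a
+// valid 32-bit integer.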
+
+// Reads and returns the string environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+const char* StringFromGTestEnv(const char* flag, const char* default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value = posix::GetEnv(env_var.c_str());
+  return value == NULL ? default_value : value;
+}
+
+}  // namespace internal
+}  // namespace testing
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Google Test - The Google C++ Testing Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// It uses the << operator when possible, and prints the bytes in the
+// object otherwise.  A user can override its behavior for a class
+// type Foo by defining either operator<<(::std::ostream&, const Foo&)
+// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that
+// defines Foo.
+
+#include <ctype.h>
+#include <stdio.h>
+#include <ostream>  // NOLINT
+#include <string>
+
+namespace testing {
+
+namespace {
+
+using ::std::ostream;
+
+// Prints a segment of bytes in the given object.
+void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start,
+                                size_t count, ostream* os) {
+  char text[5] = "";
+  for (size_t i = 0; i != count; i++) {
+    const size_t j = start + i;
+    if (i != 0) {
+      // Organizes the bytes into groups of 2 for easy parsing by
+      // humans.
+      if ((j % 2) == 0)
+        *os << ' ';
+      else
+        *os << '-';
+    }
+    GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]);
+    *os << text;
+  }
+}
+
+// Prints the bytes in the given value to the given ostream.
+void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
+                              ostream* os) {
+  // Tells the user how big the object is.
+  *os << count << "-byte object <";
+
+  const size_t kThreshold = 132;
+  const size_t kChunkSize = 64;
+  // If the object size is bigger than kThreshold, we'll have to omit
+  // some details by printing only the first and the last kChunkSize
+  // bytes.
+  // TODO(wan): let the user control the threshold using a flag.
+  if (count < kThreshold) {
+    PrintByteSegmentInObjectTo(obj_bytes, 0, count, os);
+  } else {
+    PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
+    *os << " ... ";
+    // Rounds up to 2-byte boundary.
+    const size_t resume_pos = (count - kChunkSize + 1)/2*2;
+    PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os);
+  }
+  *os << ">";
+}
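+
+// For instance, a 4-byte object with bytes 0x01 0x02 0x03 0x04 would be
+// printed as "4-byte object <01-02 03-04>"; an object of kThreshold bytes or
+// more shows only the first kChunkSize bytes and (roughly) the last kChunkSize
+// bytes, separated by " ... ".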
+
+}  // namespace
+
+namespace internal2 {
+
+// Delegates to PrintBytesInObjectToImpl() to print the bytes in the
+// given object.  The delegation simplifies the implementation, which
+// uses the << operator and thus is easier done outside of the
+// ::testing::internal namespace, which contains a << operator that
+// sometimes conflicts with the one in STL.
+void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
+                          ostream* os) {
+  PrintBytesInObjectToImpl(obj_bytes, count, os);
+}
+
+}  // namespace internal2
+
+namespace internal {
+
+// Depending on the value of a char (or wchar_t), we print it in one
+// of three formats:
+//   - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
+//   - as a hexadecimal escape sequence (e.g. '\x7F'), or
+//   - as a special escape sequence (e.g. '\r', '\n').
+enum CharFormat {
+  kAsIs,
+  kHexEscape,
+  kSpecialEscape
+};
+
+// Returns true if c is a printable ASCII character.  We test the
+// value of c directly instead of calling isprint(), which is buggy on
+// Windows Mobile.
+inline bool IsPrintableAscii(wchar_t c) {
+  return 0x20 <= c && c <= 0x7E;
+}
+
+// Prints a wide or narrow char c as a character literal without the
+// quotes, escaping it when necessary; returns how c was formatted.
+// The template argument UnsignedChar is the unsigned version of Char,
+// which is the type of c.
+template <typename UnsignedChar, typename Char>
+static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
+  switch (static_cast<wchar_t>(c)) {
+    case L'\0':
+      *os << "\\0";
+      break;
+    case L'\'':
+      *os << "\\'";
+      break;
+    case L'\\':
+      *os << "\\\\";
+      break;
+    case L'\a':
+      *os << "\\a";
+      break;
+    case L'\b':
+      *os << "\\b";
+      break;
+    case L'\f':
+      *os << "\\f";
+      break;
+    case L'\n':
+      *os << "\\n";
+      break;
+    case L'\r':
+      *os << "\\r";
+      break;
+    case L'\t':
+      *os << "\\t";
+      break;
+    case L'\v':
+      *os << "\\v";
+      break;
+    default:
+      if (IsPrintableAscii(c)) {
+        *os << static_cast<char>(c);
+        return kAsIs;
+      } else {
+        *os << "\\x" + String::FormatHexInt(static_cast<UnsignedChar>(c));
+        return kHexEscape;
+      }
+  }
+  return kSpecialEscape;
+}
+
+// Prints a wchar_t c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+  switch (c) {
+    case L'\'':
+      *os << "'";
+      return kAsIs;
+    case L'"':
+      *os << "\\\"";
+      return kSpecialEscape;
+    default:
+      return PrintAsCharLiteralTo<wchar_t>(c, os);
+  }
+}
+
+// Prints a char c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
+  return PrintAsStringLiteralTo(
+      static_cast<wchar_t>(static_cast<unsigned char>(c)), os);
+}
+
+// Prints a wide or narrow character c and its code.  '\0' is printed
+// as "'\\0'", other unprintable characters are also properly escaped
+// using the standard C++ escape sequence.  The template argument
+// UnsignedChar is the unsigned version of Char, which is the type of c.
+template <typename UnsignedChar, typename Char>
+void PrintCharAndCodeTo(Char c, ostream* os) {
+  // First, print c as a literal in the most readable form we can find.
+  *os << ((sizeof(c) > 1) ? "L'" : "'");
+  const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os);
+  *os << "'";
+
+  // To aid user debugging, we also print c's code in decimal, unless
+  // it's 0 (in which case c was printed as '\\0', making the code
+  // obvious).
+  if (c == 0)
+    return;
+  *os << " (" << static_cast<int>(c);
+
+  // For more convenience, we print c's code again in hexadecimal,
+  // unless c was already printed in the form '\x##' or the code is in
+  // [1, 9].
+  if (format == kHexEscape || (1 <= c && c <= 9)) {
+    // Do nothing.
+  } else {
+    *os << ", 0x" << String::FormatHexInt(static_cast<UnsignedChar>(c));
+  }
+  *os << ")";
+}
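+
+// Sample results of the scheme above (illustrative):
+//   char 'a'      ->  'a' (97, 0x61)
+//   char '\n'     ->  '\n' (10, 0xA)
+//   char '\0'     ->  '\0'            (code omitted, as noted above)
+//   wchar_t L'x'  ->  L'x' (120, 0x78)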
+
+void PrintTo(unsigned char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);
+}
+void PrintTo(signed char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);
+}
+
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
+void PrintTo(wchar_t wc, ostream* os) {
+  PrintCharAndCodeTo<wchar_t>(wc, os);
+}
+
+// Prints the given array of characters to the ostream.  CharType must be either
+// char or wchar_t.
+// The array starts at begin, the length is len, it may include '\0' characters
+// and may not be NUL-terminated.
+template <typename CharType>
+static void PrintCharsAsStringTo(
+    const CharType* begin, size_t len, ostream* os) {
+  const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\"";
+  *os << kQuoteBegin;
+  bool is_previous_hex = false;
+  for (size_t index = 0; index < len; ++index) {
+    const CharType cur = begin[index];
+    if (is_previous_hex && IsXDigit(cur)) {
+      // Previous character is of '\x..' form and this character can be
+      // interpreted as another hexadecimal digit in its number. Break string to
+      // disambiguate.
+      *os << "\" " << kQuoteBegin;
+    }
+    is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape;
+  }
+  *os << "\"";
+}
+
+// Prints a (const) char/wchar_t array of 'len' elements, starting at address
+// 'begin'.  CharType must be either char or wchar_t.
+template <typename CharType>
+static void UniversalPrintCharArray(
+    const CharType* begin, size_t len, ostream* os) {
+  // The code
+  //   const char kFoo[] = "foo";
+  // generates an array of 4, not 3, elements, with the last one being '\0'.
+  //
+  // Therefore when printing a char array, we don't print the last element if
+  // it's '\0', such that the output matches the string literal as it's
+  // written in the source code.
+  if (len > 0 && begin[len - 1] == '\0') {
+    PrintCharsAsStringTo(begin, len - 1, os);
+    return;
+  }
+
+  // If, however, the last element in the array is not '\0', e.g.
+  //    const char kFoo[] = { 'f', 'o', 'o' };
+  // we must print the entire array.  We also print a message to indicate
+  // that the array is not NUL-terminated.
+  PrintCharsAsStringTo(begin, len, os);
+  *os << " (no terminating NUL)";
+}
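+
+// So, for example (illustrative):
+//   const char kFoo[] = "foo";              // printed as "foo"
+//   const char kBar[] = { 'b', 'a', 'r' };  // printed as "bar" (no terminating NUL)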
+
+// Prints a (const) char array of 'len' elements, starting at address 'begin'.
+void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints a (const) wchar_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints the given C string to the ostream.
+void PrintTo(const char* s, ostream* os) {
+  if (s == NULL) {
+    *os << "NULL";
+  } else {
+    *os << ImplicitCast_<const void*>(s) << " pointing to ";
+    PrintCharsAsStringTo(s, strlen(s), os);
+  }
+}
+
+// The MSVC compiler can be configured to define wchar_t as a typedef
+// of unsigned short. Defining an overload for const wchar_t* in that case
+// would cause pointers to unsigned shorts to be printed as wide strings,
+// possibly accessing more memory than intended and causing invalid
+// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when
+// wchar_t is implemented as a native type.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Prints the given wide C string to the ostream.
+void PrintTo(const wchar_t* s, ostream* os) {
+  if (s == NULL) {
+    *os << "NULL";
+  } else {
+    *os << ImplicitCast_<const void*>(s) << " pointing to ";
+    PrintCharsAsStringTo(s, wcslen(s), os);
+  }
+}
+#endif  // wchar_t is native
+
+// Prints a ::string object.
+#if GTEST_HAS_GLOBAL_STRING
+void PrintStringTo(const ::string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+void PrintStringTo(const ::std::string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+
+// Prints a ::wstring object.
+#if GTEST_HAS_GLOBAL_WSTRING
+void PrintWideStringTo(const ::wstring& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+#if GTEST_HAS_STD_WSTRING
+void PrintWideStringTo(const ::std::wstring& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+}  // namespace internal
+
+}  // namespace testing
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule@google.com (Markus Heule)
+//
+// The Google C++ Testing Framework (Google Test)
+
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+
+using internal::GetUnitTestImpl;
+
+// Gets the summary of the failure message by omitting the stack trace
+// in it.
+std::string TestPartResult::ExtractSummary(const char* message) {
+  const char* const stack_trace = strstr(message, internal::kStackTraceMarker);
+  return stack_trace == NULL ? message :
+      std::string(message, stack_trace);
+}
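+
+// That is, a message of the form "<summary>" + kStackTraceMarker + "<trace>"
+// yields just "<summary>"; a message without the marker is returned unchanged.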
+
+// Prints a TestPartResult object.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
+  return os
+      << result.file_name() << ":" << result.line_number() << ": "
+      << (result.type() == TestPartResult::kSuccess ? "Success" :
+          result.type() == TestPartResult::kFatalFailure ? "Fatal failure" :
+          "Non-fatal failure") << ":\n"
+      << result.message() << std::endl;
+}
+
+// Appends a TestPartResult to the array.
+void TestPartResultArray::Append(const TestPartResult& result) {
+  array_.push_back(result);
+}
+
+// Returns the TestPartResult at the given index (0-based).
+const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const {
+  if (index < 0 || index >= size()) {
+    printf("\nInvalid index (%d) into TestPartResultArray.\n", index);
+    internal::posix::Abort();
+  }
+
+  return array_[index];
+}
+
+// Returns the number of TestPartResult objects in the array.
+int TestPartResultArray::size() const {
+  return static_cast<int>(array_.size());
+}
+
+namespace internal {
+
+HasNewFatalFailureHelper::HasNewFatalFailureHelper()
+    : has_new_fatal_failure_(false),
+      original_reporter_(GetUnitTestImpl()->
+                         GetTestPartResultReporterForCurrentThread()) {
+  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
+}
+
+HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
+  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(
+      original_reporter_);
+}
+
+void HasNewFatalFailureHelper::ReportTestPartResult(
+    const TestPartResult& result) {
+  if (result.fatally_failed())
+    has_new_fatal_failure_ = true;
+  original_reporter_->ReportTestPartResult(result);
+}
+
+}  // namespace internal
+
+}  // namespace testing
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+
+namespace testing {
+namespace internal {
+
+#if GTEST_HAS_TYPED_TEST_P
+
+// Skips to the first non-space char in str. Returns an empty string if str
+// contains only whitespace characters.
+static const char* SkipSpaces(const char* str) {
+  while (IsSpace(*str))
+    str++;
+  return str;
+}
+
+// Verifies that registered_tests match the test names in
+// defined_test_names_; returns registered_tests if successful, or
+// aborts the program otherwise.
+const char* TypedTestCasePState::VerifyRegisteredTestNames(
+    const char* file, int line, const char* registered_tests) {
+  typedef ::std::set<const char*>::const_iterator DefinedTestIter;
+  registered_ = true;
+
+  // Skip initial whitespace in registered_tests since some
+  // preprocessors prefix stringified literals with whitespace.
+  registered_tests = SkipSpaces(registered_tests);
+
+  Message errors;
+  ::std::set<std::string> tests;
+  for (const char* names = registered_tests; names != NULL;
+       names = SkipComma(names)) {
+    const std::string name = GetPrefixUntilComma(names);
+    if (tests.count(name) != 0) {
+      errors << "Test " << name << " is listed more than once.\n";
+      continue;
+    }
+
+    bool found = false;
+    for (DefinedTestIter it = defined_test_names_.begin();
+         it != defined_test_names_.end();
+         ++it) {
+      if (name == *it) {
+        found = true;
+        break;
+      }
+    }
+
+    if (found) {
+      tests.insert(name);
+    } else {
+      errors << "No test named " << name
+             << " can be found in this test case.\n";
+    }
+  }
+
+  for (DefinedTestIter it = defined_test_names_.begin();
+       it != defined_test_names_.end();
+       ++it) {
+    if (tests.count(*it) == 0) {
+      errors << "You forgot to list test " << *it << ".\n";
+    }
+  }
+
+  const std::string& errors_str = errors.GetString();
+  if (errors_str != "") {
+    fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
+            errors_str.c_str());
+    fflush(stderr);
+    posix::Abort();
+  }
+
+  return registered_tests;
+}
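+
+// For example, if the test case defines tests A and B but the registration
+// macro lists "A, A, C", the errors would report that A is listed more than
+// once, that no test named C exists, and that B was forgotten, and then the
+// program aborts.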
+
+#endif  // GTEST_HAS_TYPED_TEST_P
+
+}  // namespace internal
+}  // namespace testing
diff --git a/packages/kokkos/tpls/gtest/gtest/gtest-test-part.h b/packages/kokkos/tpls/gtest/gtest/gtest-test-part.h
new file mode 120000
index 0000000000000000000000000000000000000000..48d39090f1cabfc4a852d54e0e1f186362eeb1f5
--- /dev/null
+++ b/packages/kokkos/tpls/gtest/gtest/gtest-test-part.h
@@ -0,0 +1 @@
+gtest.h
\ No newline at end of file
diff --git a/packages/kokkos/tpls/gtest/gtest/gtest.h b/packages/kokkos/tpls/gtest/gtest/gtest.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f98f330ed8d5df6400e0e654d01ed897ff65093
--- /dev/null
+++ b/packages/kokkos/tpls/gtest/gtest/gtest.h
@@ -0,0 +1,20065 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the public API for Google Test.  It should be
+// included by any test program that uses Google Test.
+//
+// IMPORTANT NOTE: Due to a limitation of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+//   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
+// program!
+//
+// Acknowledgment: Google Test borrowed the idea of automatic test
+// registration from Barthelemy Dagenais' (barthelemy@prologique.com)
+// easyUnit framework.
+
+#ifdef __GNUC__
+#pragma GCC system_header
+#endif
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_H_
+
+#include <limits>
+#include <ostream>
+#include <vector>
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file declares functions and macros used internally by
+// Google Test.  They are subject to change without notice.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan)
+//
+// Low-level types and utilities for porting Google Test to various
+// platforms.  They are subject to change without notice.  DO NOT USE
+// THEM IN USER CODE.
+//
+// This file is fundamental to Google Test.  All other Google Test source
+// files are expected to #include this.  Therefore, it cannot #include
+// any other Google Test header.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+
+// The user can define the following macros in the build script to
+// control Google Test's behavior.  If the user doesn't define a macro
+// in this list, Google Test will define it.
+//
+//   GTEST_HAS_CLONE          - Define it to 1/0 to indicate that clone(2)
+//                              is/isn't available.
+//   GTEST_HAS_EXCEPTIONS     - Define it to 1/0 to indicate that exceptions
+//                              are enabled.
+//   GTEST_HAS_GLOBAL_STRING  - Define it to 1/0 to indicate that ::string
+//                              is/isn't available (some systems define
+//                              ::string, which is different to std::string).
+//   GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::wstring
+//                              is/isn't available (some systems define
+//                              ::wstring, which is different to std::wstring).
+//   GTEST_HAS_POSIX_RE       - Define it to 1/0 to indicate that POSIX regular
+//                              expressions are/aren't available.
+//   GTEST_HAS_PTHREAD        - Define it to 1/0 to indicate that <pthread.h>
+//                              is/isn't available.
+//   GTEST_HAS_RTTI           - Define it to 1/0 to indicate that RTTI is/isn't
+//                              enabled.
+//   GTEST_HAS_STD_WSTRING    - Define it to 1/0 to indicate that
+//                              std::wstring does/doesn't work (Google Test can
+//                              be used where std::wstring is unavailable).
+//   GTEST_HAS_TR1_TUPLE      - Define it to 1/0 to indicate tr1::tuple
+//                              is/isn't available.
+//   GTEST_HAS_SEH            - Define it to 1/0 to indicate whether the
+//                              compiler supports Microsoft's "Structured
+//                              Exception Handling".
+//   GTEST_HAS_STREAM_REDIRECTION
+//                            - Define it to 1/0 to indicate whether the
+//                              platform supports I/O stream redirection using
+//                              dup() and dup2().
+//   GTEST_USE_OWN_TR1_TUPLE  - Define it to 1/0 to indicate whether Google
+//                              Test's own tr1 tuple implementation should be
+//                              used.  Unused when the user sets
+//                              GTEST_HAS_TR1_TUPLE to 0.
+//   GTEST_LANG_CXX11         - Define it to 1/0 to indicate that Google Test
+//                              is building in C++11/C++98 mode.
+//   GTEST_LINKED_AS_SHARED_LIBRARY
+//                            - Define to 1 when compiling tests that use
+//                              Google Test as a shared library (known as
+//                              DLL on Windows).
+//   GTEST_CREATE_SHARED_LIBRARY
+//                            - Define to 1 when compiling Google Test itself
+//                              as a shared library.
+
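+// For example (sketch; the exact mechanism depends on the build system), a
+// build script could pass -DGTEST_HAS_PTHREAD=0 or -DGTEST_HAS_EXCEPTIONS=0
+// on the compiler command line to override the corresponding detection logic
+// below.
+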
+// This header defines the following utilities:
+//
+// Macros indicating the current platform (defined to 1 if compiled on
+// the given platform; otherwise undefined):
+//   GTEST_OS_AIX      - IBM AIX
+//   GTEST_OS_CYGWIN   - Cygwin
+//   GTEST_OS_HPUX     - HP-UX
+//   GTEST_OS_LINUX    - Linux
+//     GTEST_OS_LINUX_ANDROID - Google Android
+//   GTEST_OS_MAC      - Mac OS X
+//     GTEST_OS_IOS    - iOS
+//       GTEST_OS_IOS_SIMULATOR - iOS simulator
+//   GTEST_OS_NACL     - Google Native Client (NaCl)
+//   GTEST_OS_OPENBSD  - OpenBSD
+//   GTEST_OS_QNX      - QNX
+//   GTEST_OS_SOLARIS  - Sun Solaris
+//   GTEST_OS_SYMBIAN  - Symbian
+//   GTEST_OS_WINDOWS  - Windows (Desktop, MinGW, or Mobile)
+//     GTEST_OS_WINDOWS_DESKTOP  - Windows Desktop
+//     GTEST_OS_WINDOWS_MINGW    - MinGW
+//     GTEST_OS_WINDOWS_MOBILE   - Windows Mobile
+//   GTEST_OS_ZOS      - z/OS
+//
+// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the
+// most stable support.  Since core members of the Google Test project
+// don't have access to other platforms, support for them may be less
+// stable.  If you notice any problems on your platform, please notify
+// googletestframework@googlegroups.com (patches for fixing them are
+// even more welcome!).
+//
+// Note that it is possible that none of the GTEST_OS_* macros are defined.
+//
+// Macros indicating available Google Test features (defined to 1 if
+// the corresponding feature is supported; otherwise undefined):
+//   GTEST_HAS_COMBINE      - the Combine() function (for value-parameterized
+//                            tests)
+//   GTEST_HAS_DEATH_TEST   - death tests
+//   GTEST_HAS_PARAM_TEST   - value-parameterized tests
+//   GTEST_HAS_TYPED_TEST   - typed tests
+//   GTEST_HAS_TYPED_TEST_P - type-parameterized tests
+//   GTEST_USES_POSIX_RE    - enhanced POSIX regex is used. Do not confuse with
+//                            GTEST_HAS_POSIX_RE (see above) which users can
+//                            define themselves.
+//   GTEST_USES_SIMPLE_RE   - our own simple regex is used;
+//                            the above two are mutually exclusive.
+//   GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ().
+//
+// Macros for basic C++ coding:
+//   GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
+//   GTEST_ATTRIBUTE_UNUSED_  - declares that a class' instances or a
+//                              variable don't have to be used.
+//   GTEST_DISALLOW_ASSIGN_   - disables operator=.
+//   GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
+//   GTEST_MUST_USE_RESULT_   - declares that a function's result must be used.
+//
+// Synchronization:
+//   Mutex, MutexLock, ThreadLocal, GetThreadCount()
+//                  - synchronization primitives.
+//   GTEST_IS_THREADSAFE - defined to 1 to indicate that the above
+//                         synchronization primitives have real implementations
+//                         and Google Test is thread-safe; or 0 otherwise.
+//
+// Template meta programming:
+//   is_pointer     - as in TR1; needed on Symbian and IBM XL C/C++ only.
+//   IteratorTraits - partial implementation of std::iterator_traits, which
+//                    is not available in libCstd when compiled with Sun C++.
+//
+// Smart pointers:
+//   scoped_ptr     - as in TR2.
+//
+// Regular expressions:
+//   RE             - a simple regular expression class using the POSIX
+//                    Extended Regular Expression syntax on UNIX-like
+//                    platforms, or a reduced regular expression syntax on
+//                    other platforms, including Windows.
+//
+// Logging:
+//   GTEST_LOG_()   - logs messages at the specified severity level.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+//
+// Stdout and stderr capturing:
+//   CaptureStdout()     - starts capturing stdout.
+//   GetCapturedStdout() - stops capturing stdout and returns the captured
+//                         string.
+//   CaptureStderr()     - starts capturing stderr.
+//   GetCapturedStderr() - stops capturing stderr and returns the captured
+//                         string.
+//
+// Integer types:
+//   TypeWithSize   - maps an integer to an int type.
+//   Int32, UInt32, Int64, UInt64, TimeInMillis
+//                  - integers of known sizes.
+//   BiggestInt     - the biggest signed integer type.
+//
+// Command-line utilities:
+//   GTEST_FLAG()       - references a flag.
+//   GTEST_DECLARE_*()  - declares a flag.
+//   GTEST_DEFINE_*()   - defines a flag.
+//   GetInjectableArgvs() - returns the command line as a vector of strings.
+//
+// Environment variable utilities:
+//   GetEnv()             - gets the value of an environment variable.
+//   BoolFromGTestEnv()   - parses a bool environment variable.
+//   Int32FromGTestEnv()  - parses an Int32 environment variable.
+//   StringFromGTestEnv() - parses a string environment variable.
+
+#include <ctype.h>   // for isspace, etc
+#include <stddef.h>  // for ptrdiff_t
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#ifndef _WIN32_WCE
+# include <sys/types.h>
+# include <sys/stat.h>
+#endif  // !_WIN32_WCE
+
+#if defined __APPLE__
+# include <AvailabilityMacros.h>
+# include <TargetConditionals.h>
+#endif
+
+#include <iostream>  // NOLINT
+#include <sstream>  // NOLINT
+#include <string>  // NOLINT
+
+#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
+#define GTEST_FLAG_PREFIX_ "gtest_"
+#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
+#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
+#define GTEST_NAME_ "Google Test"
+#define GTEST_PROJECT_URL_ "http://code.google.com/p/googletest/"
+
+// Determines the version of gcc that is used to compile this.
+#ifdef __GNUC__
+// 40302 means version 4.3.2.
+# define GTEST_GCC_VER_ \
+    (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+#endif  // __GNUC__
+
+// Determines the platform on which Google Test is compiled.
+#ifdef __CYGWIN__
+# define GTEST_OS_CYGWIN 1
+#elif defined __SYMBIAN32__
+# define GTEST_OS_SYMBIAN 1
+#elif defined _WIN32
+# define GTEST_OS_WINDOWS 1
+# ifdef _WIN32_WCE
+#  define GTEST_OS_WINDOWS_MOBILE 1
+# elif defined(__MINGW__) || defined(__MINGW32__)
+#  define GTEST_OS_WINDOWS_MINGW 1
+# else
+#  define GTEST_OS_WINDOWS_DESKTOP 1
+# endif  // _WIN32_WCE
+#elif defined __APPLE__
+# define GTEST_OS_MAC 1
+# if TARGET_OS_IPHONE
+#  define GTEST_OS_IOS 1
+#  if TARGET_IPHONE_SIMULATOR
+#   define GTEST_OS_IOS_SIMULATOR 1
+#  endif
+# endif
+#elif defined __linux__
+# define GTEST_OS_LINUX 1
+# if defined __ANDROID__
+#  define GTEST_OS_LINUX_ANDROID 1
+# endif
+#elif defined __MVS__
+# define GTEST_OS_ZOS 1
+#elif defined(__sun) && defined(__SVR4)
+# define GTEST_OS_SOLARIS 1
+#elif defined(_AIX)
+# define GTEST_OS_AIX 1
+#elif defined(__hpux)
+# define GTEST_OS_HPUX 1
+#elif defined __native_client__
+# define GTEST_OS_NACL 1
+#elif defined __OpenBSD__
+# define GTEST_OS_OPENBSD 1
+#elif defined __QNX__
+# define GTEST_OS_QNX 1
+#endif  // __CYGWIN__
+
+#ifndef GTEST_LANG_CXX11
+// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when
+// -std={c,gnu}++{0x,11} is passed.  The C++11 standard specifies a
+// value for __cplusplus, and recent versions of clang, gcc, and
+// probably other compilers set that too in C++11 mode.
+# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L
+// Compiling in at least C++11 mode.
+#  define GTEST_LANG_CXX11 1
+# else
+#  define GTEST_LANG_CXX11 0
+# endif
+#endif
+
+// Brings in definitions for functions used in the testing::internal::posix
+// namespace (read, write, close, chdir, isatty, stat). We do not currently
+// use them on Windows Mobile.
+#if !GTEST_OS_WINDOWS
+// This assumes that non-Windows OSes provide unistd.h. For OSes where this
+// is not the case, we need to include headers that provide the functions
+// mentioned above.
+# include <unistd.h>
+# include <strings.h>
+#elif !GTEST_OS_WINDOWS_MOBILE
+# include <direct.h>
+# include <io.h>
+#endif
+
+#if GTEST_OS_LINUX_ANDROID
+// Used to define __ANDROID_API__ matching the target NDK API level.
+#  include <android/api-level.h>  // NOLINT
+#endif
+
+// Defines this to true iff Google Test can use POSIX regular expressions.
+#ifndef GTEST_HAS_POSIX_RE
+# if GTEST_OS_LINUX_ANDROID
+// On Android, <regex.h> is only available starting with Gingerbread.
+#  define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
+# else
+#  define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
+# endif
+#endif
+
+#if GTEST_HAS_POSIX_RE
+
+// On some platforms, <regex.h> needs someone to define size_t, and
+// won't compile otherwise.  We can #include it here as we already
+// included <stdlib.h>, which is guaranteed to define size_t through
+// <stddef.h>.
+# include <regex.h>  // NOLINT
+
+# define GTEST_USES_POSIX_RE 1
+
+#elif GTEST_OS_WINDOWS
+
+// <regex.h> is not available on Windows.  Use our own simple regex
+// implementation instead.
+# define GTEST_USES_SIMPLE_RE 1
+
+#else
+
+// <regex.h> may not be available on this platform.  Use our own
+// simple regex implementation instead.
+# define GTEST_USES_SIMPLE_RE 1
+
+#endif  // GTEST_HAS_POSIX_RE
+
+#ifndef GTEST_HAS_EXCEPTIONS
+// The user didn't tell us whether exceptions are enabled, so we need
+// to figure it out.
+# if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS
+// macro to enable exceptions, so we'll do the same.
+// Assumes that exceptions are enabled by default.
+#  ifndef _HAS_EXCEPTIONS
+#   define _HAS_EXCEPTIONS 1
+#  endif  // _HAS_EXCEPTIONS
+#  define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+# elif defined(__GNUC__) && __EXCEPTIONS
+// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__SUNPRO_CC)
+// Sun Pro CC supports exceptions.  However, there is no compile-time way of
+// detecting whether they are enabled or not.  Therefore, we assume that
+// they are enabled unless the user tells us otherwise.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__IBMCPP__) && __EXCEPTIONS
+// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__HP_aCC)
+// Exception handling is in effect by default in HP aCC compiler. It has to
+// be turned off by the +noeh compiler option if desired.
+#  define GTEST_HAS_EXCEPTIONS 1
+# else
+// For other compilers, we assume exceptions are disabled to be
+// conservative.
+#  define GTEST_HAS_EXCEPTIONS 0
+# endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+#endif  // GTEST_HAS_EXCEPTIONS
+
+#if !defined(GTEST_HAS_STD_STRING)
+// Even though we don't use this macro any longer, we keep it in case
+// some clients still depend on it.
+# define GTEST_HAS_STD_STRING 1
+#elif !GTEST_HAS_STD_STRING
+// The user told us that ::std::string isn't available.
+# error "Google Test cannot be used where ::std::string isn't available."
+#endif  // !defined(GTEST_HAS_STD_STRING)
+
+#ifndef GTEST_HAS_GLOBAL_STRING
+// The user didn't tell us whether ::string is available, so we need
+// to figure it out.
+
+# define GTEST_HAS_GLOBAL_STRING 0
+
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+#ifndef GTEST_HAS_STD_WSTRING
+// The user didn't tell us whether ::std::wstring is available, so we need
+// to figure it out.
+// TODO(wan@google.com): use autoconf to detect whether ::std::wstring
+//   is available.
+
+// Cygwin 1.7 and below doesn't support ::std::wstring.
+// Solaris' libc++ doesn't support it either.  Android has
+// no support for it at least as recent as Froyo (2.2).
+# define GTEST_HAS_STD_WSTRING \
+    (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS))
+
+#endif  // GTEST_HAS_STD_WSTRING
+
+#ifndef GTEST_HAS_GLOBAL_WSTRING
+// The user didn't tell us whether ::wstring is available, so we need
+// to figure it out.
+# define GTEST_HAS_GLOBAL_WSTRING \
+    (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING)
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// Determines whether RTTI is available.
+#ifndef GTEST_HAS_RTTI
+// The user didn't tell us whether RTTI is enabled, so we need to
+// figure it out.
+
+# ifdef _MSC_VER
+
+#  ifdef _CPPRTTI  // MSVC defines this macro iff RTTI is enabled.
+#   define GTEST_HAS_RTTI 1
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif
+
+// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled.
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302)
+
+#  ifdef __GXX_RTTI
+// When building against STLport with the Android NDK and with
+// -frtti -fno-exceptions, the build fails at link time with undefined
+// references to __cxa_bad_typeid. Not sure whether it's an STL or toolchain bug,
+// so disable RTTI when detected.
+#   if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
+       !defined(__EXCEPTIONS)
+#    define GTEST_HAS_RTTI 0
+#   else
+#    define GTEST_HAS_RTTI 1
+#   endif  // GTEST_OS_LINUX_ANDROID && _STLPORT_MAJOR && !__EXCEPTIONS
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif  // __GXX_RTTI
+
+// Clang defines __GXX_RTTI starting with version 3.0, but its manual
+// recommends using __has_feature instead.  __has_feature(cxx_rtti) is
+// supported since 2.7, the first version with C++ support.
+# elif defined(__clang__)
+
+#  define GTEST_HAS_RTTI __has_feature(cxx_rtti)
+
+// Starting with version 9.0, IBM Visual Age defines __RTTI_ALL__ to 1 if
+// both the typeid and dynamic_cast features are present.
+# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
+
+#  ifdef __RTTI_ALL__
+#   define GTEST_HAS_RTTI 1
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif
+
+# else
+
+// For all other compilers, we assume RTTI is enabled.
+#  define GTEST_HAS_RTTI 1
+
+# endif  // _MSC_VER
+
+#endif  // GTEST_HAS_RTTI
+
+// It's this header's responsibility to #include <typeinfo> when RTTI
+// is enabled.
+#if GTEST_HAS_RTTI
+# include <typeinfo>
+#endif
+
+// Determines whether Google Test can use the pthreads library.
+#ifndef GTEST_HAS_PTHREAD
+// The user didn't tell us explicitly, so we assume pthreads support is
+// available on Linux and Mac.
+//
+// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0
+// to your compiler flags.
+# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \
+    || GTEST_OS_QNX)
+#endif  // GTEST_HAS_PTHREAD
+
+#if GTEST_HAS_PTHREAD
+// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
+// true.
+# include <pthread.h>  // NOLINT
+
+// For timespec and nanosleep, used below.
+# include <time.h>  // NOLINT
+#endif
+
+// Determines whether Google Test can use tr1/tuple.  You can define
+// this macro to 0 to prevent Google Test from using tuple (any
+// feature depending on tuple will be disabled in this mode).
+#ifndef GTEST_HAS_TR1_TUPLE
+# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR)
+// STLport, provided with the Android NDK, has neither <tr1/tuple> nor <tuple>.
+#  define GTEST_HAS_TR1_TUPLE 0
+# else
+// The user didn't tell us not to do it, so we assume it's OK.
+#  define GTEST_HAS_TR1_TUPLE 1
+# endif
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Determines whether Google Test's own tr1 tuple implementation
+// should be used.
+#ifndef GTEST_USE_OWN_TR1_TUPLE
+// The user didn't tell us, so we need to figure it out.
+
+// We use our own TR1 tuple if we aren't sure the user has an
+// implementation of it already.  At this time, libstdc++ 4.0.0+ and
+// MSVC 2010 are the only mainstream standard libraries that come
+// with a TR1 tuple implementation.  NVIDIA's CUDA NVCC compiler
+// pretends to be GCC by defining __GNUC__ and friends, but cannot
+// compile GCC's tuple implementation.  MSVC 2008 (9.0) provides TR1
+// tuple in a 323 MB Feature Pack download, which we cannot assume the
+// user has.  QNX's QCC compiler is a modified GCC but it doesn't
+// support TR1 tuple.  libc++ only provides std::tuple, in C++11 mode,
+// and it can be used with some compilers that define __GNUC__.
+# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \
+      && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600
+#  define GTEST_ENV_HAS_TR1_TUPLE_ 1
+# endif
+
+// C++11 specifies that <tuple> provides std::tuple. Use that if gtest is used
+// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6
+// can build with clang but need to use gcc4.2's libstdc++).
+# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325)
+#  define GTEST_ENV_HAS_STD_TUPLE_ 1
+# endif
+
+# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_
+#  define GTEST_USE_OWN_TR1_TUPLE 0
+# else
+#  define GTEST_USE_OWN_TR1_TUPLE 1
+# endif
+
+#endif  // GTEST_USE_OWN_TR1_TUPLE
+
+// To avoid conditional compilation everywhere, we make it
+// gtest-port.h's responsibility to #include the header implementing
+// tr1/tuple.
+#if GTEST_HAS_TR1_TUPLE
+
+# if GTEST_USE_OWN_TR1_TUPLE
+// This file was GENERATED by command:
+//     pump.py gtest-tuple.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2009 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Implements a subset of TR1 tuple needed by Google Test and Google Mock.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+
+#include <utility>  // For ::std::pair.
+
+// The compiler used in Symbian has a bug that prevents us from declaring the
+// tuple template as a friend (it complains that tuple is redefined).  This
+// hack bypasses the bug by declaring the members that should otherwise be
+// private as public.
+// Sun Studio versions < 12 also have the above bug.
+#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590)
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public:
+#else
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \
+    template <GTEST_10_TYPENAMES_(U)> friend class tuple; \
+   private:
+#endif
+
+// GTEST_n_TUPLE_(T) is the type of an n-tuple.
+#define GTEST_0_TUPLE_(T) tuple<>
+#define GTEST_1_TUPLE_(T) tuple<T##0, void, void, void, void, void, void, \
+    void, void, void>
+#define GTEST_2_TUPLE_(T) tuple<T##0, T##1, void, void, void, void, void, \
+    void, void, void>
+#define GTEST_3_TUPLE_(T) tuple<T##0, T##1, T##2, void, void, void, void, \
+    void, void, void>
+#define GTEST_4_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, void, void, void, \
+    void, void, void>
+#define GTEST_5_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, void, void, \
+    void, void, void>
+#define GTEST_6_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, void, \
+    void, void, void>
+#define GTEST_7_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    void, void, void>
+#define GTEST_8_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, void, void>
+#define GTEST_9_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, T##8, void>
+#define GTEST_10_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, T##8, T##9>
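+
+// For example, GTEST_2_TUPLE_(T) expands to
+//   tuple<T0, T1, void, void, void, void, void, void, void, void>
+// i.e. unused slots are padded with void up to the maximum of 10 fields.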
+
+// GTEST_n_TYPENAMES_(T) declares a list of n typenames.
+#define GTEST_0_TYPENAMES_(T)
+#define GTEST_1_TYPENAMES_(T) typename T##0
+#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1
+#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2
+#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3
+#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4
+#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5
+#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6
+#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, typename T##7
+#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, \
+    typename T##7, typename T##8
+#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, \
+    typename T##7, typename T##8, typename T##9
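+
+// For example, GTEST_3_TYPENAMES_(U) expands to
+//   typename U0, typename U1, typename U2
+// which is how the converting constructors and CopyFrom() below declare
+// their template parameters.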
+
+// In theory, defining stuff in the ::std namespace is undefined
+// behavior.  We can do this as we are playing the role of a standard
+// library vendor.
+namespace std {
+namespace tr1 {
+
+template <typename T0 = void, typename T1 = void, typename T2 = void,
+    typename T3 = void, typename T4 = void, typename T5 = void,
+    typename T6 = void, typename T7 = void, typename T8 = void,
+    typename T9 = void>
+class tuple;
+
+// Anything in namespace gtest_internal is Google Test's INTERNAL
+// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code.
+namespace gtest_internal {
+
+// ByRef<T>::type is T if T is a reference; otherwise it's const T&.
+template <typename T>
+struct ByRef { typedef const T& type; };  // NOLINT
+template <typename T>
+struct ByRef<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper for ByRef.
+#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef<T>::type
+
+// AddRef<T>::type is T if T is a reference; otherwise it's T&.  This
+// is the same as tr1::add_reference<T>::type.
+template <typename T>
+struct AddRef { typedef T& type; };  // NOLINT
+template <typename T>
+struct AddRef<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper for AddRef.
+#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef<T>::type
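+
+// For illustration: ByRef<int>::type is const int&, while ByRef<int&>::type
+// is int&; AddRef<int>::type and AddRef<int&>::type are both int&.  The
+// tuple constructors below take their parameters as GTEST_BY_REF_(Ti), and
+// get() (defined further down) returns GTEST_ADD_REF_/GTEST_BY_REF_ types.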
+
+// A helper for implementing get<k>().
+template <int k> class Get;
+
+// A helper for implementing tuple_element<k, T>.  kIndexValid is true
+// iff k < the number of fields in tuple type T.
+template <bool kIndexValid, int kIndex, class Tuple>
+struct TupleElement;
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 0, GTEST_10_TUPLE_(T) > {
+  typedef T0 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 1, GTEST_10_TUPLE_(T) > {
+  typedef T1 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 2, GTEST_10_TUPLE_(T) > {
+  typedef T2 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 3, GTEST_10_TUPLE_(T) > {
+  typedef T3 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 4, GTEST_10_TUPLE_(T) > {
+  typedef T4 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 5, GTEST_10_TUPLE_(T) > {
+  typedef T5 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 6, GTEST_10_TUPLE_(T) > {
+  typedef T6 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 7, GTEST_10_TUPLE_(T) > {
+  typedef T7 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 8, GTEST_10_TUPLE_(T) > {
+  typedef T8 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 9, GTEST_10_TUPLE_(T) > {
+  typedef T9 type;
+};
+
+}  // namespace gtest_internal
+
+template <>
+class tuple<> {
+ public:
+  tuple() {}
+  tuple(const tuple& /* t */)  {}
+  tuple& operator=(const tuple& /* t */) { return *this; }
+};
+
+template <GTEST_1_TYPENAMES_(T)>
+class GTEST_1_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {}
+
+  tuple(const tuple& t) : f0_(t.f0_) {}
+
+  template <GTEST_1_TYPENAMES_(U)>
+  tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_1_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_1_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_1_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    return *this;
+  }
+
+  T0 f0_;
+};
+
+template <GTEST_2_TYPENAMES_(T)>
+class GTEST_2_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0),
+      f1_(f1) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {}
+
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {}
+  template <typename U0, typename U1>
+  tuple(const ::std::pair<U0, U1>& p) : f0_(p.first), f1_(p.second) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_2_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+  template <typename U0, typename U1>
+  tuple& operator=(const ::std::pair<U0, U1>& p) {
+    f0_ = p.first;
+    f1_ = p.second;
+    return *this;
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+};
+
+template <GTEST_3_TYPENAMES_(T)>
+class GTEST_3_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
+
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_3_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+};
+
+template <GTEST_4_TYPENAMES_(T)>
+class GTEST_4_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {}
+
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_4_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+};
+
+template <GTEST_5_TYPENAMES_(T)>
+class GTEST_5_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3,
+      GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_) {}
+
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_5_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+};
+
+template <GTEST_6_TYPENAMES_(T)>
+class GTEST_6_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_) {}
+
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_6_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+};
+
+template <GTEST_7_TYPENAMES_(T)>
+class GTEST_7_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3), f4_(f4), f5_(f5), f6_(f6) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
+
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_7_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+};
+
+template <GTEST_8_TYPENAMES_(T)>
+class GTEST_8_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6,
+      GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5), f6_(f6), f7_(f7) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
+
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_8_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+};
+
+template <GTEST_9_TYPENAMES_(T)>
+class GTEST_9_TUPLE_(T) {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
+      GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5), f6_(f6), f7_(f7), f8_(f8) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
+
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_9_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    f8_ = t.f8_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+  T8 f8_;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+class tuple {
+ public:
+  template <int k> friend class gtest_internal::Get;
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(),
+      f9_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
+      GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {}
+
+  template <GTEST_10_TYPENAMES_(U)>
+  tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_),
+      f9_(t.f9_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_10_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_10_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+  template <GTEST_10_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) {
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    f8_ = t.f8_;
+    f9_ = t.f9_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+  T8 f8_;
+  T9 f9_;
+};
+
+// 6.1.3.2 Tuple creation functions.
+
+// Known limitations: we don't support passing an
+// std::tr1::reference_wrapper<T> to make_tuple().  And we don't
+// implement tie().
+
+inline tuple<> make_tuple() { return tuple<>(); }
+
+template <GTEST_1_TYPENAMES_(T)>
+inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) {
+  return GTEST_1_TUPLE_(T)(f0);
+}
+
+template <GTEST_2_TYPENAMES_(T)>
+inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) {
+  return GTEST_2_TUPLE_(T)(f0, f1);
+}
+
+template <GTEST_3_TYPENAMES_(T)>
+inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) {
+  return GTEST_3_TUPLE_(T)(f0, f1, f2);
+}
+
+template <GTEST_4_TYPENAMES_(T)>
+inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3) {
+  return GTEST_4_TUPLE_(T)(f0, f1, f2, f3);
+}
+
+template <GTEST_5_TYPENAMES_(T)>
+inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4) {
+  return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4);
+}
+
+template <GTEST_6_TYPENAMES_(T)>
+inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5) {
+  return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5);
+}
+
+template <GTEST_7_TYPENAMES_(T)>
+inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6) {
+  return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6);
+}
+
+template <GTEST_8_TYPENAMES_(T)>
+inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) {
+  return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7);
+}
+
+template <GTEST_9_TYPENAMES_(T)>
+inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
+    const T8& f8) {
+  return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8);
+}
+
+template <GTEST_10_TYPENAMES_(T)>
+inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
+    const T8& f8, const T9& f9) {
+  return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9);
+}
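+
+// Illustrative usage of the creation functions above (the element values are
+// arbitrary):
+//
+//   tuple<int, char> t = make_tuple(1, 'a');  // remaining slots default to void
+//   tuple<> empty = make_tuple();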
+
+// 6.1.3.3 Tuple helper classes.
+
+template <typename Tuple> struct tuple_size;
+
+template <GTEST_0_TYPENAMES_(T)>
+struct tuple_size<GTEST_0_TUPLE_(T) > {
+  static const int value = 0;
+};
+
+template <GTEST_1_TYPENAMES_(T)>
+struct tuple_size<GTEST_1_TUPLE_(T) > {
+  static const int value = 1;
+};
+
+template <GTEST_2_TYPENAMES_(T)>
+struct tuple_size<GTEST_2_TUPLE_(T) > {
+  static const int value = 2;
+};
+
+template <GTEST_3_TYPENAMES_(T)>
+struct tuple_size<GTEST_3_TUPLE_(T) > {
+  static const int value = 3;
+};
+
+template <GTEST_4_TYPENAMES_(T)>
+struct tuple_size<GTEST_4_TUPLE_(T) > {
+  static const int value = 4;
+};
+
+template <GTEST_5_TYPENAMES_(T)>
+struct tuple_size<GTEST_5_TUPLE_(T) > {
+  static const int value = 5;
+};
+
+template <GTEST_6_TYPENAMES_(T)>
+struct tuple_size<GTEST_6_TUPLE_(T) > {
+  static const int value = 6;
+};
+
+template <GTEST_7_TYPENAMES_(T)>
+struct tuple_size<GTEST_7_TUPLE_(T) > {
+  static const int value = 7;
+};
+
+template <GTEST_8_TYPENAMES_(T)>
+struct tuple_size<GTEST_8_TUPLE_(T) > {
+  static const int value = 8;
+};
+
+template <GTEST_9_TYPENAMES_(T)>
+struct tuple_size<GTEST_9_TUPLE_(T) > {
+  static const int value = 9;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct tuple_size<GTEST_10_TUPLE_(T) > {
+  static const int value = 10;
+};
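+
+// For example, tuple_size<tuple<int, char> >::value is 2: the trailing
+// defaulted void parameters make tuple<int, char> match the
+// GTEST_2_TUPLE_(T) specialization above.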
+
+template <int k, class Tuple>
+struct tuple_element {
+  typedef typename gtest_internal::TupleElement<
+      k < (tuple_size<Tuple>::value), k, Tuple>::type type;
+};
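+
+// For example, tuple_element<1, tuple<int, char> >::type is char.  An
+// out-of-range index makes kIndexValid false, and the lookup fails to
+// compile because TupleElement is only specialized for valid indices.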
+
+#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element<k, Tuple >::type
+
+// 6.1.3.4 Element access.
+
+namespace gtest_internal {
+
+template <>
+class Get<0> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
+  Field(Tuple& t) { return t.f0_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
+  ConstField(const Tuple& t) { return t.f0_; }
+};
+
+template <>
+class Get<1> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
+  Field(Tuple& t) { return t.f1_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
+  ConstField(const Tuple& t) { return t.f1_; }
+};
+
+template <>
+class Get<2> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
+  Field(Tuple& t) { return t.f2_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
+  ConstField(const Tuple& t) { return t.f2_; }
+};
+
+template <>
+class Get<3> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
+  Field(Tuple& t) { return t.f3_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
+  ConstField(const Tuple& t) { return t.f3_; }
+};
+
+template <>
+class Get<4> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
+  Field(Tuple& t) { return t.f4_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
+  ConstField(const Tuple& t) { return t.f4_; }
+};
+
+template <>
+class Get<5> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
+  Field(Tuple& t) { return t.f5_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
+  ConstField(const Tuple& t) { return t.f5_; }
+};
+
+template <>
+class Get<6> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
+  Field(Tuple& t) { return t.f6_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
+  ConstField(const Tuple& t) { return t.f6_; }
+};
+
+template <>
+class Get<7> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
+  Field(Tuple& t) { return t.f7_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
+  ConstField(const Tuple& t) { return t.f7_; }
+};
+
+template <>
+class Get<8> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
+  Field(Tuple& t) { return t.f8_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
+  ConstField(const Tuple& t) { return t.f8_; }
+};
+
+template <>
+class Get<9> {
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
+  Field(Tuple& t) { return t.f9_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
+  ConstField(const Tuple& t) { return t.f9_; }
+};
+
+}  // namespace gtest_internal
+
+template <int k, GTEST_10_TYPENAMES_(T)>
+GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T)))
+get(GTEST_10_TUPLE_(T)& t) {
+  return gtest_internal::Get<k>::Field(t);
+}
+
+template <int k, GTEST_10_TYPENAMES_(T)>
+GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k,  GTEST_10_TUPLE_(T)))
+get(const GTEST_10_TUPLE_(T)& t) {
+  return gtest_internal::Get<k>::ConstField(t);
+}
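+
+// Illustrative usage (the values are arbitrary):
+//
+//   tuple<int, char> t = make_tuple(1, 'a');
+//   int i = get<0>(t);   // i == 1
+//   get<1>(t) = 'b';     // the non-const overload returns a reference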
+
+// 6.1.3.5 Relational operators
+
+// We only implement == and !=, as we don't have a need for the rest yet.
+
+namespace gtest_internal {
+
+// SameSizeTuplePrefixComparator<k, k>::Eq(t1, t2) returns true if the
+// first k fields of t1 equal the first k fields of t2.
+// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if
+// k1 != k2.
+template <int kSize1, int kSize2>
+struct SameSizeTuplePrefixComparator;
+
+template <>
+struct SameSizeTuplePrefixComparator<0, 0> {
+  template <class Tuple1, class Tuple2>
+  static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) {
+    return true;
+  }
+};
+
+template <int k>
+struct SameSizeTuplePrefixComparator<k, k> {
+  template <class Tuple1, class Tuple2>
+  static bool Eq(const Tuple1& t1, const Tuple2& t2) {
+    return SameSizeTuplePrefixComparator<k - 1, k - 1>::Eq(t1, t2) &&
+        ::std::tr1::get<k - 1>(t1) == ::std::tr1::get<k - 1>(t2);
+  }
+};
+
+}  // namespace gtest_internal
+
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
+inline bool operator==(const GTEST_10_TUPLE_(T)& t,
+                       const GTEST_10_TUPLE_(U)& u) {
+  return gtest_internal::SameSizeTuplePrefixComparator<
+      tuple_size<GTEST_10_TUPLE_(T) >::value,
+      tuple_size<GTEST_10_TUPLE_(U) >::value>::Eq(t, u);
+}
+
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
+inline bool operator!=(const GTEST_10_TUPLE_(T)& t,
+                       const GTEST_10_TUPLE_(U)& u) { return !(t == u); }
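+
+// For example, make_tuple(1, 'a') == make_tuple(1, 'a') is true, and
+// make_tuple(1, 'a') != make_tuple(2, 'a') is true.  Comparing tuples with
+// different numbers of fields does not compile, because only the <k, k>
+// form of SameSizeTuplePrefixComparator is defined.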
+
+// 6.1.4 Pairs.
+// Unimplemented.
+
+}  // namespace tr1
+}  // namespace std
+
+#undef GTEST_0_TUPLE_
+#undef GTEST_1_TUPLE_
+#undef GTEST_2_TUPLE_
+#undef GTEST_3_TUPLE_
+#undef GTEST_4_TUPLE_
+#undef GTEST_5_TUPLE_
+#undef GTEST_6_TUPLE_
+#undef GTEST_7_TUPLE_
+#undef GTEST_8_TUPLE_
+#undef GTEST_9_TUPLE_
+#undef GTEST_10_TUPLE_
+
+#undef GTEST_0_TYPENAMES_
+#undef GTEST_1_TYPENAMES_
+#undef GTEST_2_TYPENAMES_
+#undef GTEST_3_TYPENAMES_
+#undef GTEST_4_TYPENAMES_
+#undef GTEST_5_TYPENAMES_
+#undef GTEST_6_TYPENAMES_
+#undef GTEST_7_TYPENAMES_
+#undef GTEST_8_TYPENAMES_
+#undef GTEST_9_TYPENAMES_
+#undef GTEST_10_TYPENAMES_
+
+#undef GTEST_DECLARE_TUPLE_AS_FRIEND_
+#undef GTEST_BY_REF_
+#undef GTEST_ADD_REF_
+#undef GTEST_TUPLE_ELEMENT_
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+# elif GTEST_ENV_HAS_STD_TUPLE_
+#  include <tuple>
+// C++11 puts its tuple into the ::std namespace rather than
+// ::std::tr1.  gtest expects tuple to live in ::std::tr1, so put it there.
+// This causes undefined behavior, but supported compilers react in
+// the way we intend.
+namespace std {
+namespace tr1 {
+using ::std::get;
+using ::std::make_tuple;
+using ::std::tuple;
+using ::std::tuple_element;
+using ::std::tuple_size;
+}
+}
+
+# elif GTEST_OS_SYMBIAN
+
+// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to
+// use STLport's tuple implementation, which unfortunately doesn't
+// work as the copy of STLport distributed with Symbian is incomplete.
+// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to
+// use its own tuple implementation.
+#  ifdef BOOST_HAS_TR1_TUPLE
+#   undef BOOST_HAS_TR1_TUPLE
+#  endif  // BOOST_HAS_TR1_TUPLE
+
+// This prevents <boost/tr1/detail/config.hpp>, which defines
+// BOOST_HAS_TR1_TUPLE, from being #included by Boost's <tuple>.
+#  define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED
+#  include <tuple>
+
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000)
+// GCC 4.0+ implements tr1/tuple in the <tr1/tuple> header.  This does
+// not conform to the TR1 spec, which requires the header to be <tuple>.
+
+#  if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+// Until version 4.3.2, gcc has a bug that causes <tr1/functional>,
+// which is #included by <tr1/tuple>, to not compile when RTTI is
+// disabled.  _TR1_FUNCTIONAL is the header guard for
+// <tr1/functional>.  Hence the following #define is a hack to prevent
+// <tr1/functional> from being included.
+#   define _TR1_FUNCTIONAL 1
+#   include <tr1/tuple>
+#   undef _TR1_FUNCTIONAL  // Allows the user to #include
+                           // <tr1/functional> if they choose to.
+#  else
+#   include <tr1/tuple>  // NOLINT
+#  endif  // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+
+# else
+// If the compiler is not GCC 4.0+, we assume the user is using a
+// spec-conforming TR1 implementation.
+#  include <tuple>  // NOLINT
+# endif  // GTEST_USE_OWN_TR1_TUPLE
+
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Determines whether clone(2) is supported.
+// Usually it will only be available on Linux, excluding
+// Linux on the Itanium architecture.
+// Also see http://linux.die.net/man/2/clone.
+#ifndef GTEST_HAS_CLONE
+// The user didn't tell us, so we need to figure it out.
+
+# if GTEST_OS_LINUX && !defined(__ia64__)
+#  if GTEST_OS_LINUX_ANDROID
+// On Android, clone() is only available on ARM starting with Gingerbread.
+#    if defined(__arm__) && __ANDROID_API__ >= 9
+#     define GTEST_HAS_CLONE 1
+#    else
+#     define GTEST_HAS_CLONE 0
+#    endif
+#  else
+#   define GTEST_HAS_CLONE 1
+#  endif
+# else
+#  define GTEST_HAS_CLONE 0
+# endif  // GTEST_OS_LINUX && !defined(__ia64__)
+
+#endif  // GTEST_HAS_CLONE
+
+// Determines whether to support stream redirection. This is used to test
+// output correctness and to implement death tests.
+#ifndef GTEST_HAS_STREAM_REDIRECTION
+// By default, we assume that stream redirection is supported on all
+// platforms except known mobile ones.
+# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN
+#  define GTEST_HAS_STREAM_REDIRECTION 0
+# else
+#  define GTEST_HAS_STREAM_REDIRECTION 1
+# endif  // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+// Determines whether to support death tests.
+// Google Test does not support death tests for VC 7.1 and earlier as
+// abort() in a VC 7.1 application compiled as GUI in debug config
+// pops up a dialog window that cannot be suppressed programmatically.
+#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
+     (GTEST_OS_MAC && !GTEST_OS_IOS) || GTEST_OS_IOS_SIMULATOR || \
+     (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \
+     GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \
+     GTEST_OS_OPENBSD || GTEST_OS_QNX)
+# define GTEST_HAS_DEATH_TEST 1
+# include <vector>  // NOLINT
+#endif
+
+// We don't support MSVC 7.1 with exceptions disabled now.  Therefore
+// all the compilers we care about are adequate for supporting
+// value-parameterized tests.
+#define GTEST_HAS_PARAM_TEST 1
+
+// Determines whether to support type-driven tests.
+
+// Typed tests need <typeinfo> and variadic macros, which GCC, VC++ 8.0,
+// Sun Pro CC, IBM Visual Age, and HP aCC support.
+#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \
+    defined(__IBMCPP__) || defined(__HP_aCC)
+# define GTEST_HAS_TYPED_TEST 1
+# define GTEST_HAS_TYPED_TEST_P 1
+#endif
+
+// Determines whether to support Combine(). This only makes sense when
+// value-parameterized tests are enabled.  The implementation doesn't
+// work on Sun Studio since it doesn't understand templated conversion
+// operators.
+#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC)
+# define GTEST_HAS_COMBINE 1
+#endif
+
+// Determines whether the system compiler uses UTF-16 for encoding wide strings.
+#define GTEST_WIDE_STRING_USES_UTF16_ \
+    (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX)
+
+// Determines whether test results can be streamed to a socket.
+#if GTEST_OS_LINUX
+# define GTEST_CAN_STREAM_RESULTS_ 1
+#endif
+
+// Defines some utility macros.
+
+// The GNU compiler emits a warning if nested "if" statements are followed by
+// an "else" statement and braces are not used to explicitly disambiguate the
+// "else" binding.  This leads to problems with code like:
+//
+//   if (gate)
+//     ASSERT_*(condition) << "Some message";
+//
+// The "switch (0) case 0:" idiom is used to suppress this.
+#ifdef __INTEL_COMPILER
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+#else
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default:  // NOLINT
+#endif
+
+// Use this annotation at the end of a struct/class definition to
+// prevent the compiler from optimizing away instances that are never
+// used.  This is useful when all interesting logic happens inside the
+// c'tor and / or d'tor.  Example:
+//
+//   struct Foo {
+//     Foo() { ... }
+//   } GTEST_ATTRIBUTE_UNUSED_;
+//
+// Also use it after a variable or parameter declaration to tell the
+// compiler the variable/parameter does not have to be used.
+#if defined(__GNUC__) && !defined(COMPILER_ICC)
+# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+#else
+# define GTEST_ATTRIBUTE_UNUSED_
+#endif
+
+// A macro to disallow operator=
+// This should be used in the private: declarations for a class.
+#define GTEST_DISALLOW_ASSIGN_(type)\
+  void operator=(type const &)
+
+// A macro to disallow copy constructor and operator=
+// This should be used in the private: declarations for a class.
+#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\
+  type(type const &);\
+  GTEST_DISALLOW_ASSIGN_(type)
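+
+// For example (MyClass is a placeholder name):
+//
+//   class MyClass {
+//    public:
+//     ...
+//    private:
+//     GTEST_DISALLOW_COPY_AND_ASSIGN_(MyClass);
+//   };
+//
+// scoped_ptr and GTestLog below use this pattern.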
+
+// Tell the compiler to warn about unused return values for functions declared
+// with this macro.  The macro should be used on function declarations
+// following the argument list:
+//
+//   Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
+#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC)
+# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result))
+#else
+# define GTEST_MUST_USE_RESULT_
+#endif  // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC
+
+// Determine whether the compiler supports Microsoft's Structured Exception
+// Handling.  This is supported by several Windows compilers but generally
+// does not exist on any other system.
+#ifndef GTEST_HAS_SEH
+// The user didn't tell us, so we need to figure it out.
+
+# if defined(_MSC_VER) || defined(__BORLANDC__)
+// These two compilers are known to support SEH.
+#  define GTEST_HAS_SEH 1
+# else
+// Assume no SEH.
+#  define GTEST_HAS_SEH 0
+# endif
+
+#endif  // GTEST_HAS_SEH
+
+#ifdef _MSC_VER
+
+# if GTEST_LINKED_AS_SHARED_LIBRARY
+#  define GTEST_API_ __declspec(dllimport)
+# elif GTEST_CREATE_SHARED_LIBRARY
+#  define GTEST_API_ __declspec(dllexport)
+# endif
+
+#endif  // _MSC_VER
+
+#ifndef GTEST_API_
+# define GTEST_API_
+#endif
+
+#ifdef __GNUC__
+// Ask the compiler to never inline a given function.
+# define GTEST_NO_INLINE_ __attribute__((noinline))
+#else
+# define GTEST_NO_INLINE_
+#endif
+
+// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
+#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION)
+# define GTEST_HAS_CXXABI_H_ 1
+#else
+# define GTEST_HAS_CXXABI_H_ 0
+#endif
+
+namespace testing {
+
+class Message;
+
+namespace internal {
+
+// A secret type that Google Test users don't know about.  It has no
+// definition on purpose.  Therefore it's impossible to create a
+// Secret object, which is what we want.
+class Secret;
+
+// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile-time
+// expression is true. For example, you could use it to verify the
+// size of a static array:
+//
+//   GTEST_COMPILE_ASSERT_(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES,
+//                         content_type_names_incorrect_size);
+//
+// or to make sure a struct is smaller than a certain size:
+//
+//   GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large);
+//
+// The second argument to the macro is the name of the variable. If
+// the expression is false, most compilers will issue a warning/error
+// containing the name of the variable.
+
+template <bool>
+struct CompileAssert {
+};
+
+#define GTEST_COMPILE_ASSERT_(expr, msg) \
+  typedef ::testing::internal::CompileAssert<(static_cast<bool>(expr))> \
+      msg[static_cast<bool>(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_
+
+// Implementation details of GTEST_COMPILE_ASSERT_:
+//
+// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1
+//   elements (and thus is invalid) when the expression is false.
+//
+// - The simpler definition
+//
+//    #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1]
+//
+//   does not work, as gcc supports variable-length arrays whose sizes
+//   are determined at run-time (this is a gcc extension and not part
+//   of the C++ standard).  As a result, gcc fails to reject the
+//   following code with the simple definition:
+//
+//     int foo;
+//     GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is
+//                                      // not a compile-time constant.
+//
+// - By using the type CompileAssert<(bool(expr))>, we ensure that
+//   expr is a compile-time constant.  (Template arguments must be
+//   determined at compile-time.)
+//
+// - The outer parentheses in CompileAssert<(bool(expr))> are necessary
+//   to work around a bug in gcc 3.4.4 and 4.0.1.  If we had written
+//
+//     CompileAssert<bool(expr)>
+//
+//   instead, these compilers will refuse to compile
+//
+//     GTEST_COMPILE_ASSERT_(5 > 0, some_message);
+//
+//   (They seem to think the ">" in "5 > 0" marks the end of the
+//   template argument list.)
+//
+// - The array size is (bool(expr) ? 1 : -1), instead of simply
+//
+//     ((expr) ? 1 : -1).
+//
+//   This is to avoid running into a bug in MS VC 7.1, which
+//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
+
+// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h.
+//
+// This template is declared, but intentionally undefined.
+template <typename T1, typename T2>
+struct StaticAssertTypeEqHelper;
+
+template <typename T>
+struct StaticAssertTypeEqHelper<T, T> {};
+
+#if GTEST_HAS_GLOBAL_STRING
+typedef ::string string;
+#else
+typedef ::std::string string;
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+typedef ::wstring wstring;
+#elif GTEST_HAS_STD_WSTRING
+typedef ::std::wstring wstring;
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// A helper for suppressing warnings on constant condition.  It just
+// returns 'condition'.
+GTEST_API_ bool IsTrue(bool condition);
+
+// Defines scoped_ptr.
+
+// This implementation of scoped_ptr is PARTIAL - it only contains
+// enough stuff to satisfy Google Test's need.
+template <typename T>
+class scoped_ptr {
+ public:
+  typedef T element_type;
+
+  explicit scoped_ptr(T* p = NULL) : ptr_(p) {}
+  ~scoped_ptr() { reset(); }
+
+  T& operator*() const { return *ptr_; }
+  T* operator->() const { return ptr_; }
+  T* get() const { return ptr_; }
+
+  T* release() {
+    T* const ptr = ptr_;
+    ptr_ = NULL;
+    return ptr;
+  }
+
+  void reset(T* p = NULL) {
+    if (p != ptr_) {
+      if (IsTrue(sizeof(T) > 0)) {  // Makes sure T is a complete type.
+        delete ptr_;
+      }
+      ptr_ = p;
+    }
+  }
+
+ private:
+  T* ptr_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr);
+};
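+
+// Illustrative usage of scoped_ptr (MyType and DoSomething are placeholder
+// names):
+//
+//   {
+//     scoped_ptr<MyType> p(new MyType);
+//     p->DoSomething();
+//   }  // The MyType object is deleted when p goes out of scope.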
+
+// Defines RE.
+
+// A simple C++ wrapper for <regex.h>.  It uses the POSIX Extended
+// Regular Expression syntax.
+class GTEST_API_ RE {
+ public:
+  // A copy constructor is required by the Standard to initialize object
+  // references from r-values.
+  RE(const RE& other) { Init(other.pattern()); }
+
+  // Constructs an RE from a string.
+  RE(const ::std::string& regex) { Init(regex.c_str()); }  // NOLINT
+
+#if GTEST_HAS_GLOBAL_STRING
+
+  RE(const ::string& regex) { Init(regex.c_str()); }  // NOLINT
+
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+  RE(const char* regex) { Init(regex); }  // NOLINT
+  ~RE();
+
+  // Returns the string representation of the regex.
+  const char* pattern() const { return pattern_; }
+
+  // FullMatch(str, re) returns true iff regular expression re matches
+  // the entire str.
+  // PartialMatch(str, re) returns true iff regular expression re
+  // matches a substring of str (including str itself).
+  //
+  // TODO(wan@google.com): make FullMatch() and PartialMatch() work
+  // when str contains NUL characters.
+  static bool FullMatch(const ::std::string& str, const RE& re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::std::string& str, const RE& re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+#if GTEST_HAS_GLOBAL_STRING
+
+  static bool FullMatch(const ::string& str, const RE& re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::string& str, const RE& re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+  static bool FullMatch(const char* str, const RE& re);
+  static bool PartialMatch(const char* str, const RE& re);
+
+ private:
+  void Init(const char* regex);
+
+  // We use a const char* instead of an std::string, as Google Test used to
+  // be used in environments where std::string is not available.
+  // TODO(wan@google.com): change to std::string.
+  const char* pattern_;
+  bool is_valid_;
+
+#if GTEST_USES_POSIX_RE
+
+  regex_t full_regex_;     // For FullMatch().
+  regex_t partial_regex_;  // For PartialMatch().
+
+#else  // GTEST_USES_SIMPLE_RE
+
+  const char* full_pattern_;  // For FullMatch();
+
+#endif
+
+  GTEST_DISALLOW_ASSIGN_(RE);
+};
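+
+// Illustrative usage of RE (the pattern and strings are arbitrary):
+//
+//   const RE re("a.*z");
+//   RE::FullMatch("abcz", re);         // true - the entire string matches
+//   RE::PartialMatch("xxabczxx", re);  // true - a substring matches
+//   RE::FullMatch("xxabczxx", re);     // false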
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line);
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+                                                               int line);
+
+// Defines logging utilities:
+//   GTEST_LOG_(severity) - logs messages at the specified severity level. The
+//                          message itself is streamed into the macro.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+
+enum GTestLogSeverity {
+  GTEST_INFO,
+  GTEST_WARNING,
+  GTEST_ERROR,
+  GTEST_FATAL
+};
+
+// Formats log entry severity, provides a stream object for streaming the
+// log message, and terminates the message with a newline when going out of
+// scope.
+class GTEST_API_ GTestLog {
+ public:
+  GTestLog(GTestLogSeverity severity, const char* file, int line);
+
+  // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+  ~GTestLog();
+
+  ::std::ostream& GetStream() { return ::std::cerr; }
+
+ private:
+  const GTestLogSeverity severity_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
+};
+
+#define GTEST_LOG_(severity) \
+    ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+                                  __FILE__, __LINE__).GetStream()
+
+inline void LogToStderr() {}
+inline void FlushInfoLog() { fflush(NULL); }
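+
+// Example (the streamed message is arbitrary):
+//
+//   GTEST_LOG_(WARNING) << "unexpected value: " << some_value;
+//
+// A GTEST_LOG_(FATAL) message additionally aborts the program when the
+// temporary GTestLog object goes out of scope.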
+
+// INTERNAL IMPLEMENTATION - DO NOT USE.
+//
+// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
+// is not satisfied.
+//  Synopsis:
+//    GTEST_CHECK_(boolean_condition);
+//     or
+//    GTEST_CHECK_(boolean_condition) << "Additional message";
+//
+//    This checks the condition and, if the condition is not satisfied,
+//    prints a message about the condition violation, including the
+//    condition itself plus any additional message streamed into it,
+//    and then aborts the program.  It aborts the program irrespective of
+//    whether it is built in debug mode or not.
+#define GTEST_CHECK_(condition) \
+    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+    if (::testing::internal::IsTrue(condition)) \
+      ; \
+    else \
+      GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+
+// An all-mode assert to verify that the given POSIX-style function
+// call returns 0 (indicating success).  Known limitation: this
+// doesn't expand to a balanced 'if' statement, so enclose the macro
+// in {} if you need to use it as the only statement in an 'if'
+// branch.
+#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
+  if (const int gtest_error = (posix_call)) \
+    GTEST_LOG_(FATAL) << #posix_call << " failed with error " \
+                      << gtest_error
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Use ImplicitCast_ as a safe version of static_cast for upcasting in
+// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a
+// const Foo*).  When you use ImplicitCast_, the compiler checks that
+// the cast is safe.  Such explicit ImplicitCast_s are necessary in
+// surprisingly many situations where C++ demands an exact type match
+// instead of an argument type convertible to a target type.
+//
+// The syntax for using ImplicitCast_ is the same as for static_cast:
+//
+//   ImplicitCast_<ToType>(expr)
+//
+// ImplicitCast_ would have been part of the C++ standard library,
+// but the proposal was submitted too late.  It will probably make
+// its way into the language in the future.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., implicit_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template<typename To>
+inline To ImplicitCast_(To x) { return x; }
+
+// When you upcast (that is, cast a pointer from type Foo to type
+// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
+// always succeed.  When you downcast (that is, cast a pointer from
+// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
+// how do you know the pointer is really of type SubclassOfFoo?  It
+// could be a bare Foo, or of type DifferentSubclassOfFoo.  Thus,
+// when you downcast, you should use this macro.  In debug mode, we
+// use dynamic_cast<> to double-check the downcast is legal (we die
+// if it's not).  In normal mode, we do the efficient static_cast<>
+// instead.  Thus, it's important to test in debug mode to make sure
+// the cast is legal!
+//    This is the only place in the code we should use dynamic_cast<>.
+// In particular, you SHOULDN'T be using dynamic_cast<> in order to
+// do RTTI, e.g. code like this:
+//    if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo);
+//    if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo);
+// You should design the code some other way so that you don't need this.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., down_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template<typename To, typename From>  // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From* f) {  // so we only accept pointers
+  // Ensures that To is a sub-type of From *.  This test is here only
+  // for compile-time type checking, and has no overhead in an
+  // optimized build at run-time, as it will be optimized away
+  // completely.
+  if (false) {
+    const To to = NULL;
+    ::testing::internal::ImplicitCast_<From*>(to);
+  }
+
+#if GTEST_HAS_RTTI
+  // RTTI: debug mode only!
+  GTEST_CHECK_(f == NULL || dynamic_cast<To>(f) != NULL);
+#endif
+  return static_cast<To>(f);
+}
+
+// Downcasts the pointer of type Base to Derived.
+// Derived must be a subclass of Base. The parameter MUST
+// point to a class of type Derived, not any subclass of it.
+// When RTTI is available, the function performs a runtime
+// check to enforce this.
+template <class Derived, class Base>
+Derived* CheckedDowncastToActualType(Base* base) {
+#if GTEST_HAS_RTTI
+  GTEST_CHECK_(typeid(*base) == typeid(Derived));
+  return dynamic_cast<Derived*>(base);  // NOLINT
+#else
+  return static_cast<Derived*>(base);  // Poor man's downcast.
+#endif
+}
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Defines the stdout/stderr capturers:
+//   CaptureStdout     - starts capturing stdout.
+//   GetCapturedStdout - stops capturing stdout and returns the captured string.
+//   CaptureStderr     - starts capturing stderr.
+//   GetCapturedStderr - stops capturing stderr and returns the captured string.
+//
+GTEST_API_ void CaptureStdout();
+GTEST_API_ std::string GetCapturedStdout();
+GTEST_API_ void CaptureStderr();
+GTEST_API_ std::string GetCapturedStderr();
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+
+#if GTEST_HAS_DEATH_TEST
+
+const ::std::vector<testing::internal::string>& GetInjectableArgvs();
+void SetInjectableArgvs(const ::std::vector<testing::internal::string>*
+                             new_argvs);
+
+// A copy of all command line arguments.  Set by InitGoogleTest().
+extern ::std::vector<testing::internal::string> g_argvs;
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Defines synchronization primitives.
+
+#if GTEST_HAS_PTHREAD
+
+// Sleeps for (roughly) n milliseconds.  This function is only for
+// testing Google Test's own constructs.  Don't use it in user tests,
+// either directly or indirectly.
+inline void SleepMilliseconds(int n) {
+  const timespec time = {
+    0,                  // 0 seconds.
+    n * 1000L * 1000L,  // And n ms.
+  };
+  nanosleep(&time, NULL);
+}
+
+// Allows a controller thread to pause execution of newly created
+// threads until notified.  Instances of this class must be created
+// and destroyed in the controller thread.
+//
+// This class is only for testing Google Test's own constructs. Do not
+// use it in user tests, either directly or indirectly.
+class Notification {
+ public:
+  Notification() : notified_(false) {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+  }
+  ~Notification() {
+    pthread_mutex_destroy(&mutex_);
+  }
+
+  // Notifies all threads created with this notification to start. Must
+  // be called from the controller thread.
+  void Notify() {
+    pthread_mutex_lock(&mutex_);
+    notified_ = true;
+    pthread_mutex_unlock(&mutex_);
+  }
+
+  // Blocks until the controller thread notifies. Must be called from a test
+  // thread.
+  void WaitForNotification() {
+    for (;;) {
+      pthread_mutex_lock(&mutex_);
+      const bool notified = notified_;
+      pthread_mutex_unlock(&mutex_);
+      if (notified)
+        break;
+      SleepMilliseconds(10);
+    }
+  }
+
+ private:
+  pthread_mutex_t mutex_;
+  bool notified_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+};
+
+// As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
+// Consequently, it cannot select a correct instantiation of ThreadWithParam
+// in order to call its Run(). Introducing ThreadWithParamBase as a
+// non-templated base class for ThreadWithParam allows us to bypass this
+// problem.
+class ThreadWithParamBase {
+ public:
+  virtual ~ThreadWithParamBase() {}
+  virtual void Run() = 0;
+};
+
+// pthread_create() accepts a pointer to a function type with the C linkage.
+// According to the Standard (7.5/1), function types with different linkages
+// are different even if they are otherwise identical.  Some compilers (for
+// example, SunStudio) treat them as different types.  Since class methods
+// cannot be defined with C-linkage we need to define a free C-function to
+// pass into pthread_create().
+extern "C" inline void* ThreadFuncWithCLinkage(void* thread) {
+  static_cast<ThreadWithParamBase*>(thread)->Run();
+  return NULL;
+}
+
+// Helper class for testing Google Test's multi-threading constructs.
+// To use it, write:
+//
+//   void ThreadFunc(int param) { /* Do things with param */ }
+//   Notification thread_can_start;
+//   ...
+//   // The thread_can_start parameter is optional; you can supply NULL.
+//   ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start);
+//   thread_can_start.Notify();
+//
+// These classes are only for testing Google Test's own constructs. Do
+// not use them in user tests, either directly or indirectly.
+template <typename T>
+class ThreadWithParam : public ThreadWithParamBase {
+ public:
+  typedef void (*UserThreadFunc)(T);
+
+  ThreadWithParam(
+      UserThreadFunc func, T param, Notification* thread_can_start)
+      : func_(func),
+        param_(param),
+        thread_can_start_(thread_can_start),
+        finished_(false) {
+    ThreadWithParamBase* const base = this;
+    // The thread can be created only after all fields except thread_
+    // have been initialized.
+    GTEST_CHECK_POSIX_SUCCESS_(
+        pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base));
+  }
+  ~ThreadWithParam() { Join(); }
+
+  void Join() {
+    if (!finished_) {
+      GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0));
+      finished_ = true;
+    }
+  }
+
+  virtual void Run() {
+    if (thread_can_start_ != NULL)
+      thread_can_start_->WaitForNotification();
+    func_(param_);
+  }
+
+ private:
+  const UserThreadFunc func_;  // User-supplied thread function.
+  const T param_;  // User-supplied parameter to the thread function.
+  // When non-NULL, used to block execution until the controller thread
+  // notifies.
+  Notification* const thread_can_start_;
+  bool finished_;  // true iff we know that the thread function has finished.
+  pthread_t thread_;  // The native thread object.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+};
+
+// MutexBase and Mutex implement mutex on pthreads-based platforms. They
+// are used in conjunction with class MutexLock:
+//
+//   Mutex mutex;
+//   ...
+//   MutexLock lock(&mutex);  // Acquires the mutex and releases it at the end
+//                            // of the current scope.
+//
+// MutexBase implements behavior for both statically and dynamically
+// allocated mutexes.  Do not use MutexBase directly.  Instead, write
+// the following to define a static mutex:
+//
+//   GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex);
+//
+// You can forward declare a static mutex like this:
+//
+//   GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex);
+//
+// To create a dynamic mutex, just define an object of type Mutex.
+class MutexBase {
+ public:
+  // Acquires this mutex.
+  void Lock() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_));
+    owner_ = pthread_self();
+    has_owner_ = true;
+  }
+
+  // Releases this mutex.
+  void Unlock() {
+    // Since the lock is being released the owner_ field should no longer be
+    // considered valid. We don't protect writing to has_owner_ here, as it's
+    // the caller's responsibility to ensure that the current thread holds the
+    // mutex when this is called.
+    has_owner_ = false;
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_));
+  }
+
+  // Does nothing if the current thread holds the mutex. Otherwise, crashes
+  // with high probability.
+  void AssertHeld() const {
+    GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self()))
+        << "The current thread is not holding the mutex @" << this;
+  }
+
+  // A static mutex may be used before main() is entered.  It may even
+  // be used before the dynamic initialization stage.  Therefore we
+  // must be able to initialize a static mutex object at link time.
+  // This means MutexBase has to be a POD and its member variables
+  // have to be public.
+ public:
+  pthread_mutex_t mutex_;  // The underlying pthread mutex.
+  // has_owner_ indicates whether the owner_ field below contains a valid thread
+  // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All
+  // accesses to the owner_ field should be protected by a check of this field.
+  // An alternative might be to memset() owner_ to all zeros, but there's no
+  // guarantee that a zero'd pthread_t is necessarily invalid or even different
+  // from pthread_self().
+  bool has_owner_;
+  pthread_t owner_;  // The thread holding the mutex.
+};
+
+// Forward-declares a static mutex.
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+    extern ::testing::internal::MutexBase mutex
+
+// Defines and statically (i.e. at link time) initializes a static mutex.
+// The initialization list here does not explicitly initialize each field,
+// instead relying on default initialization for the unspecified fields. In
+// particular, the owner_ field (a pthread_t) is not explicitly initialized.
+// This allows initialization to work whether pthread_t is a scalar or struct.
+// The flag -Wmissing-field-initializers must not be specified for this to work.
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+    ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false }
+
+// The Mutex class can only be used for mutexes created at runtime. It
+// shares its API with MutexBase otherwise.
+class Mutex : public MutexBase {
+ public:
+  Mutex() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+    has_owner_ = false;
+  }
+  ~Mutex() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_));
+  }
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+};
+
+// We cannot name this class MutexLock as the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms.  Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+  explicit GTestMutexLock(MutexBase* mutex)
+      : mutex_(mutex) { mutex_->Lock(); }
+
+  ~GTestMutexLock() { mutex_->Unlock(); }
+
+ private:
+  MutexBase* const mutex_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+};
+
+typedef GTestMutexLock MutexLock;
+
+// Helpers for ThreadLocal.
+
+// pthread_key_create() requires DeleteThreadLocalValue() to have
+// C-linkage.  Therefore it cannot be templatized to access
+// ThreadLocal<T>.  Hence the need for class
+// ThreadLocalValueHolderBase.
+class ThreadLocalValueHolderBase {
+ public:
+  virtual ~ThreadLocalValueHolderBase() {}
+};
+
+// Called by pthread to delete thread-local data stored by
+// pthread_setspecific().
+extern "C" inline void DeleteThreadLocalValue(void* value_holder) {
+  delete static_cast<ThreadLocalValueHolderBase*>(value_holder);
+}
+
+// Implements thread-local storage on pthreads-based systems.
+//
+//   // Thread 1
+//   ThreadLocal<int> tl(100);  // 100 is the default value for each thread.
+//
+//   // Thread 2
+//   tl.set(150);  // Changes the value for thread 2 only.
+//   EXPECT_EQ(150, tl.get());
+//
+//   // Thread 1
+//   EXPECT_EQ(100, tl.get());  // In thread 1, tl has the original value.
+//   tl.set(200);
+//   EXPECT_EQ(200, tl.get());
+//
+// The template type argument T must have a public copy constructor.
+// In addition, the default ThreadLocal constructor requires T to have
+// a public default constructor.
+//
+// An object managed for a thread by a ThreadLocal instance is deleted
+// when the thread exits.  Or, if the ThreadLocal instance dies in
+// that thread, when the ThreadLocal dies.  It's the user's
+// responsibility to ensure that all other threads using a ThreadLocal
+// have exited when it dies, or the per-thread objects for those
+// threads will not be deleted.
+//
+// Google Test only uses global ThreadLocal objects.  That means they
+// will die after main() has returned.  Therefore, no per-thread
+// object managed by Google Test will be leaked as long as all threads
+// using Google Test have exited when main() returns.
+template <typename T>
+class ThreadLocal {
+ public:
+  ThreadLocal() : key_(CreateKey()),
+                  default_() {}
+  explicit ThreadLocal(const T& value) : key_(CreateKey()),
+                                         default_(value) {}
+
+  ~ThreadLocal() {
+    // Destroys the managed object for the current thread, if any.
+    DeleteThreadLocalValue(pthread_getspecific(key_));
+
+    // Releases resources associated with the key.  This will *not*
+    // delete managed objects for other threads.
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_));
+  }
+
+  T* pointer() { return GetOrCreateValue(); }
+  const T* pointer() const { return GetOrCreateValue(); }
+  const T& get() const { return *pointer(); }
+  void set(const T& value) { *pointer() = value; }
+
+ private:
+  // Holds a value of type T.
+  class ValueHolder : public ThreadLocalValueHolderBase {
+   public:
+    explicit ValueHolder(const T& value) : value_(value) {}
+
+    T* pointer() { return &value_; }
+
+   private:
+    T value_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+  };
+
+  static pthread_key_t CreateKey() {
+    pthread_key_t key;
+    // When a thread exits, DeleteThreadLocalValue() will be called on
+    // the object managed for that thread.
+    GTEST_CHECK_POSIX_SUCCESS_(
+        pthread_key_create(&key, &DeleteThreadLocalValue));
+    return key;
+  }
+
+  T* GetOrCreateValue() const {
+    ThreadLocalValueHolderBase* const holder =
+        static_cast<ThreadLocalValueHolderBase*>(pthread_getspecific(key_));
+    if (holder != NULL) {
+      return CheckedDowncastToActualType<ValueHolder>(holder)->pointer();
+    }
+
+    ValueHolder* const new_holder = new ValueHolder(default_);
+    ThreadLocalValueHolderBase* const holder_base = new_holder;
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base));
+    return new_holder->pointer();
+  }
+
+  // A key pthreads uses for looking up per-thread values.
+  const pthread_key_t key_;
+  const T default_;  // The default value for each thread.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+};
+
+# define GTEST_IS_THREADSAFE 1
+
+#else  // GTEST_HAS_PTHREAD
+
+// A dummy implementation of synchronization primitives (mutex, lock,
+// and thread-local variable).  Necessary for compiling Google Test where
+// mutex is not supported - using Google Test in multiple threads is not
+// supported on such platforms.
+
+class Mutex {
+ public:
+  Mutex() {}
+  void Lock() {}
+  void Unlock() {}
+  void AssertHeld() const {}
+};
+
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+  extern ::testing::internal::Mutex mutex
+
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
+
+class GTestMutexLock {
+ public:
+  explicit GTestMutexLock(Mutex*) {}  // NOLINT
+};
+
+typedef GTestMutexLock MutexLock;
+
+template <typename T>
+class ThreadLocal {
+ public:
+  ThreadLocal() : value_() {}
+  explicit ThreadLocal(const T& value) : value_(value) {}
+  T* pointer() { return &value_; }
+  const T* pointer() const { return &value_; }
+  const T& get() const { return value_; }
+  void set(const T& value) { value_ = value; }
+ private:
+  T value_;
+};
+
+// The above synchronization primitives have dummy implementations.
+// Therefore Google Test is not thread-safe.
+# define GTEST_IS_THREADSAFE 0
+
+#endif  // GTEST_HAS_PTHREAD
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+GTEST_API_ size_t GetThreadCount();
+
+// Passing non-POD classes through ellipsis (...) crashes the ARM
+// compiler and generates a warning in Sun Studio.  The Nokia Symbian
+// and the IBM XL C/C++ compiler try to instantiate a copy constructor
+// for objects passed through ellipsis (...), failing for uncopyable
+// objects.  We define this to ensure that only POD is passed through
+// ellipsis on these systems.
+#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC)
+// We lose support for NULL detection where the compiler doesn't like
+// passing non-POD classes through ellipsis (...).
+# define GTEST_ELLIPSIS_NEEDS_POD_ 1
+#else
+# define GTEST_CAN_COMPARE_NULL 1
+#endif
+
+// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between
+// const T& and const T* in a function template.  These compilers
+// _can_ decide between class template specializations for T and T*,
+// so a tr1::type_traits-like is_pointer works.
+#if defined(__SYMBIAN32__) || defined(__IBMCPP__)
+# define GTEST_NEEDS_IS_POINTER_ 1
+#endif
+
+template <bool bool_value>
+struct bool_constant {
+  typedef bool_constant<bool_value> type;
+  static const bool value = bool_value;
+};
+template <bool bool_value> const bool bool_constant<bool_value>::value;
+
+typedef bool_constant<false> false_type;
+typedef bool_constant<true> true_type;
+
+template <typename T>
+struct is_pointer : public false_type {};
+
+template <typename T>
+struct is_pointer<T*> : public true_type {};
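+
+// For example (illustrative): is_pointer<int*>::value is true and
+// is_pointer<int>::value is false.  The inherited 'type' typedef
+// (true_type or false_type) supports tag dispatch, e.g.
+//
+//   Helper(typename is_pointer<T>::type(), value);  // picks an overload
+//
+// which is how Message::StreamHelper below selects its pointer overload
+// on Symbian.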
+
+template <typename Iterator>
+struct IteratorTraits {
+  typedef typename Iterator::value_type value_type;
+};
+
+template <typename T>
+struct IteratorTraits<T*> {
+  typedef T value_type;
+};
+
+template <typename T>
+struct IteratorTraits<const T*> {
+  typedef T value_type;
+};
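+
+// For instance (illustrative), IteratorTraits<int*>::value_type and
+// IteratorTraits<std::vector<int>::iterator>::value_type are both int.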
+
+#if GTEST_OS_WINDOWS
+# define GTEST_PATH_SEP_ "\\"
+# define GTEST_HAS_ALT_PATH_SEP_ 1
+// The biggest signed integer type the compiler supports.
+typedef __int64 BiggestInt;
+#else
+# define GTEST_PATH_SEP_ "/"
+# define GTEST_HAS_ALT_PATH_SEP_ 0
+typedef long long BiggestInt;  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+
+// Utilities for char.
+
+// isspace(int ch) and friends accept an unsigned char or EOF.  char
+// may be signed, depending on the compiler (or compiler flags).
+// Therefore we need to cast a char to unsigned char before calling
+// isspace(), etc.
+
+inline bool IsAlpha(char ch) {
+  return isalpha(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsAlNum(char ch) {
+  return isalnum(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsDigit(char ch) {
+  return isdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsLower(char ch) {
+  return islower(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsSpace(char ch) {
+  return isspace(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsUpper(char ch) {
+  return isupper(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(char ch) {
+  return isxdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(wchar_t ch) {
+  const unsigned char low_byte = static_cast<unsigned char>(ch);
+  return ch == low_byte && isxdigit(low_byte) != 0;
+}
+
+inline char ToLower(char ch) {
+  return static_cast<char>(tolower(static_cast<unsigned char>(ch)));
+}
+inline char ToUpper(char ch) {
+  return static_cast<char>(toupper(static_cast<unsigned char>(ch)));
+}
+
+// The testing::internal::posix namespace holds wrappers for common
+// POSIX functions.  These wrappers hide the differences between
+// Windows/MSVC and POSIX systems.  Since some compilers define these
+// standard functions as macros, the wrapper cannot have the same name
+// as the wrapped function.
+
+namespace posix {
+
+// Functions with a different name on Windows.
+
+#if GTEST_OS_WINDOWS
+
+typedef struct _stat StatStruct;
+
+# ifdef __BORLANDC__
+inline int IsATTY(int fd) { return isatty(fd); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+# else  // !__BORLANDC__
+#  if GTEST_OS_WINDOWS_MOBILE
+inline int IsATTY(int /* fd */) { return 0; }
+#  else
+inline int IsATTY(int fd) { return _isatty(fd); }
+#  endif  // GTEST_OS_WINDOWS_MOBILE
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return _stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return _strdup(src); }
+# endif  // __BORLANDC__
+
+# if GTEST_OS_WINDOWS_MOBILE
+inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
+// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
+// time and thus not defined there.
+# else
+inline int FileNo(FILE* file) { return _fileno(file); }
+inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
+inline int RmDir(const char* dir) { return _rmdir(dir); }
+inline bool IsDir(const StatStruct& st) {
+  return (_S_IFDIR & st.st_mode) != 0;
+}
+# endif  // GTEST_OS_WINDOWS_MOBILE
+
+#else
+
+typedef struct stat StatStruct;
+
+inline int FileNo(FILE* file) { return fileno(file); }
+inline int IsATTY(int fd) { return isatty(fd); }
+inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return strcasecmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+inline int RmDir(const char* dir) { return rmdir(dir); }
+inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
+
+#endif  // GTEST_OS_WINDOWS
+
+// Functions deprecated by MSVC 8.0.
+
+#ifdef _MSC_VER
+// Temporarily disable warning 4996 (deprecated function).
+# pragma warning(push)
+# pragma warning(disable:4996)
+#endif
+
+inline const char* StrNCpy(char* dest, const char* src, size_t n) {
+  return strncpy(dest, src, n);
+}
+
+// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and
+// StrError() aren't needed on Windows CE at this time and thus not
+// defined there.
+
+#if !GTEST_OS_WINDOWS_MOBILE
+inline int ChDir(const char* dir) { return chdir(dir); }
+#endif
+inline FILE* FOpen(const char* path, const char* mode) {
+  return fopen(path, mode);
+}
+#if !GTEST_OS_WINDOWS_MOBILE
+inline FILE *FReopen(const char* path, const char* mode, FILE* stream) {
+  return freopen(path, mode, stream);
+}
+inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); }
+#endif
+inline int FClose(FILE* fp) { return fclose(fp); }
+#if !GTEST_OS_WINDOWS_MOBILE
+inline int Read(int fd, void* buf, unsigned int count) {
+  return static_cast<int>(read(fd, buf, count));
+}
+inline int Write(int fd, const void* buf, unsigned int count) {
+  return static_cast<int>(write(fd, buf, count));
+}
+inline int Close(int fd) { return close(fd); }
+inline const char* StrError(int errnum) { return strerror(errnum); }
+#endif
+inline const char* GetEnv(const char* name) {
+#if GTEST_OS_WINDOWS_MOBILE
+  // We are on Windows CE, which has no environment variables.
+  return NULL;
+#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
+  // Environment variables which we programmatically clear will be set to the
+  // empty string rather than unset (NULL).  Handle that case.
+  const char* const env = getenv(name);
+  return (env != NULL && env[0] != '\0') ? env : NULL;
+#else
+  return getenv(name);
+#endif
+}
+
+#ifdef _MSC_VER
+# pragma warning(pop)  // Restores the warning state.
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Windows CE has no C library. The abort() function is used in
+// several places in Google Test. This implementation provides a reasonable
+// imitation of standard behaviour.
+void Abort();
+#else
+inline void Abort() { abort(); }
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+}  // namespace posix
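+
+// Illustrative uses of the wrappers above: posix::StrCaseCmp("FOO", "foo")
+// returns 0 on every supported platform, posix::FileNo(stdout) returns the
+// descriptor of stdout, and posix::GetEnv("SOME_UNSET_VARIABLE") returns
+// NULL rather than an empty string.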
+
+// MSVC "deprecates" snprintf and issues warnings wherever it is used.  In
+// order to avoid these warnings, we need to use _snprintf or _snprintf_s on
+// MSVC-based platforms.  We map the GTEST_SNPRINTF_ macro to the appropriate
+// function in order to achieve that.  We use macro definition here because
+// snprintf is a variadic function.
+#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+// MSVC 2005 and above support variadic macros.
+# define GTEST_SNPRINTF_(buffer, size, format, ...) \
+     _snprintf_s(buffer, size, size, format, __VA_ARGS__)
+#elif defined(_MSC_VER)
+// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't
+// complain about _snprintf.
+# define GTEST_SNPRINTF_ _snprintf
+#else
+# define GTEST_SNPRINTF_ snprintf
+#endif
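+
+// For example (illustrative):
+//
+//   char buffer[64];
+//   GTEST_SNPRINTF_(buffer, sizeof(buffer), "%d != %d", 1, 2);
+//
+// expands to snprintf, _snprintf, or _snprintf_s depending on the platform.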
+
+// The maximum number a BiggestInt can represent.  This definition works
+// no matter whether BiggestInt is represented in one's complement or
+// two's complement.
+//
+// We cannot rely on numeric_limits in STL, as __int64 and long long
+// are not part of standard C++ and numeric_limits doesn't need to be
+// defined for them.
+const BiggestInt kMaxBiggestInt =
+    ~(static_cast<BiggestInt>(1) << (8*sizeof(BiggestInt) - 1));
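+// For a 64-bit BiggestInt this evaluates to 0x7FFFFFFFFFFFFFFF
+// (9223372036854775807), the largest positive value in either
+// representation.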
+
+// This template class serves as a compile-time function from size to
+// type.  It maps a size in bytes to a primitive type with that
+// size. e.g.
+//
+//   TypeWithSize<4>::UInt
+//
+// is typedef-ed to be unsigned int (unsigned integer made up of 4
+// bytes).
+//
+// Such functionality should belong to STL, but I cannot find it
+// there.
+//
+// Google Test uses this class in the implementation of floating-point
+// comparison.
+//
+// For now it only handles UInt (unsigned int) as that's all Google Test
+// needs.  Other types can be easily added in the future if need
+// arises.
+template <size_t size>
+class TypeWithSize {
+ public:
+  // This prevents the user from using TypeWithSize<N> with incorrect
+  // values of N.
+  typedef void UInt;
+};
+
+// The specialization for size 4.
+template <>
+class TypeWithSize<4> {
+ public:
+  // unsigned int has size 4 in both gcc and MSVC.
+  //
+  // As base/basictypes.h doesn't compile on Windows, we cannot use
+  // uint32, uint64, etc. here.
+  typedef int Int;
+  typedef unsigned int UInt;
+};
+
+// The specialization for size 8.
+template <>
+class TypeWithSize<8> {
+ public:
+#if GTEST_OS_WINDOWS
+  typedef __int64 Int;
+  typedef unsigned __int64 UInt;
+#else
+  typedef long long Int;  // NOLINT
+  typedef unsigned long long UInt;  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+};
+
+// Integer types of known sizes.
+typedef TypeWithSize<4>::Int Int32;
+typedef TypeWithSize<4>::UInt UInt32;
+typedef TypeWithSize<8>::Int Int64;
+typedef TypeWithSize<8>::UInt UInt64;
+typedef TypeWithSize<8>::Int TimeInMillis;  // Represents time in milliseconds.
+
+// Utilities for command line flags and environment variables.
+
+// Macro for referencing flags.
+#define GTEST_FLAG(name) FLAGS_gtest_##name
+
+// Macros for declaring flags.
+#define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
+#define GTEST_DECLARE_int32_(name) \
+    GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name)
+#define GTEST_DECLARE_string_(name) \
+    GTEST_API_ extern ::std::string GTEST_FLAG(name)
+
+// Macros for defining flags.
+#define GTEST_DEFINE_bool_(name, default_val, doc) \
+    GTEST_API_ bool GTEST_FLAG(name) = (default_val)
+#define GTEST_DEFINE_int32_(name, default_val, doc) \
+    GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val)
+#define GTEST_DEFINE_string_(name, default_val, doc) \
+    GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
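+
+// For example (illustrative), GTEST_DECLARE_bool_(foo) expands to
+// 'GTEST_API_ extern bool FLAGS_gtest_foo', and the flag is then read or
+// written through the GTEST_FLAG macro:
+//
+//   GTEST_DEFINE_bool_(foo, false, "Enables foo.");
+//   ...
+//   if (GTEST_FLAG(foo)) { ... }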
+
+// Thread annotations
+#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+#define GTEST_LOCK_EXCLUDED_(locks)
+
+// Parses 'str' for a 32-bit signed integer.  If successful, writes the result
+// to *value and returns true; otherwise leaves *value unchanged and returns
+// false.
+// TODO(chandlerc): Find a better way to refactor flag and environment parsing
+// out of both gtest-port.cc and gtest.cc to avoid exporting this utility
+// function.
+bool ParseInt32(const Message& src_text, const char* str, Int32* value);
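+
+// For instance (an illustrative sketch of how ParseInt32 is called):
+//
+//   Int32 value = 0;
+//   if (ParseInt32(Message() << "The element count", "123", &value))
+//     ...  // value is now 123.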
+
+// Parses a bool/Int32/string from the environment variable
+// corresponding to the given Google Test flag.
+bool BoolFromGTestEnv(const char* flag, bool default_val);
+GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val);
+const char* StringFromGTestEnv(const char* flag, const char* default_val);
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+
+#if GTEST_OS_LINUX
+# include <stdlib.h>
+# include <sys/types.h>
+# include <sys/wait.h>
+# include <unistd.h>
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#include <ctype.h>
+#include <float.h>
+#include <string.h>
+#include <iomanip>
+#include <limits>
+#include <set>
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the Message class.
+//
+// IMPORTANT NOTE: Due to a limitation of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+//   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
+// program!
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+
+#include <limits>
+
+
+// Ensures that there is at least one operator<< in the global namespace.
+// See Message& operator<<(...) below for why.
+void operator<<(const testing::internal::Secret&, int);
+
+namespace testing {
+
+// The Message class works like an ostream repeater.
+//
+// Typical usage:
+//
+//   1. You stream a bunch of values to a Message object.
+//      It will remember the text in a stringstream.
+//   2. Then you stream the Message object to an ostream.
+//      This causes the text in the Message to be streamed
+//      to the ostream.
+//
+// For example,
+//
+//   testing::Message foo;
+//   foo << 1 << " != " << 2;
+//   std::cout << foo;
+//
+// will print "1 != 2".
+//
+// Message is not intended to be inherited from.  In particular, its
+// destructor is not virtual.
+//
+// Note that stringstream behaves differently in gcc and in MSVC.  You
+// can stream a NULL char pointer to it in the former, but not in the
+// latter (it causes an access violation if you do).  The Message
+// class hides this difference by treating a NULL char pointer as
+// "(null)".
+class GTEST_API_ Message {
+ private:
+  // The type of basic IO manipulators (endl, ends, and flush) for
+  // narrow streams.
+  typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&);
+
+ public:
+  // Constructs an empty Message.
+  Message();
+
+  // Copy constructor.
+  Message(const Message& msg) : ss_(new ::std::stringstream) {  // NOLINT
+    *ss_ << msg.GetString();
+  }
+
+  // Constructs a Message from a C-string.
+  explicit Message(const char* str) : ss_(new ::std::stringstream) {
+    *ss_ << str;
+  }
+
+#if GTEST_OS_SYMBIAN
+  // Streams a value (either a pointer or not) to this object.
+  template <typename T>
+  inline Message& operator <<(const T& value) {
+    StreamHelper(typename internal::is_pointer<T>::type(), value);
+    return *this;
+  }
+#else
+  // Streams a non-pointer value to this object.
+  template <typename T>
+  inline Message& operator <<(const T& val) {
+    // Some libraries overload << for STL containers.  These
+    // overloads are defined in the global namespace instead of ::std.
+    //
+    // C++'s symbol lookup rule (i.e. Koenig lookup) says that these
+    // overloads are visible in either the std namespace or the global
+    // namespace, but not other namespaces, including the testing
+    // namespace which Google Test's Message class is in.
+    //
+    // To allow STL containers (and other types that have a << operator
+    // defined in the global namespace) to be used in Google Test
+    // assertions, testing::Message must access the custom << operator
+    // from the global namespace.  With this using declaration,
+    // overloads of << defined in the global namespace and those
+    // visible via Koenig lookup are both exposed in this function.
+    using ::operator <<;
+    *ss_ << val;
+    return *this;
+  }
+
+  // Streams a pointer value to this object.
+  //
+  // This function is an overload of the previous one.  When you
+  // stream a pointer to a Message, this definition will be used as it
+  // is more specialized.  (The C++ Standard, section
+  // [temp.func.order].)  If you stream a non-pointer, then the
+  // previous definition will be used.
+  //
+  // The reason for this overload is that streaming a NULL pointer to
+  // ostream is undefined behavior.  Depending on the compiler, you
+  // may get "0", "(nil)", "(null)", or an access violation.  To
+  // ensure consistent result across compilers, we always treat NULL
+  // as "(null)".
+  template <typename T>
+  inline Message& operator <<(T* const& pointer) {  // NOLINT
+    if (pointer == NULL) {
+      *ss_ << "(null)";
+    } else {
+      *ss_ << pointer;
+    }
+    return *this;
+  }
+#endif  // GTEST_OS_SYMBIAN
+
+  // Since the basic IO manipulators are overloaded for both narrow
+  // and wide streams, we have to provide this specialized definition
+  // of operator <<, even though its body is the same as the
+  // templatized version above.  Without this definition, streaming
+  // endl or other basic IO manipulators to Message will confuse the
+  // compiler.
+  Message& operator <<(BasicNarrowIoManip val) {
+    *ss_ << val;
+    return *this;
+  }
+
+  // Instead of 1/0, we want to see true/false for bool values.
+  Message& operator <<(bool b) {
+    return *this << (b ? "true" : "false");
+  }
+
+  // These two overloads allow streaming a wide C string to a Message
+  // using the UTF-8 encoding.
+  Message& operator <<(const wchar_t* wide_c_str);
+  Message& operator <<(wchar_t* wide_c_str);
+
+#if GTEST_HAS_STD_WSTRING
+  // Converts the given wide string to a narrow string using the UTF-8
+  // encoding, and streams the result to this Message object.
+  Message& operator <<(const ::std::wstring& wstr);
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+  // Converts the given wide string to a narrow string using the UTF-8
+  // encoding, and streams the result to this Message object.
+  Message& operator <<(const ::wstring& wstr);
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+  // Gets the text streamed to this object so far as an std::string.
+  // Each '\0' character in the buffer is replaced with "\\0".
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  std::string GetString() const;
+
+ private:
+
+#if GTEST_OS_SYMBIAN
+  // These are needed as the Nokia Symbian Compiler cannot decide between
+  // const T& and const T* in a function template. The Nokia compiler _can_
+  // decide between class template specializations for T and T*, so a
+  // tr1::type_traits-like is_pointer works, and we can overload on that.
+  template <typename T>
+  inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) {
+    if (pointer == NULL) {
+      *ss_ << "(null)";
+    } else {
+      *ss_ << pointer;
+    }
+  }
+  template <typename T>
+  inline void StreamHelper(internal::false_type /*is_pointer*/,
+                           const T& value) {
+    // See the comments in Message& operator <<(const T&) above for why
+    // we need this using statement.
+    using ::operator <<;
+    *ss_ << value;
+  }
+#endif  // GTEST_OS_SYMBIAN
+
+  // We'll hold the text streamed to this object here.
+  const internal::scoped_ptr< ::std::stringstream> ss_;
+
+  // We declare (but don't implement) this to prevent the compiler
+  // from implementing the assignment operator.
+  void operator=(const Message&);
+};
+
+// Streams a Message to an ostream.
+inline std::ostream& operator <<(std::ostream& os, const Message& sb) {
+  return os << sb.GetString();
+}
+
+namespace internal {
+
+// Converts a streamable value to an std::string.  A NULL pointer is
+// converted to "(null)".  When the input value is a ::string,
+// ::std::string, ::wstring, or ::std::wstring object, each NUL
+// character in it is replaced with "\\0".
+template <typename T>
+std::string StreamableToString(const T& streamable) {
+  return (Message() << streamable).GetString();
+}
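+
+// For example (illustrative), StreamableToString(42) yields "42", and
+// streaming a NULL char pointer yields "(null)" as described above.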
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file declares the String class and functions used internally by
+// Google Test.  They are subject to change without notice.  They should not
+// be used by code external to Google Test.
+//
+// This header file is #included by <gtest/internal/gtest-internal.h>.
+// It should not be #included by other files.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+
+#ifdef __BORLANDC__
+// string.h is not guaranteed to provide strcpy on C++ Builder.
+# include <mem.h>
+#endif
+
+#include <string.h>
+#include <string>
+
+
+namespace testing {
+namespace internal {
+
+// String - an abstract class holding static string utilities.
+class GTEST_API_ String {
+ public:
+  // Static utility methods
+
+  // Clones a 0-terminated C string, allocating memory using new.  The
+  // caller is responsible for deleting the return value using
+  // delete[].  Returns the cloned string, or NULL if the input is
+  // NULL.
+  //
+  // This is different from strdup() in string.h, which allocates
+  // memory using malloc().
+  static const char* CloneCString(const char* c_str);
+
+#if GTEST_OS_WINDOWS_MOBILE
+  // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be
+  // able to pass strings to Win32 APIs on CE we need to convert them
+  // to 'Unicode', UTF-16.
+
+  // Creates a UTF-16 wide string from the given ANSI string, allocating
+  // memory using new. The caller is responsible for deleting the return
+  // value using delete[]. Returns the wide string, or NULL if the
+  // input is NULL.
+  //
+  // The wide string is created using the ANSI codepage (CP_ACP) to
+  // match the behaviour of the ANSI versions of Win32 calls and the
+  // C runtime.
+  static LPCWSTR AnsiToUtf16(const char* c_str);
+
+  // Creates an ANSI string from the given wide string, allocating
+  // memory using new. The caller is responsible for deleting the return
+  // value using delete[]. Returns the ANSI string, or NULL if the
+  // input is NULL.
+  //
+  // The returned string is created using the ANSI codepage (CP_ACP) to
+  // match the behaviour of the ANSI versions of Win32 calls and the
+  // C runtime.
+  static const char* Utf16ToAnsi(LPCWSTR utf16_str);
+#endif
+
+  // Compares two C strings.  Returns true iff they have the same content.
+  //
+  // Unlike strcmp(), this function can handle NULL argument(s).  A
+  // NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool CStringEquals(const char* lhs, const char* rhs);
+
+  // Converts a wide C string to a String using the UTF-8 encoding.
+  // NULL will be converted to "(null)".  If an error occurred during
+  // the conversion, "(failed to convert from wide string)" is
+  // returned.
+  static std::string ShowWideCString(const wchar_t* wide_c_str);
+
+  // Compares two wide C strings.  Returns true iff they have the same
+  // content.
+  //
+  // Unlike wcscmp(), this function can handle NULL argument(s).  A
+  // NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs);
+
+  // Compares two C strings, ignoring case.  Returns true iff they
+  // have the same content.
+  //
+  // Unlike strcasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool CaseInsensitiveCStringEquals(const char* lhs,
+                                           const char* rhs);
+
+  // Compares two wide C strings, ignoring case.  Returns true iff they
+  // have the same content.
+  //
+  // Unlike wcscasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL wide C string,
+  // including the empty string.
+  // NB: The implementations on different platforms slightly differ.
+  // On windows, this method uses _wcsicmp which compares according to LC_CTYPE
+  // environment variable. On GNU platform this method uses wcscasecmp
+  // which compares according to LC_CTYPE category of the current locale.
+  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+  // current locale.
+  static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                               const wchar_t* rhs);
+
+  // Returns true iff the given string ends with the given suffix, ignoring
+  // case. Any string is considered to end with an empty suffix.
+  static bool EndsWithCaseInsensitive(
+      const std::string& str, const std::string& suffix);
+
+  // Formats an int value as "%02d".
+  static std::string FormatIntWidth2(int value);  // "%02d" for width == 2
+
+  // Formats an int value as "%X".
+  static std::string FormatHexInt(int value);
+
+  // Formats a byte as "%02X".
+  static std::string FormatByte(unsigned char value);
+
+ private:
+  String();  // Not meant to be instantiated.
+};  // class String
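+
+// For example (illustrative): String::CStringEquals("abc", "abc") is true,
+// String::CStringEquals(NULL, "") is false, and
+// String::CaseInsensitiveCStringEquals("ABC", "abc") is true.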
+
+// Gets the content of the stringstream's buffer as an std::string.  Each '\0'
+// character in the buffer is replaced with "\\0".
+GTEST_API_ std::string StringStreamToString(::std::stringstream* stream);
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: keith.ray@gmail.com (Keith Ray)
+//
+// Google Test filepath utilities
+//
+// This header file declares classes and functions used internally by
+// Google Test.  They are subject to change without notice.
+//
+// This file is #included in <gtest/internal/gtest-internal.h>.
+// Do not include this header file separately!
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+
+
+namespace testing {
+namespace internal {
+
+// FilePath - a class for file and directory pathname manipulation which
+// handles platform-specific conventions (like the pathname separator).
+// Used for helper functions for naming files in a directory for xml output.
+// Except for Set methods, all methods are const or static, which provides an
+// "immutable value object" -- useful for peace of mind.
+// A FilePath with a value ending in a path separator ("like/this/") represents
+// a directory, otherwise it is assumed to represent a file. In either case,
+// it may or may not represent an actual file or directory in the file system.
+// Names are NOT checked for syntax correctness -- no checking for illegal
+// characters, malformed paths, etc.
+
+class GTEST_API_ FilePath {
+ public:
+  FilePath() : pathname_("") { }
+  FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { }
+
+  explicit FilePath(const std::string& pathname) : pathname_(pathname) {
+    Normalize();
+  }
+
+  FilePath& operator=(const FilePath& rhs) {
+    Set(rhs);
+    return *this;
+  }
+
+  void Set(const FilePath& rhs) {
+    pathname_ = rhs.pathname_;
+  }
+
+  const std::string& string() const { return pathname_; }
+  const char* c_str() const { return pathname_.c_str(); }
+
+  // Returns the current working directory, or "" if unsuccessful.
+  static FilePath GetCurrentDir();
+
+  // Given directory = "dir", base_name = "test", number = 0,
+  // extension = "xml", returns "dir/test.xml". If number is greater
+  // than zero (e.g., 12), returns "dir/test_12.xml".
+  // On Windows platform, uses \ as the separator rather than /.
+  static FilePath MakeFileName(const FilePath& directory,
+                               const FilePath& base_name,
+                               int number,
+                               const char* extension);
+
+  // Given directory = "dir", relative_path = "test.xml",
+  // returns "dir/test.xml".
+  // On Windows, uses \ as the separator rather than /.
+  static FilePath ConcatPaths(const FilePath& directory,
+                              const FilePath& relative_path);
+
+  // Returns a pathname for a file that does not currently exist. The pathname
+  // will be directory/base_name.extension or
+  // directory/base_name_<number>.extension if directory/base_name.extension
+  // already exists. The number will be incremented until a pathname is found
+  // that does not already exist.
+  // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+  // There could be a race condition if two or more processes are calling this
+  // function at the same time -- they could both pick the same filename.
+  static FilePath GenerateUniqueFileName(const FilePath& directory,
+                                         const FilePath& base_name,
+                                         const char* extension);
+
+  // Returns true iff the path is "".
+  bool IsEmpty() const { return pathname_.empty(); }
+
+  // If input name has a trailing separator character, removes it and returns
+  // the name, otherwise returns the name string unmodified.
+  // On Windows platform, uses \ as the separator, other platforms use /.
+  FilePath RemoveTrailingPathSeparator() const;
+
+  // Returns a copy of the FilePath with the directory part removed.
+  // Example: FilePath("path/to/file").RemoveDirectoryName() returns
+  // FilePath("file"). If there is no directory part ("just_a_file"), it returns
+  // the FilePath unmodified. If there is no file part ("just_a_dir/") it
+  // returns an empty FilePath ("").
+  // On Windows platform, '\' is the path separator, otherwise it is '/'.
+  FilePath RemoveDirectoryName() const;
+
+  // RemoveFileName returns the directory path with the filename removed.
+  // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+  // If the FilePath is "a_file" or "/a_file", RemoveFileName returns
+  // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does
+  // not have a file, like "just/a/dir/", it returns the FilePath unmodified.
+  // On Windows platform, '\' is the path separator, otherwise it is '/'.
+  FilePath RemoveFileName() const;
+
+  // Returns a copy of the FilePath with the case-insensitive extension removed.
+  // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
+  // FilePath("dir/file"). If a case-insensitive extension is not
+  // found, returns a copy of the original FilePath.
+  FilePath RemoveExtension(const char* extension) const;
+
+  // Creates directories so that path exists. Returns true if successful or if
+  // the directories already exist; returns false if unable to create
+  // directories for any reason. Will also return false if the FilePath does
+  // not represent a directory (that is, it doesn't end with a path separator).
+  bool CreateDirectoriesRecursively() const;
+
+  // Create the directory so that path exists. Returns true if successful or
+  // if the directory already exists; returns false if unable to create the
+  // directory for any reason, including if the parent directory does not
+  // exist. Not named "CreateDirectory" because that's a macro on Windows.
+  bool CreateFolder() const;
+
+  // Returns true if FilePath describes something in the file-system,
+  // either a file, directory, or whatever, and that something exists.
+  bool FileOrDirectoryExists() const;
+
+  // Returns true if pathname describes a directory in the file-system
+  // that exists.
+  bool DirectoryExists() const;
+
+  // Returns true if FilePath ends with a path separator, which indicates that
+  // it is intended to represent a directory. Returns false otherwise.
+  // This does NOT check that a directory (or file) actually exists.
+  bool IsDirectory() const;
+
+  // Returns true if pathname describes a root directory. (Windows has one
+  // root directory per disk drive.)
+  bool IsRootDirectory() const;
+
+  // Returns true if pathname describes an absolute path.
+  bool IsAbsolutePath() const;
+
+ private:
+  // Replaces multiple consecutive separators with a single separator.
+  // For example, "bar///foo" becomes "bar/foo". Does not eliminate other
+  // redundancies that might be in a pathname involving "." or "..".
+  //
+  // A pathname with multiple consecutive separators may occur either through
+  // user error or as a result of some scripts or APIs that generate a pathname
+  // with a trailing separator. On other platforms the same API or script
+  // may NOT generate a pathname with a trailing "/". Then elsewhere that
+  // pathname may have another "/" and pathname components added to it,
+  // without checking for the separator already being there.
+  // The script language and operating system may allow paths like "foo//bar"
+  // but some of the functions in FilePath will not handle that correctly. In
+  // particular, RemoveTrailingPathSeparator() only removes one separator, and
+  // it is called in CreateDirectoriesRecursively() assuming that it will change
+  // a pathname from directory syntax (trailing separator) to filename syntax.
+  //
+  // On Windows this method also replaces the alternate path separator '/' with
+  // the primary path separator '\\', so that for example "bar\\/\\foo" becomes
+  // "bar\\foo".
+
+  void Normalize();
+
+  // Returns a pointer to the last occurrence of a valid path separator in
+  // the FilePath. On Windows, for example, both '/' and '\' are valid path
+  // separators. Returns NULL if no path separator was found.
+  const char* FindLastPathSeparator() const;
+
+  std::string pathname_;
+};  // class FilePath
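+
+// For example (an illustrative sketch):
+//
+//   const FilePath path = FilePath::MakeFileName(
+//       FilePath("results"), FilePath("foo_test"), 0, "xml");
+//   // path.string() is "results/foo_test.xml"
+//   // ("results\\foo_test.xml" on Windows).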
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+// This file was GENERATED by command:
+//     pump.py gtest-type-util.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Type utilities needed for implementing typed and type-parameterized
+// tests.  This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
+//
+// Currently we support at most 50 types in a list, and at most 50
+// type-parameterized tests in one type-parameterized test case.
+// Please contact googletestframework@googlegroups.com if you need
+// more.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+
+
+// #ifdef __GNUC__ is too general here.  It is possible to use gcc without using
+// libstdc++ (which is where cxxabi.h comes from).
+# if GTEST_HAS_CXXABI_H_
+#  include <cxxabi.h>
+# elif defined(__HP_aCC)
+#  include <acxx_demangle.h>
+# endif  // GTEST_HAS_CXXABI_H_
+
+namespace testing {
+namespace internal {
+
+// GetTypeName<T>() returns a human-readable name of type T.
+// NB: This function is also used in Google Mock, so don't move it inside of
+// the typed-test-only section below.
+template <typename T>
+std::string GetTypeName() {
+# if GTEST_HAS_RTTI
+
+  const char* const name = typeid(T).name();
+#  if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+  int status = 0;
+  // gcc's implementation of typeid(T).name() mangles the type name,
+  // so we have to demangle it.
+#   if GTEST_HAS_CXXABI_H_
+  using abi::__cxa_demangle;
+#   endif  // GTEST_HAS_CXXABI_H_
+  char* const readable_name = __cxa_demangle(name, 0, 0, &status);
+  const std::string name_str(status == 0 ? readable_name : name);
+  free(readable_name);
+  return name_str;
+#  else
+  return name;
+#  endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
+
+# else
+
+  return "<type>";
+
+# endif  // GTEST_HAS_RTTI
+}
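+
+// For example (illustrative), GetTypeName<int>() typically returns "int"
+// when RTTI is enabled and "<type>" otherwise.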
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// AssertTypeEq<T1, T2>::type is defined iff T1 and T2 are the same
+// type.  This can be used as a compile-time assertion to ensure that
+// two types are equal.
+
+template <typename T1, typename T2>
+struct AssertTypeEq;
+
+template <typename T>
+struct AssertTypeEq<T, T> {
+  typedef bool type;
+};
+
+// A unique type used as the default value for the arguments of class
+// template Types.  This allows us to simulate variadic templates
+// (e.g. Types<int>, Types<int, double>, etc.), which C++ doesn't
+// support directly.
+struct None {};
+
+// The following family of struct and struct templates are used to
+// represent type lists.  In particular, TypesN<T1, T2, ..., TN>
+// represents a type list with N types (T1, T2, ..., and TN) in it.
+// Except for Types0, every struct in the family has two member types:
+// Head for the first type in the list, and Tail for the rest of the
+// list.
+
+// The empty type list.
+struct Types0 {};
+
+// Type lists of length 1, 2, 3, and so on.
+
+template <typename T1>
+struct Types1 {
+  typedef T1 Head;
+  typedef Types0 Tail;
+};
+template <typename T1, typename T2>
+struct Types2 {
+  typedef T1 Head;
+  typedef Types1<T2> Tail;
+};
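+
+// For instance (illustrative), Types2<int, double>::Head is int and
+// Types2<int, double>::Tail is Types1<double>; code that walks a type list
+// recurses on Tail until it reaches Types0.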
+
+template <typename T1, typename T2, typename T3>
+struct Types3 {
+  typedef T1 Head;
+  typedef Types2<T2, T3> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4>
+struct Types4 {
+  typedef T1 Head;
+  typedef Types3<T2, T3, T4> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+struct Types5 {
+  typedef T1 Head;
+  typedef Types4<T2, T3, T4, T5> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+struct Types6 {
+  typedef T1 Head;
+  typedef Types5<T2, T3, T4, T5, T6> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+struct Types7 {
+  typedef T1 Head;
+  typedef Types6<T2, T3, T4, T5, T6, T7> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+struct Types8 {
+  typedef T1 Head;
+  typedef Types7<T2, T3, T4, T5, T6, T7, T8> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+struct Types9 {
+  typedef T1 Head;
+  typedef Types8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+struct Types10 {
+  typedef T1 Head;
+  typedef Types9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+struct Types11 {
+  typedef T1 Head;
+  typedef Types10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+struct Types12 {
+  typedef T1 Head;
+  typedef Types11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+struct Types13 {
+  typedef T1 Head;
+  typedef Types12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+struct Types14 {
+  typedef T1 Head;
+  typedef Types13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+struct Types15 {
+  typedef T1 Head;
+  typedef Types14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+struct Types16 {
+  typedef T1 Head;
+  typedef Types15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+struct Types17 {
+  typedef T1 Head;
+  typedef Types16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+struct Types18 {
+  typedef T1 Head;
+  typedef Types17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+struct Types19 {
+  typedef T1 Head;
+  typedef Types18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+struct Types20 {
+  typedef T1 Head;
+  typedef Types19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+struct Types21 {
+  typedef T1 Head;
+  typedef Types20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+struct Types22 {
+  typedef T1 Head;
+  typedef Types21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+struct Types23 {
+  typedef T1 Head;
+  typedef Types22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+struct Types24 {
+  typedef T1 Head;
+  typedef Types23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+struct Types25 {
+  typedef T1 Head;
+  typedef Types24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+struct Types26 {
+  typedef T1 Head;
+  typedef Types25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+struct Types27 {
+  typedef T1 Head;
+  typedef Types26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+struct Types28 {
+  typedef T1 Head;
+  typedef Types27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+struct Types29 {
+  typedef T1 Head;
+  typedef Types28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+struct Types30 {
+  typedef T1 Head;
+  typedef Types29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+struct Types31 {
+  typedef T1 Head;
+  typedef Types30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+struct Types32 {
+  typedef T1 Head;
+  typedef Types31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+struct Types33 {
+  typedef T1 Head;
+  typedef Types32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+struct Types34 {
+  typedef T1 Head;
+  typedef Types33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+struct Types35 {
+  typedef T1 Head;
+  typedef Types34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+struct Types36 {
+  typedef T1 Head;
+  typedef Types35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+struct Types37 {
+  typedef T1 Head;
+  typedef Types36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+struct Types38 {
+  typedef T1 Head;
+  typedef Types37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+struct Types39 {
+  typedef T1 Head;
+  typedef Types38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+struct Types40 {
+  typedef T1 Head;
+  typedef Types39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+struct Types41 {
+  typedef T1 Head;
+  typedef Types40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+struct Types42 {
+  typedef T1 Head;
+  typedef Types41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+struct Types43 {
+  typedef T1 Head;
+  typedef Types42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+struct Types44 {
+  typedef T1 Head;
+  typedef Types43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+struct Types45 {
+  typedef T1 Head;
+  typedef Types44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+struct Types46 {
+  typedef T1 Head;
+  typedef Types45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+struct Types47 {
+  typedef T1 Head;
+  typedef Types46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+struct Types48 {
+  typedef T1 Head;
+  typedef Types47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47, T48> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+struct Types49 {
+  typedef T1 Head;
+  typedef Types48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47, T48, T49> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+struct Types50 {
+  typedef T1 Head;
+  typedef Types49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47, T48, T49, T50> Tail;
+};
+
+
+}  // namespace internal
+
+// We don't want to require the users to write TypesN<...> directly,
+// as that would require them to count the length.  Types<...> is much
+// easier to write, but generates horrible messages when there is a
+// compiler error, as gcc insists on printing out each template
+// argument, even if it has the default value (this means Types<int>
+// will appear as Types<int, None, None, ..., None> in the compiler
+// errors).
+//
+// Our solution is to combine the best part of the two approaches: a
+// user would write Types<T1, ..., TN>, and Google Test will translate
+// that to TypesN<T1, ..., TN> internally to make error messages
+// readable.  The translation is done by the 'type' member of the
+// Types template.
+template <typename T1 = internal::None, typename T2 = internal::None,
+    typename T3 = internal::None, typename T4 = internal::None,
+    typename T5 = internal::None, typename T6 = internal::None,
+    typename T7 = internal::None, typename T8 = internal::None,
+    typename T9 = internal::None, typename T10 = internal::None,
+    typename T11 = internal::None, typename T12 = internal::None,
+    typename T13 = internal::None, typename T14 = internal::None,
+    typename T15 = internal::None, typename T16 = internal::None,
+    typename T17 = internal::None, typename T18 = internal::None,
+    typename T19 = internal::None, typename T20 = internal::None,
+    typename T21 = internal::None, typename T22 = internal::None,
+    typename T23 = internal::None, typename T24 = internal::None,
+    typename T25 = internal::None, typename T26 = internal::None,
+    typename T27 = internal::None, typename T28 = internal::None,
+    typename T29 = internal::None, typename T30 = internal::None,
+    typename T31 = internal::None, typename T32 = internal::None,
+    typename T33 = internal::None, typename T34 = internal::None,
+    typename T35 = internal::None, typename T36 = internal::None,
+    typename T37 = internal::None, typename T38 = internal::None,
+    typename T39 = internal::None, typename T40 = internal::None,
+    typename T41 = internal::None, typename T42 = internal::None,
+    typename T43 = internal::None, typename T44 = internal::None,
+    typename T45 = internal::None, typename T46 = internal::None,
+    typename T47 = internal::None, typename T48 = internal::None,
+    typename T49 = internal::None, typename T50 = internal::None>
+struct Types {
+  typedef internal::Types50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
+};
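+
+// Example (illustrative): Types<int, double>::type is
+// internal::Types2<int, double>; the trailing internal::None defaults
+// select the matching partial specialization below, so the 'type' member
+// names a list of exactly the user-supplied types.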
+
+template <>
+struct Types<internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types0 type;
+};
+template <typename T1>
+struct Types<T1, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types1<T1> type;
+};
+template <typename T1, typename T2>
+struct Types<T1, T2, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types2<T1, T2> type;
+};
+template <typename T1, typename T2, typename T3>
+struct Types<T1, T2, T3, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types3<T1, T2, T3> type;
+};
+template <typename T1, typename T2, typename T3, typename T4>
+struct Types<T1, T2, T3, T4, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types4<T1, T2, T3, T4> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+struct Types<T1, T2, T3, T4, T5, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types5<T1, T2, T3, T4, T5> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+struct Types<T1, T2, T3, T4, T5, T6, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types6<T1, T2, T3, T4, T5, T6> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+struct Types<T1, T2, T3, T4, T5, T6, T7, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types7<T1, T2, T3, T4, T5, T6, T7> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types8<T1, T2, T3, T4, T5, T6, T7, T8> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, T47, internal::None, internal::None, internal::None> {
+  typedef internal::Types47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, T47, T48, internal::None, internal::None> {
+  typedef internal::Types48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, T47, T48, T49, internal::None> {
+  typedef internal::Types49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49> type;
+};
+
+namespace internal {
+
+# define GTEST_TEMPLATE_ template <typename T> class
+
+// The template "selector" struct TemplateSel<Tmpl> is used to
+// represent Tmpl, which must be a class template with one type
+// parameter, as a type.  TemplateSel<Tmpl>::Bind<T>::type is defined
+// as the type Tmpl<T>.  This allows us to actually instantiate the
+// template "selected" by TemplateSel<Tmpl>.
+//
+// This trick is necessary for simulating typedef for class templates,
+// which C++ doesn't support directly.
+template <GTEST_TEMPLATE_ Tmpl>
+struct TemplateSel {
+  template <typename T>
+  struct Bind {
+    typedef Tmpl<T> type;
+  };
+};
+
+# define GTEST_BIND_(TmplSel, T) \
+  TmplSel::template Bind<T>::type
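+
+// For illustration: given a hypothetical one-parameter class template
+//
+//   template <typename T> class MyFixture {};
+//
+// TemplateSel<MyFixture> is an ordinary type, and
+// GTEST_BIND_(TemplateSel<MyFixture>, int) expands to
+// TemplateSel<MyFixture>::template Bind<int>::type, i.e. to MyFixture<int>.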
+
+// A unique struct template used as the default value for the
+// arguments of class template Templates.  This allows us to simulate
+// variadic templates (e.g. Templates<Tmpl1>, Templates<Tmpl1, Tmpl2>, and
+// so on, where the arguments are class templates), which C++ doesn't
+// support directly.
+template <typename T>
+struct NoneT {};
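+
+// For illustration: NoneT plays the same role for template lists that
+// internal::None plays for the Types type lists earlier in this header.
+// Arguments that are not supplied default to NoneT, and partial
+// specializations analogous to the Types ones above strip those trailing
+// defaults, so a list written with, say, two real class templates
+// ultimately resolves to Templates2 below.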
+
+// The following family of struct and struct templates is used to
+// represent template lists.  In particular, TemplatesN<T1, T2, ...,
+// TN> represents a list of N templates (T1, T2, ..., and TN).  Except
+// for Templates0, every struct in the family has two member types:
+// Head for the selector of the first template in the list, and Tail
+// for the rest of the list.
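+//
+// For illustration (with hypothetical class templates A, B, and C):
+// Templates3<A, B, C> has Head = TemplateSel<A> and Tail = Templates2<B, C>;
+// walking Head/Tail recursively visits TemplateSel<B> and TemplateSel<C>
+// and ends at the empty list Templates0.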
+
+// The empty template list.
+struct Templates0 {};
+
+// Template lists of length 1, 2, 3, and so on.
+
+template <GTEST_TEMPLATE_ T1>
+struct Templates1 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates0 Tail;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
+struct Templates2 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates1<T2> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
+struct Templates3 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates2<T2, T3> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4>
+struct Templates4 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates3<T2, T3, T4> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
+struct Templates5 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates4<T2, T3, T4, T5> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
+struct Templates6 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates5<T2, T3, T4, T5, T6> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7>
+struct Templates7 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates6<T2, T3, T4, T5, T6, T7> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
+struct Templates8 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates7<T2, T3, T4, T5, T6, T7, T8> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
+struct Templates9 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10>
+struct Templates10 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
+struct Templates11 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
+struct Templates12 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13>
+struct Templates13 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
+struct Templates14 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
+struct Templates15 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16>
+struct Templates16 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
+struct Templates17 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
+struct Templates18 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19>
+struct Templates19 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
+struct Templates20 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
+struct Templates21 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22>
+struct Templates22 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
+struct Templates23 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
+struct Templates24 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25>
+struct Templates25 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
+struct Templates26 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
+struct Templates27 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28>
+struct Templates28 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
+struct Templates29 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
+struct Templates30 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31>
+struct Templates31 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
+struct Templates32 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
+struct Templates33 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34>
+struct Templates34 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
+struct Templates35 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
+struct Templates36 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37>
+struct Templates37 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
+struct Templates38 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
+struct Templates39 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40>
+struct Templates40 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
+struct Templates41 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
+struct Templates42 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43>
+struct Templates43 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
+struct Templates44 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
+struct Templates45 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46>
+struct Templates46 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
+struct Templates47 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
+struct Templates48 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47, T48> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+    GTEST_TEMPLATE_ T49>
+struct Templates49 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47, T48, T49> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+    GTEST_TEMPLATE_ T49, GTEST_TEMPLATE_ T50>
+struct Templates50 {
+  typedef TemplateSel<T1> Head;
+  typedef Templates49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47, T48, T49, T50> Tail;
+};
+
+
+// We don't want to require the users to write TemplatesN<...> directly,
+// as that would require them to count the length.  Templates<...> is much
+// easier to write, but generates horrible messages when there is a
+// compiler error, as gcc insists on printing out each template
+// argument, even if it has the default value (this means Templates<list>
+// will appear as Templates<list, NoneT, NoneT, ..., NoneT> in the compiler
+// errors).
+//
+// Our solution is to combine the best part of the two approaches: a
+// user would write Templates<T1, ..., TN>, and Google Test will translate
+// that to TemplatesN<T1, ..., TN> internally to make error messages
+// readable.  The translation is done by the 'type' member of the
+// Templates template.
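+//
+// For illustration only (hypothetical fixture templates, not defined in this
+// header): given
+//     template <typename T> class FixtureA;
+//     template <typename T> class FixtureB;
+// Templates<FixtureA, FixtureB>::type is Templates2<FixtureA, FixtureB>,
+// whose Head is TemplateSel<FixtureA> and whose Tail is Templates1<FixtureB>,
+// so the selector list can be walked recursively without the user ever
+// counting how many templates were supplied.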
+template <GTEST_TEMPLATE_ T1 = NoneT, GTEST_TEMPLATE_ T2 = NoneT,
+    GTEST_TEMPLATE_ T3 = NoneT, GTEST_TEMPLATE_ T4 = NoneT,
+    GTEST_TEMPLATE_ T5 = NoneT, GTEST_TEMPLATE_ T6 = NoneT,
+    GTEST_TEMPLATE_ T7 = NoneT, GTEST_TEMPLATE_ T8 = NoneT,
+    GTEST_TEMPLATE_ T9 = NoneT, GTEST_TEMPLATE_ T10 = NoneT,
+    GTEST_TEMPLATE_ T11 = NoneT, GTEST_TEMPLATE_ T12 = NoneT,
+    GTEST_TEMPLATE_ T13 = NoneT, GTEST_TEMPLATE_ T14 = NoneT,
+    GTEST_TEMPLATE_ T15 = NoneT, GTEST_TEMPLATE_ T16 = NoneT,
+    GTEST_TEMPLATE_ T17 = NoneT, GTEST_TEMPLATE_ T18 = NoneT,
+    GTEST_TEMPLATE_ T19 = NoneT, GTEST_TEMPLATE_ T20 = NoneT,
+    GTEST_TEMPLATE_ T21 = NoneT, GTEST_TEMPLATE_ T22 = NoneT,
+    GTEST_TEMPLATE_ T23 = NoneT, GTEST_TEMPLATE_ T24 = NoneT,
+    GTEST_TEMPLATE_ T25 = NoneT, GTEST_TEMPLATE_ T26 = NoneT,
+    GTEST_TEMPLATE_ T27 = NoneT, GTEST_TEMPLATE_ T28 = NoneT,
+    GTEST_TEMPLATE_ T29 = NoneT, GTEST_TEMPLATE_ T30 = NoneT,
+    GTEST_TEMPLATE_ T31 = NoneT, GTEST_TEMPLATE_ T32 = NoneT,
+    GTEST_TEMPLATE_ T33 = NoneT, GTEST_TEMPLATE_ T34 = NoneT,
+    GTEST_TEMPLATE_ T35 = NoneT, GTEST_TEMPLATE_ T36 = NoneT,
+    GTEST_TEMPLATE_ T37 = NoneT, GTEST_TEMPLATE_ T38 = NoneT,
+    GTEST_TEMPLATE_ T39 = NoneT, GTEST_TEMPLATE_ T40 = NoneT,
+    GTEST_TEMPLATE_ T41 = NoneT, GTEST_TEMPLATE_ T42 = NoneT,
+    GTEST_TEMPLATE_ T43 = NoneT, GTEST_TEMPLATE_ T44 = NoneT,
+    GTEST_TEMPLATE_ T45 = NoneT, GTEST_TEMPLATE_ T46 = NoneT,
+    GTEST_TEMPLATE_ T47 = NoneT, GTEST_TEMPLATE_ T48 = NoneT,
+    GTEST_TEMPLATE_ T49 = NoneT, GTEST_TEMPLATE_ T50 = NoneT>
+struct Templates {
+  typedef Templates50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
+};
+
+template <>
+struct Templates<NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates0 type;
+};
+template <GTEST_TEMPLATE_ T1>
+struct Templates<T1, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates1<T1> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
+struct Templates<T1, T2, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates2<T1, T2> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
+struct Templates<T1, T2, T3, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates3<T1, T2, T3> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4>
+struct Templates<T1, T2, T3, T4, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates4<T1, T2, T3, T4> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
+struct Templates<T1, T2, T3, T4, T5, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates5<T1, T2, T3, T4, T5> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
+struct Templates<T1, T2, T3, T4, T5, T6, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates6<T1, T2, T3, T4, T5, T6> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates7<T1, T2, T3, T4, T5, T6, T7> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates8<T1, T2, T3, T4, T5, T6, T7, T8> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT> {
+  typedef Templates22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT> {
+  typedef Templates23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT> {
+  typedef Templates24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, T47, NoneT, NoneT, NoneT> {
+  typedef Templates47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, T47, T48, NoneT, NoneT> {
+  typedef Templates48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47, T48> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+    GTEST_TEMPLATE_ T49>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, T47, T48, T49, NoneT> {
+  typedef Templates49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47, T48, T49> type;
+};
+
+// The TypeList template makes it possible to use either a single type
+// or a Types<...> list in TYPED_TEST_CASE() and
+// INSTANTIATE_TYPED_TEST_CASE_P().
+
+template <typename T>
+struct TypeList {
+  typedef Types1<T> type;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+struct TypeList<Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48, T49, T50> > {
+  typedef typename Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>::type type;
+};
+
+#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+
+// Due to C++ preprocessor weirdness, we need double indirection to
+// concatenate two tokens when one of them is __LINE__.  Writing
+//
+//   foo ## __LINE__
+//
+// will result in the token foo__LINE__, instead of foo followed by
+// the current line number.  For more details, see
+// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
+#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar
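+//
+// For example, given a hypothetical macro MAKE_LABEL_ (used here purely for
+// illustration), the indirection matters because the arguments of
+// GTEST_CONCAT_TOKEN_ are macro-expanded before the paste happens:
+//
+//   #define MAKE_LABEL_BAD_ gtest_label_ ## __LINE__
+//   #define MAKE_LABEL_OK_  GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__)
+//
+//   // Used on line 42, MAKE_LABEL_BAD_ expands to gtest_label___LINE__,
+//   // whereas MAKE_LABEL_OK_ expands to gtest_label_42.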
+
+class ProtocolMessage;
+namespace proto2 { class Message; }
+
+namespace testing {
+
+// Forward declarations.
+
+class AssertionResult;                 // Result of an assertion.
+class Message;                         // Represents a failure message.
+class Test;                            // Represents a test.
+class TestInfo;                        // Information about a test.
+class TestPartResult;                  // Result of a test part.
+class UnitTest;                        // A collection of test cases.
+
+template <typename T>
+::std::string PrintToString(const T& value);
+
+namespace internal {
+
+struct TraceInfo;                      // Information about a trace point.
+class ScopedTrace;                     // Implements scoped trace.
+class TestInfoImpl;                    // Opaque implementation of TestInfo
+class UnitTestImpl;                    // Opaque implementation of UnitTest
+
+// How many times InitGoogleTest() has been called.
+GTEST_API_ extern int g_init_gtest_count;
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+GTEST_API_ extern const char kStackTraceMarker[];
+
+// Two overloaded helpers for checking at compile time whether an
+// expression is a null pointer literal (i.e. NULL or any 0-valued
+// compile-time integral constant).  Their return values have
+// different sizes, so we can use sizeof() to test which version is
+// picked by the compiler.  These helpers have no implementations, as
+// we only need their signatures.
+//
+// Given IsNullLiteralHelper(x), the compiler will pick the first
+// version if x can be implicitly converted to Secret*, and pick the
+// second version otherwise.  Since Secret is a secret and incomplete
+// type, the only expression a user can write that has type Secret* is
+// a null pointer literal.  Therefore, we know that x is a null
+// pointer literal if and only if the first version is picked by the
+// compiler.
+char IsNullLiteralHelper(Secret* p);
+char (&IsNullLiteralHelper(...))[2];  // NOLINT
+
+// A compile-time bool constant that is true if and only if x is a
+// null pointer literal (i.e. NULL or any 0-valued compile-time
+// integral constant).
+#ifdef GTEST_ELLIPSIS_NEEDS_POD_
+// We lose support for NULL detection where the compiler doesn't like
+// passing non-POD classes through ellipsis (...).
+# define GTEST_IS_NULL_LITERAL_(x) false
+#else
+# define GTEST_IS_NULL_LITERAL_(x) \
+    (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1)
+#endif  // GTEST_ELLIPSIS_NEEDS_POD_
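+
+// As a rough sketch, assuming the sizeof-based definition above is in effect
+// (i.e. GTEST_ELLIPSIS_NEEDS_POD_ is not defined):
+//
+//   GTEST_IS_NULL_LITERAL_(NULL)   // true  - NULL converts to Secret*.
+//   GTEST_IS_NULL_LITERAL_(0)      // true  - a 0-valued constant converts too.
+//   GTEST_IS_NULL_LITERAL_(1)      // false - only the ellipsis overload fits.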
+
+// Appends the user-supplied message to the Google-Test-generated message.
+GTEST_API_ std::string AppendUserMessage(
+    const std::string& gtest_msg, const Message& user_msg);
+
+#if GTEST_HAS_EXCEPTIONS
+
+// This exception is thrown by (and only by) a failed Google Test
+// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
+// are enabled).  We derive it from std::runtime_error, which is for
+// errors presumably detectable only at run time.  Since
+// std::runtime_error inherits from std::exception, many testing
+// frameworks know how to extract and print the message inside it.
+class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
+ public:
+  explicit GoogleTestFailureException(const TestPartResult& failure);
+};
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// A helper class for creating scoped traces in user programs.
+class GTEST_API_ ScopedTrace {
+ public:
+  // The c'tor pushes the given source file location and message onto
+  // a trace stack maintained by Google Test.
+  ScopedTrace(const char* file, int line, const Message& message);
+
+  // The d'tor pops the info pushed by the c'tor.
+  //
+  // Note that the d'tor is not virtual in order to be efficient.
+  // Don't inherit from ScopedTrace!
+  ~ScopedTrace();
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
+} GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
+                            // c'tor and d'tor.  Therefore it doesn't
+                            // need to be used otherwise.
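+
+// As a rough usage sketch (the public SCOPED_TRACE macro is the intended
+// entry point; this is roughly what it boils down to):
+//
+//   ScopedTrace gtest_trace(__FILE__, __LINE__,
+//                           ::testing::Message() << "while checking case 3");
+//   // Failures reported while gtest_trace is in scope carry this
+//   // file/line/message as additional context.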
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+//   expected_expression: "foo"
+//   actual_expression:   "bar"
+//   expected_value:      "5"
+//   actual_value:        "6"
+//
+// The ignoring_case parameter is true iff the assertion is a
+// *_STRCASEEQ*.  When it's true, the string " (ignoring case)" will
+// be inserted into the message.
+GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
+                                     const char* actual_expression,
+                                     const std::string& expected_value,
+                                     const std::string& actual_value,
+                                     bool ignoring_case);
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+GTEST_API_ std::string GetBoolAssertionFailureMessage(
+    const AssertionResult& assertion_result,
+    const char* expression_text,
+    const char* actual_predicate_value,
+    const char* expected_predicate_value);
+
+// This template class represents an IEEE floating-point number
+// (either single-precision or double-precision, depending on the
+// template parameters).
+//
+// The purpose of this class is to do more sophisticated number
+// comparison.  (Due to round-off error, etc, it's very unlikely that
+// two floating-points will be equal exactly.  Hence a naive
+// comparison by the == operation often doesn't work.)
+//
+// Format of IEEE floating-point:
+//
+//   The most-significant bit being the leftmost, an IEEE
+//   floating-point looks like
+//
+//     sign_bit exponent_bits fraction_bits
+//
+//   Here, sign_bit is a single bit that designates the sign of the
+//   number.
+//
+//   For float, there are 8 exponent bits and 23 fraction bits.
+//
+//   For double, there are 11 exponent bits and 52 fraction bits.
+//
+//   More details can be found at
+//   http://en.wikipedia.org/wiki/IEEE_floating-point_standard.
+//
+// Template parameter:
+//
+//   RawType: the raw floating-point type (either float or double)
+template <typename RawType>
+class FloatingPoint {
+ public:
+  // Defines the unsigned integer type that has the same size as the
+  // floating point number.
+  typedef typename TypeWithSize<sizeof(RawType)>::UInt Bits;
+
+  // Constants.
+
+  // # of bits in a number.
+  static const size_t kBitCount = 8*sizeof(RawType);
+
+  // # of fraction bits in a number.
+  static const size_t kFractionBitCount =
+    std::numeric_limits<RawType>::digits - 1;
+
+  // # of exponent bits in a number.
+  static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
+
+  // The mask for the sign bit.
+  static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
+
+  // The mask for the fraction bits.
+  static const Bits kFractionBitMask =
+    ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
+
+  // The mask for the exponent bits.
+  static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
+
+  // How many ULP's (Units in the Last Place) we want to tolerate when
+  // comparing two numbers.  The larger the value, the more error we
+  // allow.  A 0 value means that two numbers must be exactly the same
+  // to be considered equal.
+  //
+  // The maximum error of a single floating-point operation is 0.5
+  // units in the last place.  On Intel CPU's, all floating-point
+  // calculations are done with 80-bit precision, while double has 64
+  // bits.  Therefore, 4 should be enough for ordinary use.
+  //
+  // See the following article for more details on ULP:
+  // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
+  static const size_t kMaxUlps = 4;
+
+  // Constructs a FloatingPoint from a raw floating-point number.
+  //
+  // On an Intel CPU, passing a non-normalized NAN (Not a Number)
+  // around may change its bits, although the new value is guaranteed
+  // to be also a NAN.  Therefore, don't expect this constructor to
+  // preserve the bits in x when x is a NAN.
+  explicit FloatingPoint(const RawType& x) { u_.value_ = x; }
+
+  // Static methods
+
+  // Reinterprets a bit pattern as a floating-point number.
+  //
+  // This function is needed to test the AlmostEquals() method.
+  static RawType ReinterpretBits(const Bits bits) {
+    FloatingPoint fp(0);
+    fp.u_.bits_ = bits;
+    return fp.u_.value_;
+  }
+
+  // Returns the floating-point number that represents positive infinity.
+  static RawType Infinity() {
+    return ReinterpretBits(kExponentBitMask);
+  }
+
+  // Returns the maximum representable finite floating-point number.
+  static RawType Max();
+
+  // Non-static methods
+
+  // Returns the bits that represent this number.
+  const Bits &bits() const { return u_.bits_; }
+
+  // Returns the exponent bits of this number.
+  Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
+
+  // Returns the fraction bits of this number.
+  Bits fraction_bits() const { return kFractionBitMask & u_.bits_; }
+
+  // Returns the sign bit of this number.
+  Bits sign_bit() const { return kSignBitMask & u_.bits_; }
+
+  // Returns true iff this is NAN (not a number).
+  bool is_nan() const {
+    // It's a NAN if the exponent bits are all ones and the fraction
+    // bits are not entirely zeros.
+    return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0);
+  }
+
+  // Returns true iff this number is at most kMaxUlps ULP's away from
+  // rhs.  In particular, this function:
+  //
+  //   - returns false if either number is (or both are) NAN.
+  //   - treats really large numbers as almost equal to infinity.
+  //   - thinks +0.0 and -0.0 are 0 ULP's apart.
+  bool AlmostEquals(const FloatingPoint& rhs) const {
+    // The IEEE standard says that any comparison operation involving
+    // a NAN must return false.
+    if (is_nan() || rhs.is_nan()) return false;
+
+    return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
+        <= kMaxUlps;
+  }
+
+ private:
+  // The data type used to store the actual floating-point number.
+  union FloatingPointUnion {
+    RawType value_;  // The raw floating-point number.
+    Bits bits_;      // The bits that represent the number.
+  };
+
+  // Converts an integer from the sign-and-magnitude representation to
+  // the biased representation.  More precisely, let N be 2 to the
+  // power of (kBitCount - 1), an integer x is represented by the
+  // unsigned number x + N.
+  //
+  // For instance,
+  //
+  //   -N + 1 (the most negative number representable using
+  //          sign-and-magnitude) is represented by 1;
+  //   0      is represented by N; and
+  //   N - 1  (the biggest number representable using
+  //          sign-and-magnitude) is represented by 2N - 1.
+  //
+  // Read http://en.wikipedia.org/wiki/Signed_number_representations
+  // for more details on signed number representations.
+  static Bits SignAndMagnitudeToBiased(const Bits &sam) {
+    if (kSignBitMask & sam) {
+      // sam represents a negative number.
+      return ~sam + 1;
+    } else {
+      // sam represents a positive number.
+      return kSignBitMask | sam;
+    }
+  }
+
+  // Given two numbers in the sign-and-magnitude representation,
+  // returns the distance between them as an unsigned number.
+  static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
+                                                     const Bits &sam2) {
+    const Bits biased1 = SignAndMagnitudeToBiased(sam1);
+    const Bits biased2 = SignAndMagnitudeToBiased(sam2);
+    return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
+  }
+
+  FloatingPointUnion u_;
+};
+
+// We cannot use std::numeric_limits<T>::max() as it clashes with the max()
+// macro defined by <windows.h>.
+template <>
+inline float FloatingPoint<float>::Max() { return FLT_MAX; }
+template <>
+inline double FloatingPoint<double>::Max() { return DBL_MAX; }
+
+// Typedefs the instances of the FloatingPoint template class that we
+// care to use.
+typedef FloatingPoint<float> Float;
+typedef FloatingPoint<double> Double;
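+
+// As a rough sketch of how these are used (1.0f plus one machine epsilon is
+// exactly one ULP away from 1.0f, which is within the kMaxUlps tolerance):
+//
+//   const Float lhs(1.0f);
+//   const Float rhs(1.0f + std::numeric_limits<float>::epsilon());
+//   const bool almost_equal = lhs.AlmostEquals(rhs);  // true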
+
+// In order to catch the mistake of putting tests that use different
+// test fixture classes in the same test case, we need to assign
+// unique IDs to fixture classes and compare them.  The TypeId type is
+// used to hold such IDs.  The user should treat TypeId as an opaque
+// type: the only operation allowed on TypeId values is to compare
+// them for equality using the == operator.
+typedef const void* TypeId;
+
+template <typename T>
+class TypeIdHelper {
+ public:
+  // dummy_ must not have a const type.  Otherwise an overly eager
+  // compiler (e.g. MSVC 7.1 & 8.0) may try to merge
+  // TypeIdHelper<T>::dummy_ for different Ts as an "optimization".
+  static bool dummy_;
+};
+
+template <typename T>
+bool TypeIdHelper<T>::dummy_ = false;
+
+// GetTypeId<T>() returns the ID of type T.  Different values will be
+// returned for different types.  Calling the function twice with the
+// same type argument is guaranteed to return the same ID.
+template <typename T>
+TypeId GetTypeId() {
+  // The compiler is required to allocate a different
+  // TypeIdHelper<T>::dummy_ variable for each T used to instantiate
+  // the template.  Therefore, the address of dummy_ is guaranteed to
+  // be unique.
+  return &(TypeIdHelper<T>::dummy_);
+}
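+
+// For example:
+//
+//   const TypeId int_id    = GetTypeId<int>();
+//   const TypeId double_id = GetTypeId<double>();
+//   // int_id == GetTypeId<int>() always holds, and int_id != double_id,
+//   // since each T gets its own TypeIdHelper<T>::dummy_ object.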
+
+// Returns the type ID of ::testing::Test.  Always call this instead
+// of GetTypeId< ::testing::Test>() to get the type ID of
+// ::testing::Test, as the latter may give the wrong result due to a
+// suspected linker bug when compiling Google Test as a Mac OS X
+// framework.
+GTEST_API_ TypeId GetTestTypeId();
+
+// Defines the abstract factory interface that creates instances
+// of a Test object.
+class TestFactoryBase {
+ public:
+  virtual ~TestFactoryBase() {}
+
+  // Creates a test instance to run. The instance is both created and destroyed
+  // within TestInfoImpl::Run()
+  virtual Test* CreateTest() = 0;
+
+ protected:
+  TestFactoryBase() {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase);
+};
+
+// This class provides an implementation of the TestFactoryBase interface.
+// It is used in TEST and TEST_F macros.
+template <class TestClass>
+class TestFactoryImpl : public TestFactoryBase {
+ public:
+  virtual Test* CreateTest() { return new TestClass; }
+};
+
+#if GTEST_OS_WINDOWS
+
+// Predicate-formatters for implementing the HRESULT checking macros
+// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}
+// We pass a long instead of HRESULT to avoid causing an
+// include dependency for the HRESULT type.
+GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr,
+                                            long hr);  // NOLINT
+GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr,
+                                            long hr);  // NOLINT
+
+#endif  // GTEST_OS_WINDOWS
+
+// Types of SetUpTestCase() and TearDownTestCase() functions.
+typedef void (*SetUpTestCaseFunc)();
+typedef void (*TearDownTestCaseFunc)();
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+//   test_case_name:   name of the test case
+//   name:             name of the test
+//   type_param:       the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param:      text representation of the test's value parameter,
+//                     or NULL if this is not a value-parameterized test.
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test case
+//   tear_down_tc:     pointer to the function that tears down the test case
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+GTEST_API_ TestInfo* MakeAndRegisterTestInfo(
+    const char* test_case_name,
+    const char* name,
+    const char* type_param,
+    const char* value_param,
+    TypeId fixture_class_id,
+    SetUpTestCaseFunc set_up_tc,
+    TearDownTestCaseFunc tear_down_tc,
+    TestFactoryBase* factory);
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false.  None of pstr, *pstr, and prefix can be NULL.
+GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// State of the definition of a type-parameterized test case.
+class GTEST_API_ TypedTestCasePState {
+ public:
+  TypedTestCasePState() : registered_(false) {}
+
+  // Adds the given test name to defined_test_names_ and returns true
+  // if the test case hasn't been registered; otherwise aborts the
+  // program.
+  bool AddTestName(const char* file, int line, const char* case_name,
+                   const char* test_name) {
+    if (registered_) {
+      fprintf(stderr, "%s Test %s must be defined before "
+              "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n",
+              FormatFileLocation(file, line).c_str(), test_name, case_name);
+      fflush(stderr);
+      posix::Abort();
+    }
+    defined_test_names_.insert(test_name);
+    return true;
+  }
+
+  // Verifies that registered_tests match the test names in
+  // defined_test_names_; returns registered_tests if successful, or
+  // aborts the program otherwise.
+  const char* VerifyRegisteredTestNames(
+      const char* file, int line, const char* registered_tests);
+
+ private:
+  bool registered_;
+  ::std::set<const char*> defined_test_names_;
+};
+
+// Skips to the first non-space char after the first comma in 'str';
+// returns NULL if no comma is found in 'str'.
+inline const char* SkipComma(const char* str) {
+  const char* comma = strchr(str, ',');
+  if (comma == NULL) {
+    return NULL;
+  }
+  while (IsSpace(*(++comma))) {}
+  return comma;
+}
+
+// Returns the prefix of 'str' before the first comma in it; returns
+// the entire string if it contains no comma.
+inline std::string GetPrefixUntilComma(const char* str) {
+  const char* comma = strchr(str, ',');
+  return comma == NULL ? str : std::string(str, comma);
+}
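+
+// For example:
+//
+//   SkipComma("Foo, Bar, Baz")            // points at "Bar, Baz"
+//   SkipComma("Foo")                      // NULL (no comma)
+//   GetPrefixUntilComma("Foo, Bar, Baz")  // "Foo"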
+
+// TypeParameterizedTest<Fixture, TestSel, Types>::Register()
+// registers a list of type-parameterized tests with Google Test.  The
+// return value is insignificant - we just need to return something
+// such that we can call this function in a namespace scope.
+//
+// Implementation note: The GTEST_TEMPLATE_ macro declares a template
+// template parameter.  It's defined in gtest-type-util.h.
+template <GTEST_TEMPLATE_ Fixture, class TestSel, typename Types>
+class TypeParameterizedTest {
+ public:
+  // 'index' is the index of the test in the type list 'Types'
+  // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase,
+  // Types).  Valid values for 'index' are [0, N - 1] where N is the
+  // length of Types.
+  static bool Register(const char* prefix, const char* case_name,
+                       const char* test_names, int index) {
+    typedef typename Types::Head Type;
+    typedef Fixture<Type> FixtureClass;
+    typedef typename GTEST_BIND_(TestSel, Type) TestClass;
+
+    // First, registers the first type-parameterized test in the type
+    // list.
+    MakeAndRegisterTestInfo(
+        (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + "/"
+         + StreamableToString(index)).c_str(),
+        GetPrefixUntilComma(test_names).c_str(),
+        GetTypeName<Type>().c_str(),
+        NULL,  // No value parameter.
+        GetTypeId<FixtureClass>(),
+        TestClass::SetUpTestCase,
+        TestClass::TearDownTestCase,
+        new TestFactoryImpl<TestClass>);
+
+    // Next, recurses (at compile time) with the tail of the type list.
+    return TypeParameterizedTest<Fixture, TestSel, typename Types::Tail>
+        ::Register(prefix, case_name, test_names, index + 1);
+  }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, class TestSel>
+class TypeParameterizedTest<Fixture, TestSel, Types0> {
+ public:
+  static bool Register(const char* /*prefix*/, const char* /*case_name*/,
+                       const char* /*test_names*/, int /*index*/) {
+    return true;
+  }
+};
+
+// TypeParameterizedTestCase<Fixture, Tests, Types>::Register()
+// registers *all combinations* of 'Tests' and 'Types' with Google
+// Test.  The return value is insignificant - we just need to return
+// something such that we can call this function in a namespace scope.
+template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types>
+class TypeParameterizedTestCase {
+ public:
+  static bool Register(const char* prefix, const char* case_name,
+                       const char* test_names) {
+    typedef typename Tests::Head Head;
+
+    // First, registers the first test in 'Tests' for each type in 'Types'.
+    TypeParameterizedTest<Fixture, Head, Types>::Register(
+        prefix, case_name, test_names, 0);
+
+    // Next, recurses (at compile time) with the tail of the test list.
+    return TypeParameterizedTestCase<Fixture, typename Tests::Tail, Types>
+        ::Register(prefix, case_name, SkipComma(test_names));
+  }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, typename Types>
+class TypeParameterizedTestCase<Fixture, Templates0, Types> {
+ public:
+  static bool Register(const char* /*prefix*/, const char* /*case_name*/,
+                       const char* /*test_names*/) {
+    return true;
+  }
+};
+
+#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(
+    UnitTest* unit_test, int skip_count);
+
+// Helpers for suppressing warnings on unreachable code or constant
+// condition.
+
+// Always returns true.
+GTEST_API_ bool AlwaysTrue();
+
+// Always returns false.
+inline bool AlwaysFalse() { return !AlwaysTrue(); }
+
+// Helper for suppressing false warning from Clang on a const char*
+// variable declared in a conditional expression always being NULL in
+// the else branch.
+struct GTEST_API_ ConstCharPtr {
+  ConstCharPtr(const char* str) : value(str) {}
+  operator bool() const { return true; }
+  const char* value;
+};
+
+// A simple Linear Congruential Generator for generating random
+// numbers with a uniform distribution.  Unlike rand() and srand(), it
+// doesn't use global state (and therefore can't interfere with user
+// code).  Unlike rand_r(), it's portable.  An LCG isn't very random,
+// but it's good enough for our purposes.
+class GTEST_API_ Random {
+ public:
+  static const UInt32 kMaxRange = 1u << 31;
+
+  explicit Random(UInt32 seed) : state_(seed) {}
+
+  void Reseed(UInt32 seed) { state_ = seed; }
+
+  // Generates a random number from [0, range).  Crashes if 'range' is
+  // 0 or greater than kMaxRange.
+  UInt32 Generate(UInt32 range);
+
+ private:
+  UInt32 state_;
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Random);
+};
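+
+// As a rough usage sketch:
+//
+//   Random random(42);                       // fixed seed => reproducible
+//   const UInt32 roll = random.Generate(6);  // some value in [0, 6)
+//   random.Reseed(42);                       // replays the same sequence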
+
+// Defining a variable of type CompileAssertTypesEqual<T1, T2> will cause a
+// compiler error iff T1 and T2 are different types.
+template <typename T1, typename T2>
+struct CompileAssertTypesEqual;
+
+template <typename T>
+struct CompileAssertTypesEqual<T, T> {
+};
+
+// Removes the reference from a type if it is a reference type,
+// otherwise leaves it unchanged.  This is the same as
+// tr1::remove_reference, which is not widely available yet.
+template <typename T>
+struct RemoveReference { typedef T type; };  // NOLINT
+template <typename T>
+struct RemoveReference<T&> { typedef T type; };  // NOLINT
+
+// A handy wrapper around RemoveReference that works when the argument
+// T depends on template parameters.
+#define GTEST_REMOVE_REFERENCE_(T) \
+    typename ::testing::internal::RemoveReference<T>::type
+
+// Removes const from a type if it is a const type, otherwise leaves
+// it unchanged.  This is the same as tr1::remove_const, which is not
+// widely available yet.
+template <typename T>
+struct RemoveConst { typedef T type; };  // NOLINT
+template <typename T>
+struct RemoveConst<const T> { typedef T type; };  // NOLINT
+
+// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above
+// definition to fail to remove the const in 'const int[3]' and 'const
+// char[3][4]'.  The following specialization works around the bug.
+template <typename T, size_t N>
+struct RemoveConst<const T[N]> {
+  typedef typename RemoveConst<T>::type type[N];
+};
+
+#if defined(_MSC_VER) && _MSC_VER < 1400
+// This is the only specialization that allows VC++ 7.1 to remove const in
+// 'const int[3]' and 'const int[3][4]'.  However, it causes trouble with GCC
+// and thus needs to be conditionally compiled.
+template <typename T, size_t N>
+struct RemoveConst<T[N]> {
+  typedef typename RemoveConst<T>::type type[N];
+};
+#endif
+
+// A handy wrapper around RemoveConst that works when the argument
+// T depends on template parameters.
+#define GTEST_REMOVE_CONST_(T) \
+    typename ::testing::internal::RemoveConst<T>::type
+
+// Turns const U&, U&, const U, and U all into U.
+#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
+    GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T))
+
+// Adds reference to a type if it is not a reference type,
+// otherwise leaves it unchanged.  This is the same as
+// tr1::add_reference, which is not widely available yet.
+template <typename T>
+struct AddReference { typedef T& type; };  // NOLINT
+template <typename T>
+struct AddReference<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper around AddReference that works when the argument T
+// depends on template parameters.
+#define GTEST_ADD_REFERENCE_(T) \
+    typename ::testing::internal::AddReference<T>::type
+
+// Adds a reference to const on top of T as necessary.  For example,
+// it transforms
+//
+//   char         ==> const char&
+//   const char   ==> const char&
+//   char&        ==> const char&
+//   const char&  ==> const char&
+//
+// The argument T must depend on some template parameters.
+#define GTEST_REFERENCE_TO_CONST_(T) \
+    GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T))
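+
+// For a dependent type T inside a template, these compose as follows:
+//
+//   GTEST_REMOVE_REFERENCE_AND_CONST_(const T&)  // T
+//   GTEST_REFERENCE_TO_CONST_(T)                 // const T&
+//   GTEST_REFERENCE_TO_CONST_(const T&)          // const T&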
+
+// ImplicitlyConvertible<From, To>::value is a compile-time bool
+// constant that's true iff type From can be implicitly converted to
+// type To.
+template <typename From, typename To>
+class ImplicitlyConvertible {
+ private:
+  // We need the following helper functions only for their types.
+  // They have no implementations.
+
+  // MakeFrom() is an expression whose type is From.  We cannot simply
+  // use From(), as the type From may not have a public default
+  // constructor.
+  static From MakeFrom();
+
+  // These two functions are overloaded.  Given an expression
+  // Helper(x), the compiler will pick the first version if x can be
+  // implicitly converted to type To; otherwise it will pick the
+  // second version.
+  //
+  // The first version returns a value of size 1, and the second
+  // version returns a value of size 2.  Therefore, by checking the
+  // size of Helper(x), which can be done at compile time, we can tell
+  // which version of Helper() is used, and hence whether x can be
+  // implicitly converted to type To.
+  static char Helper(To);
+  static char (&Helper(...))[2];  // NOLINT
+
+  // We have to put the 'public' section after the 'private' section,
+  // or MSVC refuses to compile the code.
+ public:
+  // MSVC warns about implicitly converting from double to int for
+  // possible loss of data, so we need to temporarily disable the
+  // warning.
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4244)  // Temporarily disables warning 4244.
+
+  static const bool value =
+      sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1;
+# pragma warning(pop)           // Restores the warning state.
+#elif defined(__BORLANDC__)
+  // C++Builder cannot use member overload resolution during template
+  // instantiation.  The simplest workaround is to use its C++0x type traits
+  // functions (C++Builder 2009 and above only).
+  static const bool value = __is_convertible(From, To);
+#else
+  static const bool value =
+      sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1;
+#endif  // _MSC_VER
+};
+template <typename From, typename To>
+const bool ImplicitlyConvertible<From, To>::value;
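+
+// For example:
+//
+//   ImplicitlyConvertible<int, double>::value     // true
+//   ImplicitlyConvertible<double, int>::value     // true (implicit narrowing)
+//   ImplicitlyConvertible<int*, double*>::value   // false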
+
+// IsAProtocolMessage<T>::value is a compile-time bool constant that's
+// true iff T is type ProtocolMessage, proto2::Message, or a subclass
+// of those.
+template <typename T>
+struct IsAProtocolMessage
+    : public bool_constant<
+  ImplicitlyConvertible<const T*, const ::ProtocolMessage*>::value ||
+  ImplicitlyConvertible<const T*, const ::proto2::Message*>::value> {
+};
+
+// When the compiler sees expression IsContainerTest<C>(0), if C is an
+// STL-style container class, the first overload of IsContainerTest
+// will be viable (since both C::iterator* and C::const_iterator* are
+// valid types and NULL can be implicitly converted to them).  It will
+// be picked over the second overload as 'int' is a perfect match for
+// the type of argument 0.  If C::iterator or C::const_iterator is not
+// a valid type, the first overload is not viable, and the second
+// overload will be picked.  Therefore, we can determine whether C is
+// a container class by checking the type of IsContainerTest<C>(0).
+// The value of the expression is insignificant.
+//
+// Note that we look for both C::iterator and C::const_iterator.  The
+// reason is that C++ injects the name of a class as a member of the
+// class itself (e.g. you can refer to class iterator as either
+// 'iterator' or 'iterator::iterator').  If we look for C::iterator
+// only, for example, we would mistakenly think that a class named
+// iterator is an STL container.
+//
+// Also note that the simpler approach of overloading
+// IsContainerTest(typename C::const_iterator*) and
+// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++.
+typedef int IsContainer;
+template <class C>
+IsContainer IsContainerTest(int /* dummy */,
+                            typename C::iterator* /* it */ = NULL,
+                            typename C::const_iterator* /* const_it */ = NULL) {
+  return 0;
+}
+
+typedef char IsNotContainer;
+template <class C>
+IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
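+
+// For example:
+//
+//   sizeof(IsContainerTest<std::vector<int> >(0)) == sizeof(IsContainer)
+//       // true: vector has both iterator and const_iterator.
+//   sizeof(IsContainerTest<int>(0)) == sizeof(IsNotContainer)
+//       // true: int has neither, so the 'long' overload is picked.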
+
+// EnableIf<condition>::type is void when 'Cond' is true, and
+// undefined when 'Cond' is false.  To use SFINAE to make a function
+// overload only apply when a particular expression is true, add
+// "typename EnableIf<expression>::type* = 0" as the last parameter.
+template<bool> struct EnableIf;
+template<> struct EnableIf<true> { typedef void type; };  // NOLINT
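+
+// As a rough sketch, using a hypothetical function template:
+//
+//   template <bool kCondition>
+//   typename EnableIf<kCondition>::type DoSomethingIfEnabled() {}
+//   // DoSomethingIfEnabled<true>() compiles; when kCondition is false the
+//   // missing EnableIf<false>::type removes the overload via SFINAE.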
+
+// Utilities for native arrays.
+
+// ArrayEq() compares two k-dimensional native arrays using the
+// elements' operator==, where k can be any integer >= 0.  When k is
+// 0, ArrayEq() degenerates into comparing a single pair of values.
+
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs);
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; }
+
+// This overload is used when k >= 1.
+template <typename T, typename U, size_t N>
+inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
+  return internal::ArrayEq(lhs, N, rhs);
+}
+
+// This helper reduces code bloat.  If we instead put its logic inside
+// the previous ArrayEq() function, arrays with different sizes would
+// lead to different copies of the template code.
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
+  for (size_t i = 0; i != size; i++) {
+    if (!internal::ArrayEq(lhs[i], rhs[i]))
+      return false;
+  }
+  return true;
+}
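+
+// For example, two-dimensional arrays are compared by recursing on each row:
+//
+//   const int a[2][3] = { { 1, 2, 3 }, { 4, 5, 6 } };
+//   const int b[2][3] = { { 1, 2, 3 }, { 4, 5, 6 } };
+//   const bool equal = ArrayEq(a, b);  // true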
+
+// Finds the first element in the iterator range [begin, end) that
+// equals elem.  Element may be a native array type itself.
+template <typename Iter, typename Element>
+Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
+  for (Iter it = begin; it != end; ++it) {
+    if (internal::ArrayEq(*it, elem))
+      return it;
+  }
+  return end;
+}
+
+// CopyArray() copies a k-dimensional native array using the elements'
+// operator=, where k can be any integer >= 0.  When k is 0,
+// CopyArray() degenerates into copying a single value.
+
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to);
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline void CopyArray(const T& from, U* to) { *to = from; }
+
+// This overload is used when k >= 1.
+template <typename T, typename U, size_t N>
+inline void CopyArray(const T(&from)[N], U(*to)[N]) {
+  internal::CopyArray(from, N, *to);
+}
+
+// This helper reduces code bloat.  If we instead put its logic inside
+// the previous CopyArray() function, arrays with different sizes
+// would lead to different copies of the template code.
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to) {
+  for (size_t i = 0; i != size; i++) {
+    internal::CopyArray(from[i], to + i);
+  }
+}
+
+// The relation between a NativeArray object (see below) and the
+// native array it represents.
+enum RelationToSource {
+  kReference,  // The NativeArray references the native array.
+  kCopy        // The NativeArray makes a copy of the native array and
+               // owns the copy.
+};
+
+// Adapts a native array to a read-only STL-style container.  Instead
+// of the complete STL container concept, this adaptor only implements
+// members useful for Google Mock's container matchers.  New members
+// should be added as needed.  To simplify the implementation, we only
+// support Element being a raw type (i.e. having no top-level const or
+// reference modifier).  It's the client's responsibility to satisfy
+// this requirement.  Element can be an array type itself (hence
+// multi-dimensional arrays are supported).
+template <typename Element>
+class NativeArray {
+ public:
+  // STL-style container typedefs.
+  typedef Element value_type;
+  typedef Element* iterator;
+  typedef const Element* const_iterator;
+
+  // Constructs from a native array.
+  NativeArray(const Element* array, size_t count, RelationToSource relation) {
+    Init(array, count, relation);
+  }
+
+  // Copy constructor.
+  NativeArray(const NativeArray& rhs) {
+    Init(rhs.array_, rhs.size_, rhs.relation_to_source_);
+  }
+
+  ~NativeArray() {
+    // Ensures that the user doesn't instantiate NativeArray with a
+    // const or reference type.
+    static_cast<void>(StaticAssertTypeEqHelper<Element,
+        GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>());
+    if (relation_to_source_ == kCopy)
+      delete[] array_;
+  }
+
+  // STL-style container methods.
+  size_t size() const { return size_; }
+  const_iterator begin() const { return array_; }
+  const_iterator end() const { return array_ + size_; }
+  bool operator==(const NativeArray& rhs) const {
+    return size() == rhs.size() &&
+        ArrayEq(begin(), size(), rhs.begin());
+  }
+
+ private:
+  // Initializes this object; makes a copy of the input array if
+  // 'relation' is kCopy.
+  void Init(const Element* array, size_t a_size, RelationToSource relation) {
+    if (relation == kReference) {
+      array_ = array;
+    } else {
+      Element* const copy = new Element[a_size];
+      CopyArray(array, a_size, copy);
+      array_ = copy;
+    }
+    size_ = a_size;
+    relation_to_source_ = relation;
+  }
+
+  const Element* array_;
+  size_t size_;
+  RelationToSource relation_to_source_;
+
+  GTEST_DISALLOW_ASSIGN_(NativeArray);
+};
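+
+// As a rough usage sketch:
+//
+//   const int values[] = { 1, 2, 3 };
+//   NativeArray<int> view(values, 3, kReference);  // no copy; caller owns data
+//   NativeArray<int> copy(values, 3, kCopy);       // owns its own copy
+//   const bool same = (view == copy);              // true: equal size/elements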
+
+}  // namespace internal
+}  // namespace testing
+
+#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
+  ::testing::internal::AssertHelper(result_type, file, line, message) \
+    = ::testing::Message()
+
+#define GTEST_MESSAGE_(message, result_type) \
+  GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
+
+#define GTEST_FATAL_FAILURE_(message) \
+  return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure)
+
+#define GTEST_NONFATAL_FAILURE_(message) \
+  GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure)
+
+#define GTEST_SUCCESS_(message) \
+  GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess)
+
+// Suppresses MSVC warning 4702 (unreachable code) for the code following
+// statement if it returns or throws (or doesn't return or throw in some
+// situations).
+#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \
+  if (::testing::internal::AlwaysTrue()) { statement; }
+
+#define GTEST_TEST_THROW_(statement, expected_exception, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::ConstCharPtr gtest_msg = "") { \
+    bool gtest_caught_expected = false; \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (expected_exception const&) { \
+      gtest_caught_expected = true; \
+    } \
+    catch (...) { \
+      gtest_msg.value = \
+          "Expected: " #statement " throws an exception of type " \
+          #expected_exception ".\n  Actual: it throws a different type."; \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
+    } \
+    if (!gtest_caught_expected) { \
+      gtest_msg.value = \
+          "Expected: " #statement " throws an exception of type " \
+          #expected_exception ".\n  Actual: it throws nothing."; \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \
+      fail(gtest_msg.value)
+
+#define GTEST_TEST_NO_THROW_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (...) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \
+      fail("Expected: " #statement " doesn't throw an exception.\n" \
+           "  Actual: it throws.")
+
+#define GTEST_TEST_ANY_THROW_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    bool gtest_caught_any = false; \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (...) { \
+      gtest_caught_any = true; \
+    } \
+    if (!gtest_caught_any) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \
+      fail("Expected: " #statement " throws an exception.\n" \
+           "  Actual: it doesn't.")
+
+
+// Implements Boolean test assertions such as EXPECT_TRUE. expression can be
+// either a boolean expression or an AssertionResult. text is a textual
+// representation of expression as it was passed into the EXPECT_TRUE.
+#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (const ::testing::AssertionResult gtest_ar_ = \
+      ::testing::AssertionResult(expression)) \
+    ; \
+  else \
+    fail(::testing::internal::GetBoolAssertionFailureMessage(\
+        gtest_ar_, text, #actual, #expected).c_str())
+
+#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \
+      fail("Expected: " #statement " doesn't generate new fatal " \
+           "failures in the current thread.\n" \
+           "  Actual: it does.")
+
+// Expands to the name of the class that implements the given test.
+#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
+  test_case_name##_##test_name##_Test
+
+// Helper macro for defining tests.
+#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\
+class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\
+ public:\
+  GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\
+ private:\
+  virtual void TestBody();\
+  static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(\
+      GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\
+};\
+\
+::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\
+  ::test_info_ =\
+    ::testing::internal::MakeAndRegisterTestInfo(\
+        #test_case_name, #test_name, NULL, NULL, \
+        (parent_id), \
+        parent_class::SetUpTestCase, \
+        parent_class::TearDownTestCase, \
+        new ::testing::internal::TestFactoryImpl<\
+            GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\
+void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
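+
+// As a rough sketch: the public TEST()/TEST_F() macros are built on
+// GTEST_TEST_, so a test such as
+//
+//   TEST(FooTest, Bar) { EXPECT_EQ(1, 1); }
+//
+// defines a class FooTest_Bar_Test derived from ::testing::Test, registers it
+// via MakeAndRegisterTestInfo() through the static test_info_ initializer, and
+// uses the braces as the body of FooTest_Bar_Test::TestBody().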
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the public API for death tests.  It is
+// #included by gtest.h so a user doesn't need to include this
+// directly.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines internal utilities needed for implementing
+// death tests.  They are subject to change without notice.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+
+#include <stdio.h>
+
+namespace testing {
+namespace internal {
+
+GTEST_DECLARE_string_(internal_run_death_test);
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kDeathTestStyleFlag[] = "death_test_style";
+const char kDeathTestUseFork[] = "death_test_use_fork";
+const char kInternalRunDeathTestFlag[] = "internal_run_death_test";
+
+#if GTEST_HAS_DEATH_TEST
+
+// DeathTest is a class that hides much of the complexity of the
+// GTEST_DEATH_TEST_ macro.  It is abstract; its static Create method
+// returns a concrete class that depends on the prevailing death test
+// style, as defined by the --gtest_death_test_style and/or
+// --gtest_internal_run_death_test flags.
+
+// In describing the results of death tests, these terms are used with
+// the corresponding definitions:
+//
+// exit status:  The integer exit information in the format specified
+//               by wait(2)
+// exit code:    The integer code passed to exit(3), _exit(2), or
+//               returned from main()
+class GTEST_API_ DeathTest {
+ public:
+  // Create returns false if there was an error determining the
+  // appropriate action to take for the current death test; for example,
+  // if the gtest_death_test_style flag is set to an invalid value.
+  // The LastMessage method will return a more detailed message in that
+  // case.  Otherwise, the DeathTest pointer pointed to by the "test"
+  // argument is set.  If the death test should be skipped, the pointer
+  // is set to NULL; otherwise, it is set to the address of a new concrete
+  // DeathTest object that controls the execution of the current test.
+  static bool Create(const char* statement, const RE* regex,
+                     const char* file, int line, DeathTest** test);
+  DeathTest();
+  virtual ~DeathTest() { }
+
+  // A helper class that aborts a death test when it's deleted.
+  class ReturnSentinel {
+   public:
+    explicit ReturnSentinel(DeathTest* test) : test_(test) { }
+    ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+   private:
+    DeathTest* const test_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
+  } GTEST_ATTRIBUTE_UNUSED_;
+
+  // An enumeration of possible roles that may be taken when a death
+  // test is encountered.  EXECUTE means that the death test logic should
+  // be executed immediately.  OVERSEE means that the program should prepare
+  // the appropriate environment for a child process to execute the death
+  // test, then wait for it to complete.
+  enum TestRole { OVERSEE_TEST, EXECUTE_TEST };
+
+  // An enumeration of the three reasons that a test might be aborted.
+  enum AbortReason {
+    TEST_ENCOUNTERED_RETURN_STATEMENT,
+    TEST_THREW_EXCEPTION,
+    TEST_DID_NOT_DIE
+  };
+
+  // Assumes one of the above roles.
+  virtual TestRole AssumeRole() = 0;
+
+  // Waits for the death test to finish and returns its status.
+  virtual int Wait() = 0;
+
+  // Returns true if the death test passed; that is, the test process
+  // exited during the test, its exit status matches a user-supplied
+  // predicate, and its stderr output matches a user-supplied regular
+  // expression.
+  // The user-supplied predicate may be a macro expression rather
+  // than a function pointer or functor, or else Wait and Passed could
+  // be combined.
+  virtual bool Passed(bool exit_status_ok) = 0;
+
+  // Signals that the death test did not die as expected.
+  virtual void Abort(AbortReason reason) = 0;
+
+  // Returns a human-readable message describing the outcome of the last
+  // death test.
+  static const char* LastMessage();
+
+  static void set_last_death_test_message(const std::string& message);
+
+ private:
+  // A string containing a description of the outcome of the last death test.
+  static std::string last_death_test_message_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest);
+};
+
+// Factory interface for death tests.  May be mocked out for testing.
+class DeathTestFactory {
+ public:
+  virtual ~DeathTestFactory() { }
+  virtual bool Create(const char* statement, const RE* regex,
+                      const char* file, int line, DeathTest** test) = 0;
+};
+
+// A concrete DeathTestFactory implementation for normal use.
+class DefaultDeathTestFactory : public DeathTestFactory {
+ public:
+  virtual bool Create(const char* statement, const RE* regex,
+                      const char* file, int line, DeathTest** test);
+};
+
+// Returns true if exit_status describes a process that was terminated
+// by a signal, or exited normally with a nonzero exit code.
+GTEST_API_ bool ExitedUnsuccessfully(int exit_status);
+
+// Traps C++ exceptions escaping statement and reports them as test
+// failures. Note that trapping SEH exceptions is not implemented here.
+# if GTEST_HAS_EXCEPTIONS
+#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+  try { \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+  } catch (const ::std::exception& gtest_exception) { \
+    fprintf(\
+        stderr, \
+        "\n%s: Caught std::exception-derived exception escaping the " \
+        "death test statement. Exception message: %s\n", \
+        ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
+        gtest_exception.what()); \
+    fflush(stderr); \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  } catch (...) { \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  }
+
+# else
+#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+  GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+
+# endif
+
+// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
+// ASSERT_EXIT*, and EXPECT_EXIT*.
+# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    const ::testing::internal::RE& gtest_regex = (regex); \
+    ::testing::internal::DeathTest* gtest_dt; \
+    if (!::testing::internal::DeathTest::Create(#statement, &gtest_regex, \
+        __FILE__, __LINE__, &gtest_dt)) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+    } \
+    if (gtest_dt != NULL) { \
+      ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \
+          gtest_dt_ptr(gtest_dt); \
+      switch (gtest_dt->AssumeRole()) { \
+        case ::testing::internal::DeathTest::OVERSEE_TEST: \
+          if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \
+            goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+          } \
+          break; \
+        case ::testing::internal::DeathTest::EXECUTE_TEST: { \
+          ::testing::internal::DeathTest::ReturnSentinel \
+              gtest_sentinel(gtest_dt); \
+          GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \
+          gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \
+          break; \
+        } \
+        default: \
+          break; \
+      } \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \
+      fail(::testing::internal::DeathTest::LastMessage())
+// The symbol "fail" here expands to something into which a message
+// can be streamed.
+
+// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in
+// NDEBUG mode. In this case we need the statements to be executed, the regex is
+// ignored, and the macro must accept a streamed message even though the message
+// is never printed.
+# define GTEST_EXECUTE_STATEMENT_(statement, regex) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+     GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+  } else \
+    ::testing::Message()
+
+// A class representing the parsed contents of the
+// --gtest_internal_run_death_test flag, as it existed when
+// RUN_ALL_TESTS was called.
+class InternalRunDeathTestFlag {
+ public:
+  InternalRunDeathTestFlag(const std::string& a_file,
+                           int a_line,
+                           int an_index,
+                           int a_write_fd)
+      : file_(a_file), line_(a_line), index_(an_index),
+        write_fd_(a_write_fd) {}
+
+  ~InternalRunDeathTestFlag() {
+    if (write_fd_ >= 0)
+      posix::Close(write_fd_);
+  }
+
+  const std::string& file() const { return file_; }
+  int line() const { return line_; }
+  int index() const { return index_; }
+  int write_fd() const { return write_fd_; }
+
+ private:
+  std::string file_;
+  int line_;
+  int index_;
+  int write_fd_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag);
+};
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag();
+
+#else  // GTEST_HAS_DEATH_TEST
+
+// This macro is used for implementing macros such as
+// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where
+// death tests are not supported. Those macros must compile on such systems
+// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on
+// systems that support death tests. This allows one to write such a macro
+// on a system that does not support death tests and be sure that it will
+// compile on a death-test supporting system.
+//
+// Parameters:
+//   statement -  A statement that a macro such as EXPECT_DEATH would test
+//                for program termination. This macro has to make sure this
+//                statement is compiled but not executed, to ensure that
+//                EXPECT_DEATH_IF_SUPPORTED compiles with a certain
+//                parameter iff EXPECT_DEATH compiles with it.
+//   regex     -  A regex that a macro such as EXPECT_DEATH would use to test
+//                the output of statement.  This parameter has to be
+//                compiled but not evaluated by this macro, to ensure that
+//                this macro only accepts expressions that a macro such as
+//                EXPECT_DEATH would accept.
+//   terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED
+//                and a return statement for ASSERT_DEATH_IF_SUPPORTED.
+//                This ensures that ASSERT_DEATH_IF_SUPPORTED will not
+//                compile inside functions where ASSERT_DEATH doesn't
+//                compile.
+//
+//  The branch that has an always false condition is used to ensure that
+//  statement and regex are compiled (and thus syntactically correct) but
+//  never executed. The unreachable code macro protects the terminator
+//  statement from generating an 'unreachable code' warning in case
+//  statement unconditionally returns or throws. The Message constructor at
+//  the end allows the syntax of streaming additional messages into the
+//  macro, for compile-time compatibility with EXPECT_DEATH/ASSERT_DEATH.
+# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \
+    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+    if (::testing::internal::AlwaysTrue()) { \
+      GTEST_LOG_(WARNING) \
+          << "Death tests are not supported on this platform.\n" \
+          << "Statement '" #statement "' cannot be verified."; \
+    } else if (::testing::internal::AlwaysFalse()) { \
+      ::testing::internal::RE::PartialMatch(".*", (regex)); \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+      terminator; \
+    } else \
+      ::testing::Message()
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+namespace testing {
+
+// This flag controls the style of death tests.  Valid values are "threadsafe",
+// meaning that the death test child process will re-execute the test binary
+// from the start, running only a single death test, or "fast",
+// meaning that the child process will execute the test logic immediately
+// after forking.
+GTEST_DECLARE_string_(death_test_style);
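+
+// A minimal usage sketch (not part of the original header): the style can be
+// chosen from the command line, e.g. --gtest_death_test_style=threadsafe, or
+// from code before the death tests run:
+//
+//   ::testing::FLAGS_gtest_death_test_style = "threadsafe";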
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process.  Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests.  IMPORTANT: This is an internal utility.  Using it may break the
+// implementation of death tests.  User code MUST NOT use it.
+GTEST_API_ bool InDeathTestChild();
+
+}  // namespace internal
+
+// The following macros are useful for writing death tests.
+
+// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is
+// executed:
+//
+//   1. It generates a warning if there is more than one active
+//   thread.  This is because it's safe to fork() or clone() only
+//   when there is a single thread.
+//
+//   2. The parent process clone()s a sub-process and runs the death
+//   test in it; the sub-process exits with code 0 at the end of the
+//   death test, if it hasn't exited already.
+//
+//   3. The parent process waits for the sub-process to terminate.
+//
+//   4. The parent process checks the exit code and error message of
+//   the sub-process.
+//
+// Examples:
+//
+//   ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
+//   for (int i = 0; i < 5; i++) {
+//     EXPECT_DEATH(server.ProcessRequest(i),
+//                  "Invalid request .* in ProcessRequest()")
+//                  << "Failed to die on request " << i;
+//   }
+//
+//   ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
+//
+//   bool KilledBySIGHUP(int exit_code) {
+//     return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
+//   }
+//
+//   ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
+//
+// On the regular expressions used in death tests:
+//
+//   On POSIX-compliant systems (*nix), we use the <regex.h> library,
+//   which uses the POSIX extended regex syntax.
+//
+//   On other platforms (e.g. Windows), we only support a simple regex
+//   syntax implemented as part of Google Test.  This limited
+//   implementation should be enough most of the time when writing
+//   death tests, though it lacks many features you can find in PCRE
+//   or POSIX extended regex syntax.  For example, we don't support
+//   union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and
+//   repetition count ("x{5,7}"), among others.
+//
+//   Below is the syntax that we do support.  We chose it to be a
+//   subset of both PCRE and POSIX extended regex, so it's easy to
+//   learn wherever you come from.  In the following: 'A' denotes a
+//   literal character, period (.), or a single \\ escape sequence;
+//   'x' and 'y' denote regular expressions; 'm' and 'n' are for
+//   natural numbers.
+//
+//     c     matches any literal character c
+//     \\d   matches any decimal digit
+//     \\D   matches any character that's not a decimal digit
+//     \\f   matches \f
+//     \\n   matches \n
+//     \\r   matches \r
+//     \\s   matches any ASCII whitespace, including \n
+//     \\S   matches any character that's not a whitespace
+//     \\t   matches \t
+//     \\v   matches \v
+//     \\w   matches any letter, _, or decimal digit
+//     \\W   matches any character that \\w doesn't match
+//     \\c   matches any literal character c, which must be a punctuation
+//     .     matches any single character except \n
+//     A?    matches 0 or 1 occurrences of A
+//     A*    matches 0 or many occurrences of A
+//     A+    matches 1 or many occurrences of A
+//     ^     matches the beginning of a string (not that of each line)
+//     $     matches the end of a string (not that of each line)
+//     xy    matches x followed by y
+//
+//   If you accidentally use PCRE or POSIX extended regex features
+//   not implemented by us, you will get a run-time failure.  In that
+//   case, please try to rewrite your regular expression within the
+//   above syntax.
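+//
+//   For example (an illustrative assumption, not part of the original
+//   comment), given a hypothetical function BadWrite() that aborts with a
+//   message like "write failed: 5", a portable death test can stick to the
+//   subset above:
+//
+//     EXPECT_DEATH(BadWrite(), "write failed: \\d+");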
+//
+//   This implementation is *not* meant to be as highly tuned or robust
+//   as a compiled regex library, but should perform well enough for a
+//   death test, which already incurs significant overhead by launching
+//   a child process.
+//
+// Known caveats:
+//
+//   A "threadsafe" style death test obtains the path to the test
+//   program from argv[0] and re-executes it in the sub-process.  For
+//   simplicity, the current implementation doesn't search the PATH
+//   when launching the sub-process.  This means that the user must
+//   invoke the test program via a path that contains at least one
+//   path separator (e.g. path/to/foo_test and
+//   /absolute/path/to/bar_test are fine, but foo_test is not).  This
+//   is rarely a problem as people usually don't put the test binary
+//   directory in PATH.
+//
+// TODO(wan@google.com): make thread-safe death tests search the PATH.
+
+// Asserts that a given statement causes the program to exit, with an
+// integer exit status that satisfies predicate, and emitting error output
+// that matches regex.
+# define ASSERT_EXIT(statement, predicate, regex) \
+    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_)
+
+// Like ASSERT_EXIT, but continues on to successive tests in the
+// test case, if any:
+# define EXPECT_EXIT(statement, predicate, regex) \
+    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_)
+
+// Asserts that a given statement causes the program to exit, either by
+// explicitly exiting with a nonzero exit code or being killed by a
+// signal, and emitting error output that matches regex.
+# define ASSERT_DEATH(statement, regex) \
+    ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+
+// Like ASSERT_DEATH, but continues on to successive tests in the
+// test case, if any:
+# define EXPECT_DEATH(statement, regex) \
+    EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+
+// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
+
+// Tests that an exit code describes a normal exit with a given exit code.
+class GTEST_API_ ExitedWithCode {
+ public:
+  explicit ExitedWithCode(int exit_code);
+  bool operator()(int exit_status) const;
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ExitedWithCode& other);
+
+  const int exit_code_;
+};
+
+# if !GTEST_OS_WINDOWS
+// Tests that an exit code describes an exit due to termination by a
+// given signal.
+class GTEST_API_ KilledBySignal {
+ public:
+  explicit KilledBySignal(int signum);
+  bool operator()(int exit_status) const;
+ private:
+  const int signum_;
+};
+# endif  // !GTEST_OS_WINDOWS
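+
+// Illustrative usage of the two predicates (an assumption, not part of the
+// original header), where NormalExit() and Crash() are hypothetical:
+//
+//   EXPECT_EXIT(NormalExit(), ::testing::ExitedWithCode(0), "Done");
+//   EXPECT_EXIT(Crash(), ::testing::KilledBySignal(SIGSEGV), ".*");  // non-Windows only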
+
+// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
+// The death testing framework causes this to have interesting semantics,
+// since the side effects of the call are only visible in opt mode, and not
+// in debug mode.
+//
+// In practice, this can be used to test functions that utilize the
+// LOG(DFATAL) macro using the following style:
+//
+// int DieInDebugOr12(int* sideeffect) {
+//   if (sideeffect) {
+//     *sideeffect = 12;
+//   }
+//   LOG(DFATAL) << "death";
+//   return 12;
+// }
+//
+// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) {
+//   int sideeffect = 0;
+//   // Only asserts in dbg.
+//   EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death");
+//
+// #ifdef NDEBUG
+//   // opt-mode has sideeffect visible.
+//   EXPECT_EQ(12, sideeffect);
+// #else
+//   // dbg-mode no visible sideeffect.
+//   EXPECT_EQ(0, sideeffect);
+// #endif
+// }
+//
+// This will assert that DieInDebugOr12() crashes in debug
+// mode, usually due to a DCHECK or LOG(DFATAL), but returns the
+// appropriate fallback value (12 in this case) in opt mode. If you
+// need to test that a function has appropriate side-effects in opt
+// mode, include assertions against the side-effects.  A general
+// pattern for this is:
+//
+// EXPECT_DEBUG_DEATH({
+//   // Side-effects here will have an effect after this statement in
+//   // opt mode, but none in debug mode.
+//   EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
+// }, "death");
+//
+# ifdef NDEBUG
+
+#  define EXPECT_DEBUG_DEATH(statement, regex) \
+  GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+#  define ASSERT_DEBUG_DEATH(statement, regex) \
+  GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+# else
+
+#  define EXPECT_DEBUG_DEATH(statement, regex) \
+  EXPECT_DEATH(statement, regex)
+
+#  define ASSERT_DEBUG_DEATH(statement, regex) \
+  ASSERT_DEATH(statement, regex)
+
+# endif  // NDEBUG for EXPECT_DEBUG_DEATH
+#endif  // GTEST_HAS_DEATH_TEST
+
+// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
+// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
+// death tests are supported; otherwise they just issue a warning.  This is
+// useful when you are combining death test assertions with normal test
+// assertions in one test.
+#if GTEST_HAS_DEATH_TEST
+# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+    EXPECT_DEATH(statement, regex)
+# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+    ASSERT_DEATH(statement, regex)
+#else
+# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+    GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, )
+# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+    GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return)
+#endif
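+
+// Example (an illustrative assumption, not part of the original header):
+// mixing a death assertion with ordinary assertions so the test still
+// compiles and runs on platforms without death-test support:
+//
+//   TEST(ParserDeathTest, RejectsEmptyInput) {
+//     EXPECT_DEATH_IF_SUPPORTED(ParseOrDie(""), "empty input");  // ParseOrDie is hypothetical
+//     EXPECT_TRUE(ParseOrEmpty("").empty());                     // ParseOrEmpty is hypothetical
+//   }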
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+// This file was GENERATED by command:
+//     pump.py gtest-param-test.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: vladl@google.com (Vlad Losev)
+//
+// Macros and functions for implementing parameterized tests
+// in Google C++ Testing Framework (Google Test)
+//
+// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
+//
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+
+
+// Value-parameterized tests allow you to test your code with different
+// parameters without writing multiple copies of the same test.
+//
+// Here is how you use value-parameterized tests:
+
+#if 0
+
+// To write value-parameterized tests, first you should define a fixture
+// class. It is usually derived from testing::TestWithParam<T> (see below for
+// another inheritance scheme that's sometimes useful in more complicated
+// class hierarchies), where the type of your parameter values.
+// TestWithParam<T> is itself derived from testing::Test. T can be any
+// copyable type. If it's a raw pointer, you are responsible for managing the
+// lifespan of the pointed values.
+
+class FooTest : public ::testing::TestWithParam<const char*> {
+  // You can implement all the usual class fixture members here.
+};
+
+// Then, use the TEST_P macro to define as many parameterized tests
+// for this fixture as you want. The _P suffix is for "parameterized"
+// or "pattern", whichever you prefer to think.
+
+TEST_P(FooTest, DoesBlah) {
+  // Inside a test, access the test parameter with the GetParam() method
+  // of the TestWithParam<T> class:
+  EXPECT_TRUE(foo.Blah(GetParam()));
+  ...
+}
+
+TEST_P(FooTest, HasBlahBlah) {
+  ...
+}
+
+// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test
+// case with any set of parameters you want. Google Test defines a number
+// of functions for generating test parameters. They return what we call
+// (surprise!) parameter generators. Here is a summary of them, which
+// are all in the testing namespace:
+//
+//
+//  Range(begin, end [, step]) - Yields values {begin, begin+step,
+//                               begin+step+step, ...}. The values do not
+//                               include end. step defaults to 1.
+//  Values(v1, v2, ..., vN)    - Yields values {v1, v2, ..., vN}.
+//  ValuesIn(container)        - Yields values from a C-style array, an STL
+//  ValuesIn(begin,end)          container, or an iterator range [begin, end).
+//  Bool()                     - Yields sequence {false, true}.
+//  Combine(g1, g2, ..., gN)   - Yields all combinations (the Cartesian product
+//                               for the math savvy) of the values generated
+//                               by the N generators.
+//
+// For more details, see comments at the definitions of these functions below
+// in this file.
+//
+// The following statement will instantiate tests from the FooTest test case
+// each with parameter values "meeny", "miny", and "moe".
+
+INSTANTIATE_TEST_CASE_P(InstantiationName,
+                        FooTest,
+                        Values("meeny", "miny", "moe"));
+
+// To distinguish different instances of the pattern (yes, you
+// can instantiate it more than once), the first argument to the
+// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the
+// actual test case name. Remember to pick unique prefixes for different
+// instantiations. The tests from the instantiation above will have
+// these names:
+//
+//    * InstantiationName/FooTest.DoesBlah/0 for "meeny"
+//    * InstantiationName/FooTest.DoesBlah/1 for "miny"
+//    * InstantiationName/FooTest.DoesBlah/2 for "moe"
+//    * InstantiationName/FooTest.HasBlahBlah/0 for "meeny"
+//    * InstantiationName/FooTest.HasBlahBlah/1 for "miny"
+//    * InstantiationName/FooTest.HasBlahBlah/2 for "moe"
+//
+// You can use these names in --gtest_filter.
+//
+// This statement will instantiate all tests from FooTest again, each
+// with parameter values "cat" and "dog":
+
+const char* pets[] = {"cat", "dog"};
+INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets));
+
+// The tests from the instantiation above will have these names:
+//
+//    * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat"
+//    * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog"
+//    * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat"
+//    * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog"
+//
+// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests
+// in the given test case, whether their definitions come before or
+// AFTER the INSTANTIATE_TEST_CASE_P statement.
+//
+// Please also note that generator expressions (including parameters to the
+// generators) are evaluated in InitGoogleTest(), after main() has started.
+// This allows the user on one hand, to adjust generator parameters in order
+// to dynamically determine a set of tests to run and on the other hand,
+// give the user a chance to inspect the generated tests with Google Test
+// reflection API before RUN_ALL_TESTS() is executed.
+//
+// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc
+// for more examples.
+//
+// In the future, we plan to publish the API for defining new parameter
+// generators. But for now this interface remains part of the internal
+// implementation and is subject to change.
+//
+//
+// A parameterized test fixture must be derived from testing::Test and from
+// testing::WithParamInterface<T>, where T is the type of the parameter
+// values. Inheriting from TestWithParam<T> satisfies that requirement because
+// TestWithParam<T> inherits from both Test and WithParamInterface. In more
+// complicated hierarchies, however, it is occasionally useful to inherit
+// separately from Test and WithParamInterface. For example:
+
+class BaseTest : public ::testing::Test {
+  // You can inherit all the usual members for a non-parameterized test
+  // fixture here.
+};
+
+class DerivedTest : public BaseTest, public ::testing::WithParamInterface<int> {
+  // The usual test fixture members go here too.
+};
+
+TEST_F(BaseTest, HasFoo) {
+  // This is an ordinary non-parameterized test.
+}
+
+TEST_P(DerivedTest, DoesBlah) {
+  // GetParam works just the same here as if you inherit from TestWithParam.
+  EXPECT_TRUE(foo.Blah(GetParam()));
+}
+
+#endif  // 0
+
+
+#if !GTEST_OS_SYMBIAN
+# include <utility>
+#endif
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*.  Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vladl@google.com (Vlad Losev)
+
+// Type and function utilities for implementing parameterized tests.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+
+#include <iterator>
+#include <utility>
+#include <vector>
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*.  Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+// Copyright 2003 Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Dan Egnor (egnor@google.com)
+//
+// A "smart" pointer type with reference tracking.  Every pointer to a
+// particular object is kept on a circular linked list.  When the last pointer
+// to an object is destroyed or reassigned, the object is deleted.
+//
+// Used properly, this deletes the object when the last reference goes away.
+// There are several caveats:
+// - Like all reference counting schemes, cycles lead to leaks.
+// - Each smart pointer is actually two pointers (8 bytes instead of 4).
+// - Every time a pointer is assigned, the entire list of pointers to that
+//   object is traversed.  This class is therefore NOT SUITABLE when there
+//   will often be more than two or three pointers to a particular object.
+// - References are only tracked as long as linked_ptr<> objects are copied.
+//   If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS
+//   will happen (double deletion).
+//
+// A good use of this class is storing object references in STL containers.
+// You can safely put linked_ptr<> in a vector<>.
+// Other uses may not be as good.
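+//
+// For instance (an illustrative sketch, not part of the original comment;
+// Foo is a hypothetical type):
+//
+//   std::vector< linked_ptr<Foo> > foos;
+//   foos.push_back(make_linked_ptr(new Foo(1)));  // shared ownership
+//   foos.push_back(foos[0]);                      // copying is safe
+//   // The single Foo is deleted when the last linked_ptr to it goes away.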
+//
+// Note: If you use an incomplete type with linked_ptr<>, the class
+// *containing* linked_ptr<> must have a constructor and destructor (even
+// if they do nothing!).
+//
+// Bill Gibbons suggested we use something like this.
+//
+// Thread Safety:
+//   Unlike other linked_ptr implementations, in this implementation
+//   a linked_ptr object is thread-safe in the sense that:
+//     - it's safe to copy linked_ptr objects concurrently,
+//     - it's safe to copy *from* a linked_ptr and read its underlying
+//       raw pointer (e.g. via get()) concurrently, and
+//     - it's safe to write to two linked_ptrs that point to the same
+//       shared object concurrently.
+// TODO(wan@google.com): rename this to safe_linked_ptr to avoid
+// confusion with normal linked_ptr.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+
+#include <stdlib.h>
+#include <assert.h>
+
+
+namespace testing {
+namespace internal {
+
+// Protects copying of all linked_ptr objects.
+GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// This is used internally by all instances of linked_ptr<>.  It needs to be
+// a non-template class because different types of linked_ptr<> can refer to
+// the same object (linked_ptr<Superclass>(obj) vs linked_ptr<Subclass>(obj)).
+// Different types of linked_ptr must therefore be able to participate in
+// the same circular linked list, so we need a single class type here.
+//
+// DO NOT USE THIS CLASS DIRECTLY YOURSELF.  Use linked_ptr<T>.
+class linked_ptr_internal {
+ public:
+  // Create a new circle that includes only this instance.
+  void join_new() {
+    next_ = this;
+  }
+
+  // Many linked_ptr operations may change p.link_ for some linked_ptr
+  // variable p in the same circle as this object.  Therefore we need
+  // to prevent two such operations from occurring concurrently.
+  //
+  // Note that different types of linked_ptr objects can coexist in a
+  // circle (e.g. linked_ptr<Base>, linked_ptr<Derived1>, and
+  // linked_ptr<Derived2>).  Therefore we must use a single mutex to
+  // protect all linked_ptr objects.  This can create serious
+  // contention in production code, but is acceptable in a testing
+  // framework.
+
+  // Join an existing circle.
+  void join(linked_ptr_internal const* ptr)
+      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
+    MutexLock lock(&g_linked_ptr_mutex);
+
+    linked_ptr_internal const* p = ptr;
+    while (p->next_ != ptr) p = p->next_;
+    p->next_ = this;
+    next_ = ptr;
+  }
+
+  // Leave whatever circle we're part of.  Returns true if we were the
+  // last member of the circle.  Once this is done, you can join() another.
+  bool depart()
+      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
+    MutexLock lock(&g_linked_ptr_mutex);
+
+    if (next_ == this) return true;
+    linked_ptr_internal const* p = next_;
+    while (p->next_ != this) p = p->next_;
+    p->next_ = next_;
+    return false;
+  }
+
+ private:
+  mutable linked_ptr_internal const* next_;
+};
+
+template <typename T>
+class linked_ptr {
+ public:
+  typedef T element_type;
+
+  // Take over ownership of a raw pointer.  This should happen as soon as
+  // possible after the object is created.
+  explicit linked_ptr(T* ptr = NULL) { capture(ptr); }
+  ~linked_ptr() { depart(); }
+
+  // Copy an existing linked_ptr<>, adding ourselves to the list of references.
+  template <typename U> linked_ptr(linked_ptr<U> const& ptr) { copy(&ptr); }
+  linked_ptr(linked_ptr const& ptr) {  // NOLINT
+    assert(&ptr != this);
+    copy(&ptr);
+  }
+
+  // Assignment releases the old value and acquires the new.
+  template <typename U> linked_ptr& operator=(linked_ptr<U> const& ptr) {
+    depart();
+    copy(&ptr);
+    return *this;
+  }
+
+  linked_ptr& operator=(linked_ptr const& ptr) {
+    if (&ptr != this) {
+      depart();
+      copy(&ptr);
+    }
+    return *this;
+  }
+
+  // Smart pointer members.
+  void reset(T* ptr = NULL) {
+    depart();
+    capture(ptr);
+  }
+  T* get() const { return value_; }
+  T* operator->() const { return value_; }
+  T& operator*() const { return *value_; }
+
+  bool operator==(T* p) const { return value_ == p; }
+  bool operator!=(T* p) const { return value_ != p; }
+  template <typename U>
+  bool operator==(linked_ptr<U> const& ptr) const {
+    return value_ == ptr.get();
+  }
+  template <typename U>
+  bool operator!=(linked_ptr<U> const& ptr) const {
+    return value_ != ptr.get();
+  }
+
+ private:
+  template <typename U>
+  friend class linked_ptr;
+
+  T* value_;
+  linked_ptr_internal link_;
+
+  void depart() {
+    if (link_.depart()) delete value_;
+  }
+
+  void capture(T* ptr) {
+    value_ = ptr;
+    link_.join_new();
+  }
+
+  template <typename U> void copy(linked_ptr<U> const* ptr) {
+    value_ = ptr->get();
+    if (value_)
+      link_.join(&ptr->link_);
+    else
+      link_.join_new();
+  }
+};
+
+template<typename T> inline
+bool operator==(T* ptr, const linked_ptr<T>& x) {
+  return ptr == x.get();
+}
+
+template<typename T> inline
+bool operator!=(T* ptr, const linked_ptr<T>& x) {
+  return ptr != x.get();
+}
+
+// A function to convert T* into linked_ptr<T>
+// Doing e.g. make_linked_ptr(new FooBarBaz<type>(arg)) is a shorter notation
+// for linked_ptr<FooBarBaz<type> >(new FooBarBaz<type>(arg))
+template <typename T>
+linked_ptr<T> make_linked_ptr(T* ptr) {
+  return linked_ptr<T>(ptr);
+}
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Google Test - The Google C++ Testing Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// A user can teach this function how to print a class type T by
+// defining either operator<<() or PrintTo() in the namespace that
+// defines T.  More specifically, the FIRST defined function in the
+// following list will be used (assuming T is defined in namespace
+// foo):
+//
+//   1. foo::PrintTo(const T&, ostream*)
+//   2. operator<<(ostream&, const T&) defined in either foo or the
+//      global namespace.
+//
+// If none of the above is defined, it will print the debug string of
+// the value if it is a protocol buffer, or print the raw bytes in the
+// value otherwise.
+//
+// To aid debugging: when T is a reference type, the address of the
+// value is also printed; when T is a (const) char pointer, both the
+// pointer value and the NUL-terminated string it points to are
+// printed.
+//
+// We also provide some convenient wrappers:
+//
+//   // Prints a value to a string.  For a (const or not) char
+//   // pointer, the NUL-terminated string (but not the pointer) is
+//   // printed.
+//   std::string ::testing::PrintToString(const T& value);
+//
+//   // Prints a value tersely: for a reference type, the referenced
+//   // value (but not the address) is printed; for a (const or not) char
+//   // pointer, the NUL-terminated string (but not the pointer) is
+//   // printed.
+//   void ::testing::internal::UniversalTersePrint(const T& value, ostream*);
+//
+//   // Prints value using the type inferred by the compiler.  The difference
+//   // from UniversalTersePrint() is that this function prints both the
+//   // pointer and the NUL-terminated string for a (const or not) char pointer.
+//   void ::testing::internal::UniversalPrint(const T& value, ostream*);
+//
+//   // Prints the fields of a tuple tersely to a string vector, one
+//   // element for each field. Tuple support must be enabled in
+//   // gtest-port.h.
+//   std::vector<string> UniversalTersePrintTupleFieldsToStrings(
+//       const Tuple& value);
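+//
+// A brief illustration (an assumption, not part of the original comment):
+//
+//   struct Point { int x; int y; };                // hypothetical user type
+//   Point p = {1, 2};
+//   std::string s = ::testing::PrintToString(p);   // a raw byte dump, unless
+//                                                   // Point gets operator<< or
+//                                                   // a PrintTo() overload.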
+//
+// Known limitation:
+//
+// The print primitives print the elements of an STL-style container
+// using the compiler-inferred type of *iter where iter is a
+// const_iterator of the container.  When const_iterator is an input
+// iterator but not a forward iterator, this inferred type may not
+// match value_type, and the print output may be incorrect.  In
+// practice, this is rarely a problem as for most containers
+// const_iterator is a forward iterator.  We'll fix this if there's an
+// actual need for it.  Note that this fix cannot rely on value_type
+// being defined as many user-defined container types don't have
+// value_type.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace testing {
+
+// Definitions in the 'internal' and 'internal2' name spaces are
+// subject to change without notice.  DO NOT USE THEM IN USER CODE!
+namespace internal2 {
+
+// Prints the given number of bytes in the given object to the given
+// ostream.
+GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
+                                     size_t count,
+                                     ::std::ostream* os);
+
+// For selecting which printer to use when a given type has neither <<
+// nor PrintTo().
+enum TypeKind {
+  kProtobuf,              // a protobuf type
+  kConvertibleToInteger,  // a type implicitly convertible to BiggestInt
+                          // (e.g. a named or unnamed enum type)
+  kOtherType              // anything else
+};
+
+// TypeWithoutFormatter<T, kTypeKind>::PrintValue(value, os) is called
+// by the universal printer to print a value of type T when neither
+// operator<< nor PrintTo() is defined for T, where kTypeKind is the
+// "kind" of T as defined by enum TypeKind.
+template <typename T, TypeKind kTypeKind>
+class TypeWithoutFormatter {
+ public:
+  // This default version is called when kTypeKind is kOtherType.
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    PrintBytesInObjectTo(reinterpret_cast<const unsigned char*>(&value),
+                         sizeof(value), os);
+  }
+};
+
+// We print a protobuf using its ShortDebugString() when the string
+// doesn't exceed this many characters; otherwise we print it using
+// DebugString() for better readability.
+const size_t kProtobufOneLinerMaxLength = 50;
+
+template <typename T>
+class TypeWithoutFormatter<T, kProtobuf> {
+ public:
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    const ::testing::internal::string short_str = value.ShortDebugString();
+    const ::testing::internal::string pretty_str =
+        short_str.length() <= kProtobufOneLinerMaxLength ?
+        short_str : ("\n" + value.DebugString());
+    *os << ("<" + pretty_str + ">");
+  }
+};
+
+template <typename T>
+class TypeWithoutFormatter<T, kConvertibleToInteger> {
+ public:
+  // Since T has no << operator or PrintTo() but can be implicitly
+  // converted to BiggestInt, we print it as a BiggestInt.
+  //
+  // Most likely T is an enum type (either named or unnamed), in which
+  // case printing it as an integer is the desired behavior.  In case
+  // T is not an enum, printing it as an integer is the best we can do
+  // given that it has no user-defined printer.
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    const internal::BiggestInt kBigInt = value;
+    *os << kBigInt;
+  }
+};
+
+// Prints the given value to the given ostream.  If the value is a
+// protocol message, its debug string is printed; if it's an enum or
+// of a type implicitly convertible to BiggestInt, it's printed as an
+// integer; otherwise the bytes in the value are printed.  This is
+// what UniversalPrinter<T>::Print() does when it knows nothing about
+// type T and T has neither << operator nor PrintTo().
+//
+// A user can override this behavior for a class type Foo by defining
+// a << operator in the namespace where Foo is defined.
+//
+// We put this operator in namespace 'internal2' instead of 'internal'
+// to simplify the implementation, as much code in 'internal' needs to
+// use << in STL, which would conflict with our own << were it defined
+// in 'internal'.
+//
+// Note that this operator<< takes a generic std::basic_ostream<Char,
+// CharTraits> type instead of the more restricted std::ostream.  If
+// we define it to take an std::ostream instead, we'll get an
+// "ambiguous overloads" compiler error when trying to print a type
+// Foo that supports streaming to std::basic_ostream<Char,
+// CharTraits>, as the compiler cannot tell whether
+// operator<<(std::ostream&, const T&) or
+// operator<<(std::basic_stream<Char, CharTraits>, const Foo&) is more
+// specific.
+template <typename Char, typename CharTraits, typename T>
+::std::basic_ostream<Char, CharTraits>& operator<<(
+    ::std::basic_ostream<Char, CharTraits>& os, const T& x) {
+  TypeWithoutFormatter<T,
+      (internal::IsAProtocolMessage<T>::value ? kProtobuf :
+       internal::ImplicitlyConvertible<const T&, internal::BiggestInt>::value ?
+       kConvertibleToInteger : kOtherType)>::PrintValue(x, &os);
+  return os;
+}
+
+}  // namespace internal2
+}  // namespace testing
+
+// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up
+// magic needed for implementing UniversalPrinter won't work.
+namespace testing_internal {
+
+// Used to print a value that is not an STL-style container when the
+// user doesn't define PrintTo() for it.
+template <typename T>
+void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) {
+  // With the following statement, during unqualified name lookup,
+  // testing::internal2::operator<< appears as if it was declared in
+  // the nearest enclosing namespace that contains both
+  // ::testing_internal and ::testing::internal2, i.e. the global
+  // namespace.  For more details, refer to the C++ Standard section
+  // 7.3.4-1 [namespace.udir].  This allows us to fall back onto
+  // testing::internal2::operator<< in case T doesn't come with a <<
+  // operator.
+  //
+  // We cannot write 'using ::testing::internal2::operator<<;', which
+  // gcc 3.3 fails to compile due to a compiler bug.
+  using namespace ::testing::internal2;  // NOLINT
+
+  // Assuming T is defined in namespace foo, in the next statement,
+  // the compiler will consider all of:
+  //
+  //   1. foo::operator<< (thanks to Koenig look-up),
+  //   2. ::operator<< (as the current namespace is enclosed in ::),
+  //   3. testing::internal2::operator<< (thanks to the using statement above).
+  //
+  // The operator<< whose type matches T best will be picked.
+  //
+  // We deliberately allow #2 to be a candidate, as sometimes it's
+  // impossible to define #1 (e.g. when foo is ::std, defining
+  // anything in it is undefined behavior unless you are a compiler
+  //   vendor).
+  *os << value;
+}
+
+}  // namespace testing_internal
+
+namespace testing {
+namespace internal {
+
+// UniversalPrinter<T>::Print(value, ostream_ptr) prints the given
+// value to the given ostream.  The caller must ensure that
+// 'ostream_ptr' is not NULL, or the behavior is undefined.
+//
+// We define UniversalPrinter as a class template (as opposed to a
+// function template), as we need to partially specialize it for
+// reference types, which cannot be done with function templates.
+template <typename T>
+class UniversalPrinter;
+
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os);
+
+// Used to print an STL-style container when the user doesn't define
+// a PrintTo() for it.
+template <typename C>
+void DefaultPrintTo(IsContainer /* dummy */,
+                    false_type /* is not a pointer */,
+                    const C& container, ::std::ostream* os) {
+  const size_t kMaxCount = 32;  // The maximum number of elements to print.
+  *os << '{';
+  size_t count = 0;
+  for (typename C::const_iterator it = container.begin();
+       it != container.end(); ++it, ++count) {
+    if (count > 0) {
+      *os << ',';
+      if (count == kMaxCount) {  // Enough has been printed.
+        *os << " ...";
+        break;
+      }
+    }
+    *os << ' ';
+    // We cannot call PrintTo(*it, os) here as PrintTo() doesn't
+    // handle *it being a native array.
+    internal::UniversalPrint(*it, os);
+  }
+
+  if (count > 0) {
+    *os << ' ';
+  }
+  *os << '}';
+}
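+
+// Illustrative note (an assumption, not part of the original header): with
+// this overload a std::vector<int> holding 1, 2, 3 prints as "{ 1, 2, 3 }",
+// and containers longer than kMaxCount elements are truncated with "...".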
+
+// Used to print a pointer that is neither a char pointer nor a member
+// pointer, when the user doesn't define PrintTo() for it.  (A member
+// variable pointer or member function pointer doesn't really point to
+// a location in the address space.  Their representation is
+// implementation-defined.  Therefore they will be printed as raw
+// bytes.)
+template <typename T>
+void DefaultPrintTo(IsNotContainer /* dummy */,
+                    true_type /* is a pointer */,
+                    T* p, ::std::ostream* os) {
+  if (p == NULL) {
+    *os << "NULL";
+  } else {
+    // C++ doesn't allow casting from a function pointer to any object
+    // pointer.
+    //
+    // IsTrue() silences warnings: "Condition is always true",
+    // "unreachable code".
+    if (IsTrue(ImplicitlyConvertible<T*, const void*>::value)) {
+      // T is not a function type.  We just call << to print p,
+      // relying on ADL to pick up user-defined << for their pointer
+      // types, if any.
+      *os << p;
+    } else {
+      // T is a function type, so '*os << p' doesn't do what we want
+      // (it just prints p as bool).  We want to print p as a const
+      // void*.  However, we cannot cast it to const void* directly,
+      // even using reinterpret_cast, as earlier versions of gcc
+      // (e.g. 3.4.5) cannot compile the cast when p is a function
+      // pointer.  Casting to UInt64 first solves the problem.
+      *os << reinterpret_cast<const void*>(
+          reinterpret_cast<internal::UInt64>(p));
+    }
+  }
+}
+
+// Used to print a non-container, non-pointer value when the user
+// doesn't define PrintTo() for it.
+template <typename T>
+void DefaultPrintTo(IsNotContainer /* dummy */,
+                    false_type /* is not a pointer */,
+                    const T& value, ::std::ostream* os) {
+  ::testing_internal::DefaultPrintNonContainerTo(value, os);
+}
+
+// Prints the given value using the << operator if it has one;
+// otherwise prints the bytes in it.  This is what
+// UniversalPrinter<T>::Print() does when PrintTo() is not specialized
+// or overloaded for type T.
+//
+// A user can override this behavior for a class type Foo by defining
+// an overload of PrintTo() in the namespace where Foo is defined.  We
+// give the user this option as sometimes defining a << operator for
+// Foo is not desirable (e.g. the coding style may prevent doing it,
+// or there is already a << operator but it doesn't do what the user
+// wants).
+template <typename T>
+void PrintTo(const T& value, ::std::ostream* os) {
+  // DefaultPrintTo() is overloaded.  The type of its first two
+  // arguments determine which version will be picked.  If T is an
+  // STL-style container, the version for container will be called; if
+  // T is a pointer, the pointer version will be called; otherwise the
+  // generic version will be called.
+  //
+  // Note that we check for container types here, before we check
+  // for protocol message types in our operator<<.  The rationale is:
+  //
+  // For protocol messages, we want to give people a chance to
+  // override Google Mock's format by defining a PrintTo() or
+  // operator<<.  For STL containers, other formats can be
+  // incompatible with Google Mock's format for the container
+  // elements; therefore we check for container types here to ensure
+  // that our format is used.
+  //
+  // The second argument of DefaultPrintTo() is needed to bypass a bug
+  // in Symbian's C++ compiler that prevents it from picking the right
+  // overload between:
+  //
+  //   PrintTo(const T& x, ...);
+  //   PrintTo(T* x, ...);
+  DefaultPrintTo(IsContainerTest<T>(0), is_pointer<T>(), value, os);
+}
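+
+// For example, printing of a hypothetical user type foo::Bar (the names and
+// the DebugString() method below are illustrative only) can be customized by
+// defining, in namespace foo:
+//
+//   void PrintTo(const Bar& bar, ::std::ostream* os) {
+//     *os << "Bar(" << bar.DebugString() << ")";
+//   }
+//
+// Argument-dependent lookup then makes this overload visible to
+// UniversalPrinter<Bar>::Print() below.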
+
+// The following list of PrintTo() overloads tells
+// UniversalPrinter<T>::Print() how to print standard types (built-in
+// types, strings, plain arrays, and pointers).
+
+// Overloads for various char types.
+GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os);
+GTEST_API_ void PrintTo(signed char c, ::std::ostream* os);
+inline void PrintTo(char c, ::std::ostream* os) {
+  // When printing a plain char, we always treat it as unsigned.  This
+  // way, the output won't be affected by whether the compiler thinks
+  // char is signed or not.
+  PrintTo(static_cast<unsigned char>(c), os);
+}
+
+// Overloads for other simple built-in types.
+inline void PrintTo(bool x, ::std::ostream* os) {
+  *os << (x ? "true" : "false");
+}
+
+// Overload for wchar_t type.
+// Prints a wchar_t as a symbol if it is printable, or as its internal
+// code otherwise, and also as its decimal code (except for L'\0').
+// The L'\0' char is printed as "L'\\0'". The decimal code is printed
+// as a signed integer when the compiler implements wchar_t as a signed
+// type, and as an unsigned integer when wchar_t is implemented as an
+// unsigned type.
+GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os);
+
+// Overloads for C strings.
+GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
+inline void PrintTo(char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char*>(s), os);
+}
+
+// signed/unsigned char is often used for representing binary data, so
+// we print pointers to it as void* to be safe.
+inline void PrintTo(const signed char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(signed char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(const unsigned char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(unsigned char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+
+// MSVC can be configured to define wchar_t as a typedef of unsigned
+// short.  It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native
+// type.  When wchar_t is a typedef, defining an overload for const
+// wchar_t* would cause unsigned short* be printed as a wide string,
+// possibly causing invalid memory accesses.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Overloads for wide C strings
+GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os);
+inline void PrintTo(wchar_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const wchar_t*>(s), os);
+}
+#endif
+
+// Overload for C arrays.  Multi-dimensional arrays are printed
+// properly.
+
+// Prints the given number of elements in an array, without printing
+// the curly braces.
+template <typename T>
+void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
+  UniversalPrint(a[0], os);
+  for (size_t i = 1; i != count; i++) {
+    *os << ", ";
+    UniversalPrint(a[i], os);
+  }
+}
+
+// Overloads for ::string and ::std::string.
+#if GTEST_HAS_GLOBAL_STRING
+GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os);
+inline void PrintTo(const ::string& s, ::std::ostream* os) {
+  PrintStringTo(s, os);
+}
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os);
+inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
+  PrintStringTo(s, os);
+}
+
+// Overloads for ::wstring and ::std::wstring.
+#if GTEST_HAS_GLOBAL_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os);
+inline void PrintTo(const ::wstring& s, ::std::ostream* os) {
+  PrintWideStringTo(s, os);
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os);
+inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
+  PrintWideStringTo(s, os);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_TR1_TUPLE
+// Overload for ::std::tr1::tuple.  Needed for printing function arguments,
+// which are packed as tuples.
+
+// Helper function for printing a tuple.  T must be instantiated with
+// a tuple type.
+template <typename T>
+void PrintTupleTo(const T& t, ::std::ostream* os);
+
+// Overloaded PrintTo() for tuples of various arities.  We support
+// tuples of up to 10 fields.  The following implementation works
+// regardless of whether tr1::tuple is implemented using the
+// non-standard variadic template feature or not.
+
+inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1>
+void PrintTo(const ::std::tr1::tuple<T1>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2>
+void PrintTo(const ::std::tr1::tuple<T1, T2>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8, typename T9>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8, typename T9, typename T10>
+void PrintTo(
+    const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>& t,
+    ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Overload for std::pair.
+template <typename T1, typename T2>
+void PrintTo(const ::std::pair<T1, T2>& value, ::std::ostream* os) {
+  *os << '(';
+  // We cannot use UniversalPrint(value.first, os) here, as T1 may be
+  // a reference type.  The same for printing value.second.
+  UniversalPrinter<T1>::Print(value.first, os);
+  *os << ", ";
+  UniversalPrinter<T2>::Print(value.second, os);
+  *os << ')';
+}
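+
+// For instance, ::std::make_pair(1, ::std::string("hi")) would be printed
+// roughly as (1, "hi"), with the string element quoted by PrintStringTo().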
+
+// Implements printing a non-reference type T by letting the compiler
+// pick the right overload of PrintTo() for T.
+template <typename T>
+class UniversalPrinter {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4180)  // Temporarily disables warning 4180.
+#endif  // _MSC_VER
+
+  // Note: we deliberately don't call this PrintTo(), as that name
+  // conflicts with ::testing::internal::PrintTo in the body of the
+  // function.
+  static void Print(const T& value, ::std::ostream* os) {
+    // By default, ::testing::internal::PrintTo() is used for printing
+    // the value.
+    //
+    // Thanks to Koenig look-up, if T is a class and has its own
+    // PrintTo() function defined in its namespace, that function will
+    // be visible here.  Since it is more specific than the generic ones
+    // in ::testing::internal, it will be picked by the compiler in the
+    // following statement - exactly what we want.
+    PrintTo(value, os);
+  }
+
+#ifdef _MSC_VER
+# pragma warning(pop)           // Restores the warning state.
+#endif  // _MSC_VER
+};
+
+// UniversalPrintArray(begin, len, os) prints an array of 'len'
+// elements, starting at address 'begin'.
+template <typename T>
+void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
+  if (len == 0) {
+    *os << "{}";
+  } else {
+    *os << "{ ";
+    const size_t kThreshold = 18;
+    const size_t kChunkSize = 8;
+    // If the array has more than kThreshold elements, we'll have to
+    // omit some details by printing only the first and the last
+    // kChunkSize elements.
+    // TODO(wan@google.com): let the user control the threshold using a flag.
+    if (len <= kThreshold) {
+      PrintRawArrayTo(begin, len, os);
+    } else {
+      PrintRawArrayTo(begin, kChunkSize, os);
+      *os << ", ..., ";
+      PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os);
+    }
+    *os << " }";
+  }
+}
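+
+// For example, with the constants above an array of 20 ints numbered 1..20
+// would print roughly as
+//
+//   { 1, 2, 3, 4, 5, 6, 7, 8, ..., 13, 14, 15, 16, 17, 18, 19, 20 }
+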
+// This overload prints a (const) char array compactly.
+GTEST_API_ void UniversalPrintArray(
+    const char* begin, size_t len, ::std::ostream* os);
+
+// This overload prints a (const) wchar_t array compactly.
+GTEST_API_ void UniversalPrintArray(
+    const wchar_t* begin, size_t len, ::std::ostream* os);
+
+// Implements printing an array type T[N].
+template <typename T, size_t N>
+class UniversalPrinter<T[N]> {
+ public:
+  // Prints the given array, omitting some elements when there are too
+  // many.
+  static void Print(const T (&a)[N], ::std::ostream* os) {
+    UniversalPrintArray(a, N, os);
+  }
+};
+
+// Implements printing a reference type T&.
+template <typename T>
+class UniversalPrinter<T&> {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4180)  // Temporarily disables warning 4180.
+#endif  // _MSC_VER
+
+  static void Print(const T& value, ::std::ostream* os) {
+    // Prints the address of the value.  We use reinterpret_cast here
+    // as static_cast doesn't compile when T is a function type.
+    *os << "@" << reinterpret_cast<const void*>(&value) << " ";
+
+    // Then prints the value itself.
+    UniversalPrint(value, os);
+  }
+
+#ifdef _MSC_VER
+# pragma warning(pop)           // Restores the warning state.
+#endif  // _MSC_VER
+};
+
+// Prints a value tersely: for a reference type, the referenced value
+// (but not the address) is printed; for a (const) char pointer, the
+// NUL-terminated string (but not the pointer) is printed.
+
+template <typename T>
+class UniversalTersePrinter {
+ public:
+  static void Print(const T& value, ::std::ostream* os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T>
+class UniversalTersePrinter<T&> {
+ public:
+  static void Print(const T& value, ::std::ostream* os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T, size_t N>
+class UniversalTersePrinter<T[N]> {
+ public:
+  static void Print(const T (&value)[N], ::std::ostream* os) {
+    UniversalPrinter<T[N]>::Print(value, os);
+  }
+};
+template <>
+class UniversalTersePrinter<const char*> {
+ public:
+  static void Print(const char* str, ::std::ostream* os) {
+    if (str == NULL) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char*> {
+ public:
+  static void Print(char* str, ::std::ostream* os) {
+    UniversalTersePrinter<const char*>::Print(str, os);
+  }
+};
+
+#if GTEST_HAS_STD_WSTRING
+template <>
+class UniversalTersePrinter<const wchar_t*> {
+ public:
+  static void Print(const wchar_t* str, ::std::ostream* os) {
+    if (str == NULL) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::wstring(str), os);
+    }
+  }
+};
+#endif
+
+template <>
+class UniversalTersePrinter<wchar_t*> {
+ public:
+  static void Print(wchar_t* str, ::std::ostream* os) {
+    UniversalTersePrinter<const wchar_t*>::Print(str, os);
+  }
+};
+
+template <typename T>
+void UniversalTersePrint(const T& value, ::std::ostream* os) {
+  UniversalTersePrinter<T>::Print(value, os);
+}
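+
+// For a const char* str pointing at "hi", UniversalTersePrint(str, os) emits
+// only the quoted string ("hi"), whereas UniversalPrint() below additionally
+// includes the pointer value, roughly: 0x<address> pointing to "hi".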
+
+// Prints a value using the type inferred by the compiler.  The
+// difference between this and UniversalTersePrint() is that for a
+// (const) char pointer, this prints both the pointer and the
+// NUL-terminated string.
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os) {
+  // A workaround for the bug in VC++ 7.1 that prevents us from instantiating
+  // UniversalPrinter with T directly.
+  typedef T T1;
+  UniversalPrinter<T1>::Print(value, os);
+}
+
+#if GTEST_HAS_TR1_TUPLE
+typedef ::std::vector<string> Strings;
+
+// This helper template allows PrintTo() for tuples and
+// UniversalTersePrintTupleFieldsToStrings() to be defined by
+// induction on the number of tuple fields.  The idea is that
+// TuplePrefixPrinter<N>::PrintPrefixTo(t, os) prints the first N
+// fields in tuple t, and can be defined in terms of
+// TuplePrefixPrinter<N - 1>.
+
+// The inductive case.
+template <size_t N>
+struct TuplePrefixPrinter {
+  // Prints the first N fields of a tuple.
+  template <typename Tuple>
+  static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) {
+    TuplePrefixPrinter<N - 1>::PrintPrefixTo(t, os);
+    *os << ", ";
+    UniversalPrinter<typename ::std::tr1::tuple_element<N - 1, Tuple>::type>
+        ::Print(::std::tr1::get<N - 1>(t), os);
+  }
+
+  // Tersely prints the first N fields of a tuple to a string vector,
+  // one element for each field.
+  template <typename Tuple>
+  static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) {
+    TuplePrefixPrinter<N - 1>::TersePrintPrefixToStrings(t, strings);
+    ::std::stringstream ss;
+    UniversalTersePrint(::std::tr1::get<N - 1>(t), &ss);
+    strings->push_back(ss.str());
+  }
+};
+
+// Base cases.
+template <>
+struct TuplePrefixPrinter<0> {
+  template <typename Tuple>
+  static void PrintPrefixTo(const Tuple&, ::std::ostream*) {}
+
+  template <typename Tuple>
+  static void TersePrintPrefixToStrings(const Tuple&, Strings*) {}
+};
+// We have to specialize the entire TuplePrefixPrinter<> class
+// template here, even though the definition of
+// TersePrintPrefixToStrings() is the same as the generic version, as
+// Embarcadero (formerly CodeGear, formerly Borland) C++ doesn't
+// support specializing a method template of a class template.
+template <>
+struct TuplePrefixPrinter<1> {
+  template <typename Tuple>
+  static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) {
+    UniversalPrinter<typename ::std::tr1::tuple_element<0, Tuple>::type>::
+        Print(::std::tr1::get<0>(t), os);
+  }
+
+  template <typename Tuple>
+  static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) {
+    ::std::stringstream ss;
+    UniversalTersePrint(::std::tr1::get<0>(t), &ss);
+    strings->push_back(ss.str());
+  }
+};
+
+// Helper function for printing a tuple.  T must be instantiated with
+// a tuple type.
+template <typename T>
+void PrintTupleTo(const T& t, ::std::ostream* os) {
+  *os << "(";
+  TuplePrefixPrinter< ::std::tr1::tuple_size<T>::value>::
+      PrintPrefixTo(t, os);
+  *os << ")";
+}
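+
+// For instance, a ::std::tr1::tuple<int, char> holding (1, 'a') would be
+// printed roughly as (1, 'a' (97)), using the element printers defined above.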
+
+// Prints the fields of a tuple tersely to a string vector, one
+// element for each field.  See the comment before
+// UniversalTersePrint() for how we define "tersely".
+template <typename Tuple>
+Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) {
+  Strings result;
+  TuplePrefixPrinter< ::std::tr1::tuple_size<Tuple>::value>::
+      TersePrintPrefixToStrings(value, &result);
+  return result;
+}
+#endif  // GTEST_HAS_TR1_TUPLE
+
+}  // namespace internal
+
+template <typename T>
+::std::string PrintToString(const T& value) {
+  ::std::stringstream ss;
+  internal::UniversalTersePrinter<T>::Print(value, &ss);
+  return ss.str();
+}
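+
+// A sketch of typical usage (the exact formatting is determined by the
+// printers defined earlier in this header):
+//
+//   ::std::vector<int> v;
+//   v.push_back(1);
+//   v.push_back(2);
+//   EXPECT_EQ("{ 1, 2 }", ::testing::PrintToString(v));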
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+namespace internal {
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Outputs a message explaining invalid registration of different
+// fixture class for the same test case. This may happen when
+// TEST_P macro is used to define two tests with the same name
+// but in different namespaces.
+GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name,
+                                          const char* file, int line);
+
+template <typename> class ParamGeneratorInterface;
+template <typename> class ParamGenerator;
+
+// Interface for iterating over elements provided by an implementation
+// of ParamGeneratorInterface<T>.
+template <typename T>
+class ParamIteratorInterface {
+ public:
+  virtual ~ParamIteratorInterface() {}
+  // A pointer to the base generator instance.
+  // Used only for the purposes of iterator comparison
+  // to make sure that two iterators belong to the same generator.
+  virtual const ParamGeneratorInterface<T>* BaseGenerator() const = 0;
+  // Advances iterator to point to the next element
+  // provided by the generator. The caller is responsible
+  // for not calling Advance() on an iterator equal to
+  // BaseGenerator()->End().
+  virtual void Advance() = 0;
+  // Clones the iterator object. Used for implementing copy semantics
+  // of ParamIterator<T>.
+  virtual ParamIteratorInterface* Clone() const = 0;
+  // Dereferences the current iterator and provides (read-only) access
+  // to the pointed value. It is the caller's responsibility not to call
+  // Current() on an iterator equal to BaseGenerator()->End().
+  // Used for implementing ParamGenerator<T>::operator*().
+  virtual const T* Current() const = 0;
+  // Determines whether the given iterator and other point to the same
+  // element in the sequence generated by the generator.
+  // Used for implementing ParamGenerator<T>::operator==().
+  virtual bool Equals(const ParamIteratorInterface& other) const = 0;
+};
+
+// Class iterating over elements provided by an implementation of
+// ParamGeneratorInterface<T>. It wraps ParamIteratorInterface<T>
+// and implements the const forward iterator concept.
+template <typename T>
+class ParamIterator {
+ public:
+  typedef T value_type;
+  typedef const T& reference;
+  typedef ptrdiff_t difference_type;
+
+  // ParamIterator assumes ownership of the impl_ pointer.
+  ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
+  ParamIterator& operator=(const ParamIterator& other) {
+    if (this != &other)
+      impl_.reset(other.impl_->Clone());
+    return *this;
+  }
+
+  const T& operator*() const { return *impl_->Current(); }
+  const T* operator->() const { return impl_->Current(); }
+  // Prefix version of operator++.
+  ParamIterator& operator++() {
+    impl_->Advance();
+    return *this;
+  }
+  // Postfix version of operator++.
+  ParamIterator operator++(int /*unused*/) {
+    ParamIteratorInterface<T>* clone = impl_->Clone();
+    impl_->Advance();
+    return ParamIterator(clone);
+  }
+  bool operator==(const ParamIterator& other) const {
+    return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_);
+  }
+  bool operator!=(const ParamIterator& other) const {
+    return !(*this == other);
+  }
+
+ private:
+  friend class ParamGenerator<T>;
+  explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
+  scoped_ptr<ParamIteratorInterface<T> > impl_;
+};
+
+// ParamGeneratorInterface<T> is the binary interface to access generators
+// defined in other translation units.
+template <typename T>
+class ParamGeneratorInterface {
+ public:
+  typedef T ParamType;
+
+  virtual ~ParamGeneratorInterface() {}
+
+  // Generator interface definition
+  virtual ParamIteratorInterface<T>* Begin() const = 0;
+  virtual ParamIteratorInterface<T>* End() const = 0;
+};
+
+// Wraps ParamGeneratorInterface<T> and provides general generator syntax
+// compatible with the STL Container concept.
+// This class implements copy initialization semantics and the contained
+// ParamGeneratorInterface<T> instance is shared among all copies
+// of the original object. This is possible because that instance is immutable.
+template<typename T>
+class ParamGenerator {
+ public:
+  typedef ParamIterator<T> iterator;
+
+  explicit ParamGenerator(ParamGeneratorInterface<T>* impl) : impl_(impl) {}
+  ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {}
+
+  ParamGenerator& operator=(const ParamGenerator& other) {
+    impl_ = other.impl_;
+    return *this;
+  }
+
+  iterator begin() const { return iterator(impl_->Begin()); }
+  iterator end() const { return iterator(impl_->End()); }
+
+ private:
+  linked_ptr<const ParamGeneratorInterface<T> > impl_;
+};
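+
+// For illustration, code holding a ParamGenerator<int> gen (however it was
+// obtained) can walk the generated sequence with the usual iterator idiom:
+//
+//   for (ParamGenerator<int>::iterator it = gen.begin();
+//        it != gen.end(); ++it) {
+//     const int value = *it;
+//     ...
+//   }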
+
+// Generates values from a range of two comparable values. Can be used to
+// generate sequences of user-defined types that implement operator+() and
+// operator<().
+// This class is used in the Range() function.
+template <typename T, typename IncrementT>
+class RangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  RangeGenerator(T begin, T end, IncrementT step)
+      : begin_(begin), end_(end),
+        step_(step), end_index_(CalculateEndIndex(begin, end, step)) {}
+  virtual ~RangeGenerator() {}
+
+  virtual ParamIteratorInterface<T>* Begin() const {
+    return new Iterator(this, begin_, 0, step_);
+  }
+  virtual ParamIteratorInterface<T>* End() const {
+    return new Iterator(this, end_, end_index_, step_);
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T>* base, T value, int index,
+             IncrementT step)
+        : base_(base), value_(value), index_(index), step_(step) {}
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
+      return base_;
+    }
+    virtual void Advance() {
+      value_ = value_ + step_;
+      index_++;
+    }
+    virtual ParamIteratorInterface<T>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const T* Current() const { return &value_; }
+    virtual bool Equals(const ParamIteratorInterface<T>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const int other_index =
+          CheckedDowncastToActualType<const Iterator>(&other)->index_;
+      return index_ == other_index;
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : ParamIteratorInterface<T>(),
+          base_(other.base_), value_(other.value_), index_(other.index_),
+          step_(other.step_) {}
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<T>* const base_;
+    T value_;
+    int index_;
+    const IncrementT step_;
+  };  // class RangeGenerator::Iterator
+
+  static int CalculateEndIndex(const T& begin,
+                               const T& end,
+                               const IncrementT& step) {
+    int end_index = 0;
+    for (T i = begin; i < end; i = i + step)
+      end_index++;
+    return end_index;
+  }
+
+  // No implementation - assignment is unsupported.
+  void operator=(const RangeGenerator& other);
+
+  const T begin_;
+  const T end_;
+  const IncrementT step_;
+  // The index for the end() iterator. All the elements in the generated
+  // sequence are indexed (0-based) to aid iterator comparison.
+  const int end_index_;
+};  // class RangeGenerator
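+
+// For example, Range(0, 10, 2), which is implemented in terms of this
+// generator, yields the sequence 0, 2, 4, 6, 8; the end value itself is
+// excluded, as CalculateEndIndex() above implies.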
+
+
+// Generates values from a pair of STL-style iterators. Used in the
+// ValuesIn() function. The elements are copied from the source range
+// since the source can be located on the stack, and the generator
+// is likely to persist beyond that stack frame.
+template <typename T>
+class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  template <typename ForwardIterator>
+  ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end)
+      : container_(begin, end) {}
+  virtual ~ValuesInIteratorRangeGenerator() {}
+
+  virtual ParamIteratorInterface<T>* Begin() const {
+    return new Iterator(this, container_.begin());
+  }
+  virtual ParamIteratorInterface<T>* End() const {
+    return new Iterator(this, container_.end());
+  }
+
+ private:
+  typedef typename ::std::vector<T> ContainerType;
+
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T>* base,
+             typename ContainerType::const_iterator iterator)
+        : base_(base), iterator_(iterator) {}
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
+      return base_;
+    }
+    virtual void Advance() {
+      ++iterator_;
+      value_.reset();
+    }
+    virtual ParamIteratorInterface<T>* Clone() const {
+      return new Iterator(*this);
+    }
+    // We need to use the cached value referenced by iterator_ because
+    // *iterator_ can return a temporary object (and of a type other than T),
+    // so just having "return &*iterator_;" doesn't work.
+    // value_ is updated here and not in Advance() because Advance()
+    // can advance iterator_ beyond the end of the range, and we cannot
+    // detect that fact. The client code, on the other hand, is
+    // responsible for not calling Current() on an out-of-range iterator.
+    virtual const T* Current() const {
+      if (value_.get() == NULL)
+        value_.reset(new T(*iterator_));
+      return value_.get();
+    }
+    virtual bool Equals(const ParamIteratorInterface<T>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      return iterator_ ==
+          CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
+    }
+
+   private:
+    Iterator(const Iterator& other)
+          // The explicit constructor call suppresses a false warning
+          // emitted by gcc when supplied with the -Wextra option.
+        : ParamIteratorInterface<T>(),
+          base_(other.base_),
+          iterator_(other.iterator_) {}
+
+    const ParamGeneratorInterface<T>* const base_;
+    typename ContainerType::const_iterator iterator_;
+    // A cached value of *iterator_. We keep it here to allow access by
+    // pointer in the wrapping iterator's operator->().
+    // value_ needs to be mutable to be accessed in Current().
+    // Use of scoped_ptr helps manage cached value's lifetime,
+    // which is bound by the lifespan of the iterator itself.
+    mutable scoped_ptr<const T> value_;
+  };  // class ValuesInIteratorRangeGenerator::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const ValuesInIteratorRangeGenerator& other);
+
+  const ContainerType container_;
+};  // class ValuesInIteratorRangeGenerator
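+
+// For example, the ValuesIn() functions declared further below copy a
+// caller-provided range into this generator; a typical (illustrative) use
+// with a hypothetical fixture FooTest would be:
+//
+//   ::std::vector<int> v = GetTestValues();  // GetTestValues() is assumed.
+//   INSTANTIATE_TEST_CASE_P(MySeq, FooTest, ::testing::ValuesIn(v));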
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Stores a parameter value and later creates tests parameterized with that
+// value.
+template <class TestClass>
+class ParameterizedTestFactory : public TestFactoryBase {
+ public:
+  typedef typename TestClass::ParamType ParamType;
+  explicit ParameterizedTestFactory(ParamType parameter) :
+      parameter_(parameter) {}
+  virtual Test* CreateTest() {
+    TestClass::SetParam(&parameter_);
+    return new TestClass();
+  }
+
+ private:
+  const ParamType parameter_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactoryBase is a base class for meta-factories that create
+// test factories for passing into MakeAndRegisterTestInfo function.
+template <class ParamType>
+class TestMetaFactoryBase {
+ public:
+  virtual ~TestMetaFactoryBase() {}
+
+  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactory creates test factories for passing into the
+// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo takes
+// ownership of the test factory pointer, the same factory object cannot be
+// passed into that method twice. But ParameterizedTestCaseInfo is going to
+// call it for each Test/Parameter value combination. Thus it needs a
+// meta-factory creator class.
+template <class TestCase>
+class TestMetaFactory
+    : public TestMetaFactoryBase<typename TestCase::ParamType> {
+ public:
+  typedef typename TestCase::ParamType ParamType;
+
+  TestMetaFactory() {}
+
+  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) {
+    return new ParameterizedTestFactory<TestCase>(parameter);
+  }
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseInfoBase is a generic interface
+// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase
+// accumulates test information provided by TEST_P macro invocations
+// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations
+// and uses that information to register all resulting test instances
+// in its RegisterTests method. The ParameterizedTestCaseRegistry class holds
+// a collection of pointers to the ParameterizedTestCaseInfo objects
+// and calls RegisterTests() on each of them when asked.
+class ParameterizedTestCaseInfoBase {
+ public:
+  virtual ~ParameterizedTestCaseInfoBase() {}
+
+  // Base part of test case name for display purposes.
+  virtual const string& GetTestCaseName() const = 0;
+  // Test case id to verify identity.
+  virtual TypeId GetTestCaseTypeId() const = 0;
+  // The UnitTest class invokes this method to register tests in this
+  // test case right before running them in the RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestCaseInfoBase-derived class.
+  virtual void RegisterTests() = 0;
+
+ protected:
+  ParameterizedTestCaseInfoBase() {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P
+// macro invocations for a particular test case and generators
+// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that
+// test case. It registers tests with all values generated by all
+// generators when asked.
+template <class TestCase>
+class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase {
+ public:
+  // ParamType and GeneratorCreationFunc are private types but are required
+  // for declarations of public methods AddTestPattern() and
+  // AddTestCaseInstantiation().
+  typedef typename TestCase::ParamType ParamType;
+  // A function that returns an instance of appropriate generator type.
+  typedef ParamGenerator<ParamType>(GeneratorCreationFunc)();
+
+  explicit ParameterizedTestCaseInfo(const char* name)
+      : test_case_name_(name) {}
+
+  // Test case base name for display purposes.
+  virtual const string& GetTestCaseName() const { return test_case_name_; }
+  // Test case id to verify identity.
+  virtual TypeId GetTestCaseTypeId() const { return GetTypeId<TestCase>(); }
+  // TEST_P macro uses AddTestPattern() to record information
+  // about a single test in a TestInfo structure.
+  // test_case_name is the base name of the test case (without invocation
+  // prefix). test_base_name is the name of an individual test without the
+  // parameter index. For the test SequenceA/FooTest.DoBar/1, FooTest is the
+  // test case base name and DoBar is the test base name.
+  void AddTestPattern(const char* test_case_name,
+                      const char* test_base_name,
+                      TestMetaFactoryBase<ParamType>* meta_factory) {
+    tests_.push_back(linked_ptr<TestInfo>(new TestInfo(test_case_name,
+                                                       test_base_name,
+                                                       meta_factory)));
+  }
+  // INSTANTIATE_TEST_CASE_P macro uses AddTestCaseInstantiation() to record
+  // information about a generator.
+  int AddTestCaseInstantiation(const string& instantiation_name,
+                               GeneratorCreationFunc* func,
+                               const char* /* file */,
+                               int /* line */) {
+    instantiations_.push_back(::std::make_pair(instantiation_name, func));
+    return 0;  // Return value used only to run this method in namespace scope.
+  }
+  // The UnitTest class invokes this method to register tests in this test
+  // case right before running tests in the RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestCaseInfoBase-derived class.
+  // UnitTest has a guard to prevent calling this method more than once.
+  virtual void RegisterTests() {
+    for (typename TestInfoContainer::iterator test_it = tests_.begin();
+         test_it != tests_.end(); ++test_it) {
+      linked_ptr<TestInfo> test_info = *test_it;
+      for (typename InstantiationContainer::iterator gen_it =
+               instantiations_.begin(); gen_it != instantiations_.end();
+               ++gen_it) {
+        const string& instantiation_name = gen_it->first;
+        ParamGenerator<ParamType> generator((*gen_it->second)());
+
+        string test_case_name;
+        if ( !instantiation_name.empty() )
+          test_case_name = instantiation_name + "/";
+        test_case_name += test_info->test_case_base_name;
+
+        int i = 0;
+        for (typename ParamGenerator<ParamType>::iterator param_it =
+                 generator.begin();
+             param_it != generator.end(); ++param_it, ++i) {
+          Message test_name_stream;
+          test_name_stream << test_info->test_base_name << "/" << i;
+          MakeAndRegisterTestInfo(
+              test_case_name.c_str(),
+              test_name_stream.GetString().c_str(),
+              NULL,  // No type parameter.
+              PrintToString(*param_it).c_str(),
+              GetTestCaseTypeId(),
+              TestCase::SetUpTestCase,
+              TestCase::TearDownTestCase,
+              test_info->test_meta_factory->CreateTestFactory(*param_it));
+        }  // for param_it
+      }  // for gen_it
+    }  // for test_it
+  }  // RegisterTests
+
+ private:
+  // The TestInfo structure keeps information about a single test registered
+  // with the TEST_P macro.
+  struct TestInfo {
+    TestInfo(const char* a_test_case_base_name,
+             const char* a_test_base_name,
+             TestMetaFactoryBase<ParamType>* a_test_meta_factory) :
+        test_case_base_name(a_test_case_base_name),
+        test_base_name(a_test_base_name),
+        test_meta_factory(a_test_meta_factory) {}
+
+    const string test_case_base_name;
+    const string test_base_name;
+    const scoped_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory;
+  };
+  typedef ::std::vector<linked_ptr<TestInfo> > TestInfoContainer;
+  // Keeps pairs of <Instantiation name, Sequence generator creation function>
+  // received from INSTANTIATE_TEST_CASE_P macros.
+  typedef ::std::vector<std::pair<string, GeneratorCreationFunc*> >
+      InstantiationContainer;
+
+  const string test_case_name_;
+  TestInfoContainer tests_;
+  InstantiationContainer instantiations_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo);
+};  // class ParameterizedTestCaseInfo
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase
+// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P
+// macros use it to locate their corresponding ParameterizedTestCaseInfo
+// descriptors.
+class ParameterizedTestCaseRegistry {
+ public:
+  ParameterizedTestCaseRegistry() {}
+  ~ParameterizedTestCaseRegistry() {
+    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+         it != test_case_infos_.end(); ++it) {
+      delete *it;
+    }
+  }
+
+  // Looks up or creates and returns a structure containing information about
+  // tests and instantiations of a particular test case.
+  template <class TestCase>
+  ParameterizedTestCaseInfo<TestCase>* GetTestCasePatternHolder(
+      const char* test_case_name,
+      const char* file,
+      int line) {
+    ParameterizedTestCaseInfo<TestCase>* typed_test_info = NULL;
+    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+         it != test_case_infos_.end(); ++it) {
+      if ((*it)->GetTestCaseName() == test_case_name) {
+        if ((*it)->GetTestCaseTypeId() != GetTypeId<TestCase>()) {
+          // Complain about incorrect usage of Google Test facilities
+          // and terminate the program since we cannot guarantee correct
+          // test case setup and tear-down in this case.
+          ReportInvalidTestCaseType(test_case_name, file, line);
+          posix::Abort();
+        } else {
+          // At this point we are sure that the object we found is of the same
+          // type we are looking for, so we downcast it to that type
+          // without further checks.
+          typed_test_info = CheckedDowncastToActualType<
+              ParameterizedTestCaseInfo<TestCase> >(*it);
+        }
+        break;
+      }
+    }
+    if (typed_test_info == NULL) {
+      typed_test_info = new ParameterizedTestCaseInfo<TestCase>(test_case_name);
+      test_case_infos_.push_back(typed_test_info);
+    }
+    return typed_test_info;
+  }
+  void RegisterTests() {
+    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+         it != test_case_infos_.end(); ++it) {
+      (*it)->RegisterTests();
+    }
+  }
+
+ private:
+  typedef ::std::vector<ParameterizedTestCaseInfoBase*> TestCaseInfoContainer;
+
+  TestCaseInfoContainer test_case_infos_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry);
+};
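+
+// To sketch how this machinery is driven from user code (FooTest and the
+// parameter values below are illustrative only):
+//
+//   class FooTest : public ::testing::TestWithParam<int> {};
+//
+//   TEST_P(FooTest, DoesBar) {
+//     EXPECT_LE(0, GetParam());
+//   }
+//
+//   INSTANTIATE_TEST_CASE_P(MyInstance, FooTest, ::testing::Values(1, 2, 3));
+//
+// TEST_P records the test via AddTestPattern(), INSTANTIATE_TEST_CASE_P adds
+// a generator via AddTestCaseInstantiation(), and RegisterTests() later
+// registers MyInstance/FooTest.DoesBar/0 through /2 with the framework.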
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  //  GTEST_HAS_PARAM_TEST
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+// This file was GENERATED by command:
+//     pump.py gtest-param-util-generated.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vladl@google.com (Vlad Losev)
+
+// Type and function utilities for implementing parameterized tests.
+// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
+//
+// Currently Google Test supports at most 50 arguments in Values,
+// and at most 10 arguments in Combine. Please contact
+// googletestframework@googlegroups.com if you need more.
+// Please note that the number of arguments to Combine is limited
+// by the maximum arity of the implementation of tr1::tuple which is
+// currently set at 10.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*.  Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+
+// Forward declarations of ValuesIn(), which is implemented in
+// include/gtest/gtest-param-test.h.
+template <typename ForwardIterator>
+internal::ParamGenerator<
+  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end);
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]);
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+    const Container& container);
+
+namespace internal {
+
+// Used in the Values() function to provide polymorphic capabilities.
+template <typename T1>
+class ValueArray1 {
+ public:
+  explicit ValueArray1(T1 v1) : v1_(v1) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const { return ValuesIn(&v1_, &v1_ + 1); }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray1& other);
+
+  const T1 v1_;
+};
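+
+// For example, ::testing::Values(7), defined later in this header, returns a
+// ValueArray1<int>, which converts on demand to ParamGenerator<T> for
+// whatever parameter type T the test fixture uses.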
+
+template <typename T1, typename T2>
+class ValueArray2 {
+ public:
+  ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray2& other);
+
+  const T1 v1_;
+  const T2 v2_;
+};
+
+template <typename T1, typename T2, typename T3>
+class ValueArray3 {
+ public:
+  ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray3& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4>
+class ValueArray4 {
+ public:
+  ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray4& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class ValueArray5 {
+ public:
+  ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray5& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+class ValueArray6 {
+ public:
+  ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray6& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+class ValueArray7 {
+ public:
+  ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray7& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+class ValueArray8 {
+ public:
+  ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+      T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray8& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+class ValueArray9 {
+ public:
+  ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray9& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+class ValueArray10 {
+ public:
+  ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray10& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+class ValueArray11 {
+ public:
+  ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray11& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+class ValueArray12 {
+ public:
+  ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray12& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+class ValueArray13 {
+ public:
+  ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray13& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+class ValueArray14 {
+ public:
+  ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray14& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+class ValueArray15 {
+ public:
+  ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray15& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+class ValueArray16 {
+ public:
+  ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray16& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+class ValueArray17 {
+ public:
+  ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray17& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+class ValueArray18 {
+ public:
+  ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray18& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+class ValueArray19 {
+ public:
+  ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray19& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+class ValueArray20 {
+ public:
+  ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray20& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+class ValueArray21 {
+ public:
+  ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray21& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+class ValueArray22 {
+ public:
+  ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray22& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+class ValueArray23 {
+ public:
+  ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray23& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+class ValueArray24 {
+ public:
+  ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray24& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+class ValueArray25 {
+ public:
+  ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray25& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+class ValueArray26 {
+ public:
+  ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray26& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+class ValueArray27 {
+ public:
+  ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+      v26_(v26), v27_(v27) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray27& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+class ValueArray28 {
+ public:
+  ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+      v25_(v25), v26_(v26), v27_(v27), v28_(v28) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray28& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+class ValueArray29 {
+ public:
+  ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray29& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+class ValueArray30 {
+ public:
+  ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray30& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+class ValueArray31 {
+ public:
+  ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray31& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+class ValueArray32 {
+ public:
+  ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray32& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+class ValueArray33 {
+ public:
+  ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray33& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+class ValueArray34 {
+ public:
+  ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray34& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+class ValueArray35 {
+ public:
+  ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+      v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
+      v32_(v32), v33_(v33), v34_(v34), v35_(v35) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray35& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+class ValueArray36 {
+ public:
+  ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+      v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+      v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray36& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+class ValueArray37 {
+ public:
+  ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
+      v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
+      v36_(v36), v37_(v37) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray37& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+class ValueArray38 {
+ public:
+  ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray38& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+class ValueArray39 {
+ public:
+  ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray39& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+class ValueArray40 {
+ public:
+  ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
+      v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
+      v40_(v40) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray40& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+class ValueArray41 {
+ public:
+  ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+      T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray41& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+class ValueArray42 {
+ public:
+  ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41), v42_(v42) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray42& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+class ValueArray43 {
+ public:
+  ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+      v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
+      v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37),
+      v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray43& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+class ValueArray44 {
+ public:
+  ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+      v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+      v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+      v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
+      v43_(v43), v44_(v44) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray44& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+class ValueArray45 {
+ public:
+  ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
+      v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
+      v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41),
+      v42_(v42), v43_(v43), v44_(v44), v45_(v45) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray45& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+class ValueArray46 {
+ public:
+  ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
+      v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray46& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+class ValueArray47 {
+ public:
+  ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
+      v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46),
+      v47_(v47) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray47& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+class ValueArray48 {
+ public:
+  ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
+      v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
+      v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45),
+      v46_(v46), v47_(v47), v48_(v48) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray48& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+class ValueArray49 {
+ public:
+  ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48,
+      T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
+      v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_), static_cast<T>(v49_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray49& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+  const T49 v49_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+class ValueArray50 {
+ public:
+  ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49,
+      T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
+      v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_), static_cast<T>(v49_), static_cast<T>(v50_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ValueArray50& other);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+  const T49 v49_;
+  const T50 v50_;
+};
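+// For illustration only, a minimal sketch (assuming the public Values()
+// wrappers that googletest defines elsewhere in terms of these helpers):
+// each ValueArrayN above stores its arguments and, through the templated
+// conversion operator, turns them into a ParamGenerator of whatever
+// parameter type a test suite asks for, e.g.
+//
+//   ::testing::internal::ParamGenerator<int> gen = ::testing::Values(1, 2, 3);
+//
+// behaves like ValuesIn() over the array {1, 2, 3}, with each element
+// static_cast to the requested parameter type.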
+
+# if GTEST_HAS_COMBINE
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Generates values from the Cartesian product of values produced
+// by the argument generators.
+//
+template <typename T1, typename T2>
+class CartesianProductGenerator2
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2> ParamType;
+
+  CartesianProductGenerator2(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2)
+      : g1_(g1), g2_(g2) {}
+  virtual ~CartesianProductGenerator2() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2) {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance() should not be called on an iterator that is already past the
+    // end of the range, so none of the component iterators may be past the
+    // end of its range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current2_;
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report the iterators as equal if they both point beyond
+      // their respective ranges. That can happen in a variety of ways,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+          (current1_ == typed_other->current1_ &&
+           current2_ == typed_other->current2_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range as soon as
+      // either of the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator2::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator2& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+};  // class CartesianProductGenerator2
+
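+// For illustration only, a minimal usage sketch (assuming the public
+// Combine() wrapper that googletest defines elsewhere on top of these
+// generators): the class above backs an expression such as
+//
+//   ::testing::Combine(::testing::Values(1, 2), ::testing::Values(3, 4))
+//
+// which yields the tuples (1, 3), (1, 4), (2, 3), (2, 4) in that order.
+// Advance() works like an odometer: the right-most component iterator is
+// incremented first and, when it wraps back to begin, the carry moves to
+// the component on its left.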
+
+template <typename T1, typename T2, typename T3>
+class CartesianProductGenerator3
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3> ParamType;
+
+  CartesianProductGenerator3(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3)
+      : g1_(g1), g2_(g2), g3_(g3) {}
+  virtual ~CartesianProductGenerator3() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3) {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance() should not be called on an iterator that is already past the
+    // end of the range, so none of the component iterators may be past the
+    // end of its range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current3_;
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report the iterators as equal if they both point beyond
+      // their respective ranges. That can happen in a variety of ways,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+          (current1_ == typed_other->current1_ &&
+           current2_ == typed_other->current2_ &&
+           current3_ == typed_other->current3_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range as soon as
+      // any of the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator3::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator3& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+};  // class CartesianProductGenerator3
+
+
+template <typename T1, typename T2, typename T3, typename T4>
+class CartesianProductGenerator4
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4> ParamType;
+
+  CartesianProductGenerator4(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
+  virtual ~CartesianProductGenerator4() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4) {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance() should not be called on an iterator that is already past the
+    // end of the range, so none of the component iterators may be past the
+    // end of its range, either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current4_;
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any of
+      // the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator4::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator4& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+};  // class CartesianProductGenerator4
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class CartesianProductGenerator5
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5> ParamType;
+
+  CartesianProductGenerator5(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
+  virtual ~CartesianProductGenerator5() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5) {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance() should not be called on an iterator that is past the end of
+    // its range, so no component iterator can be past the end of its range
+    // either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current5_;
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any of
+      // the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator5::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator5& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+};  // class CartesianProductGenerator5
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+class CartesianProductGenerator6
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5,
+        T6> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6> ParamType;
+
+  CartesianProductGenerator6(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
+  virtual ~CartesianProductGenerator6() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6) {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance() should not be called on an iterator that is past the end of
+    // its range, so no component iterator can be past the end of its range
+    // either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current6_;
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any of
+      // the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator6::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator6& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+};  // class CartesianProductGenerator6
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+class CartesianProductGenerator7
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
+        T7> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7> ParamType;
+
+  CartesianProductGenerator7(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
+  virtual ~CartesianProductGenerator7() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7) {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance() should not be called on an iterator that is past the end of
+    // its range, so no component iterator can be past the end of its range
+    // either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current7_;
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any of
+      // the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator7::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator7& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+};  // class CartesianProductGenerator7
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+class CartesianProductGenerator8
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
+        T7, T8> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8> ParamType;
+
+  CartesianProductGenerator8(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+      const ParamGenerator<T8>& g8)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
+          g8_(g8) {}
+  virtual ~CartesianProductGenerator8() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin(), g8_, g8_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+        g8_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7,
+      const ParamGenerator<T8>& g8,
+      const typename ParamGenerator<T8>::iterator& current8)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+          begin8_(g8.begin()), end8_(g8.end()), current8_(current8) {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance() should not be called on an iterator that is past the end of
+    // its range, so no component iterator can be past the end of its range
+    // either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current8_;
+      if (current8_ == end8_) {
+        current8_ = begin8_;
+        ++current7_;
+      }
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_ &&
+          current8_ == typed_other->current8_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_),
+        begin8_(other.begin8_),
+        end8_(other.end8_),
+        current8_(other.current8_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_, *current8_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any of
+      // the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_ ||
+          current8_ == end8_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator8::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator8& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+};  // class CartesianProductGenerator8
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+class CartesianProductGenerator9
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
+        T7, T8, T9> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9> ParamType;
+
+  CartesianProductGenerator9(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+      const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9) {}
+  virtual ~CartesianProductGenerator9() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+        g8_.end(), g9_, g9_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7,
+      const ParamGenerator<T8>& g8,
+      const typename ParamGenerator<T8>::iterator& current8,
+      const ParamGenerator<T9>& g9,
+      const typename ParamGenerator<T9>::iterator& current9)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+          begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
+          begin9_(g9.begin()), end9_(g9.end()), current9_(current9) {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance() should not be called on an iterator that is past the end of
+    // its range, so no component iterator can be past the end of its range
+    // either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current9_;
+      if (current9_ == end9_) {
+        current9_ = begin9_;
+        ++current8_;
+      }
+      if (current8_ == end8_) {
+        current8_ = begin8_;
+        ++current7_;
+      }
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_ &&
+          current8_ == typed_other->current8_ &&
+          current9_ == typed_other->current9_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_),
+        begin8_(other.begin8_),
+        end8_(other.end8_),
+        current8_(other.current8_),
+        begin9_(other.begin9_),
+        end9_(other.end9_),
+        current9_(other.current9_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_, *current8_,
+            *current9_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any of
+      // the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_ ||
+          current8_ == end8_ ||
+          current9_ == end9_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    const typename ParamGenerator<T9>::iterator begin9_;
+    const typename ParamGenerator<T9>::iterator end9_;
+    typename ParamGenerator<T9>::iterator current9_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator9::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator9& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+  const ParamGenerator<T9> g9_;
+};  // class CartesianProductGenerator9
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+class CartesianProductGenerator10
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
+        T7, T8, T9, T10> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> ParamType;
+
+  CartesianProductGenerator10(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+      const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9,
+      const ParamGenerator<T10>& g10)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9), g10_(g10) {}
+  virtual ~CartesianProductGenerator10() {}
+
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+        g8_.end(), g9_, g9_.end(), g10_, g10_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7,
+      const ParamGenerator<T8>& g8,
+      const typename ParamGenerator<T8>::iterator& current8,
+      const ParamGenerator<T9>& g9,
+      const typename ParamGenerator<T9>::iterator& current9,
+      const ParamGenerator<T10>& g10,
+      const typename ParamGenerator<T10>::iterator& current10)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+          begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
+          begin9_(g9.begin()), end9_(g9.end()), current9_(current9),
+          begin10_(g10.begin()), end10_(g10.end()), current10_(current10) {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance() should not be called on an iterator that is past the end of
+    // its range, so no component iterator can be past the end of its range
+    // either.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current10_;
+      if (current10_ == end10_) {
+        current10_ = begin10_;
+        ++current9_;
+      }
+      if (current9_ == end9_) {
+        current9_ = begin9_;
+        ++current8_;
+      }
+      if (current8_ == end8_) {
+        current8_ = begin8_;
+        ++current7_;
+      }
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_ &&
+          current8_ == typed_other->current8_ &&
+          current9_ == typed_other->current9_ &&
+          current10_ == typed_other->current10_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_),
+        begin8_(other.begin8_),
+        end8_(other.end8_),
+        current8_(other.current8_),
+        begin9_(other.begin9_),
+        end9_(other.end9_),
+        current9_(other.current9_),
+        begin10_(other.begin10_),
+        end10_(other.end10_),
+        current10_(other.current10_) {
+      ComputeCurrentValue();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_, *current8_,
+            *current9_, *current10_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any of
+      // the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_ ||
+          current8_ == end8_ ||
+          current9_ == end9_ ||
+          current10_ == end10_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    const typename ParamGenerator<T9>::iterator begin9_;
+    const typename ParamGenerator<T9>::iterator end9_;
+    typename ParamGenerator<T9>::iterator current9_;
+    const typename ParamGenerator<T10>::iterator begin10_;
+    const typename ParamGenerator<T10>::iterator end10_;
+    typename ParamGenerator<T10>::iterator current10_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator10::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator10& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+  const ParamGenerator<T9> g9_;
+  const ParamGenerator<T10> g10_;
+};  // class CartesianProductGenerator10
+
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Helper classes providing Combine() with polymorphic features. They allow
+// casting CartesianProductGeneratorN<T> to ParamGenerator<U> if T is
+// convertible to U.
+//
+template <class Generator1, class Generator2>
+class CartesianProductHolder2 {
+ public:
+  CartesianProductHolder2(const Generator1& g1, const Generator2& g2)
+      : g1_(g1), g2_(g2) {}
+  template <typename T1, typename T2>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2> >(
+        new CartesianProductGenerator2<T1, T2>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder2& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+};  // class CartesianProductHolder2
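+
+// Illustration (a hypothetical sketch, not part of the generated code): in
+// the public API, Combine(g1, g2) returns a CartesianProductHolder2 rather
+// than a concrete generator.  The templated conversion operator above is what
+// later turns the holder into a ParamGenerator of tuples, once the element
+// types of the destination are known:
+//
+//   // CartesianProductHolder2<G1, G2> holder = Combine(Values(1, 2),
+//   //                                                  Values('a', 'b'));
+//   // ParamGenerator< ::std::tr1::tuple<int, char> > gen = holder;
+//
+// Deferring the conversion to the point of use lets each source generator be
+// converted element-wise (T convertible to U) without Combine() having to
+// know the destination tuple type up front.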
+
+template <class Generator1, class Generator2, class Generator3>
+class CartesianProductHolder3 {
+ public:
+  CartesianProductHolder3(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3)
+      : g1_(g1), g2_(g2), g3_(g3) {}
+  template <typename T1, typename T2, typename T3>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3> >(
+        new CartesianProductGenerator3<T1, T2, T3>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder3& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+};  // class CartesianProductHolder3
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4>
+class CartesianProductHolder4 {
+ public:
+  CartesianProductHolder4(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
+  template <typename T1, typename T2, typename T3, typename T4>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4> >(
+        new CartesianProductGenerator4<T1, T2, T3, T4>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder4& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+};  // class CartesianProductHolder4
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5>
+class CartesianProductHolder5 {
+ public:
+  CartesianProductHolder5(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5> >(
+        new CartesianProductGenerator5<T1, T2, T3, T4, T5>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder5& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+};  // class CartesianProductHolder5
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6>
+class CartesianProductHolder6 {
+ public:
+  CartesianProductHolder6(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5,
+    const Generator6& g6)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6> >(
+        new CartesianProductGenerator6<T1, T2, T3, T4, T5, T6>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_),
+        static_cast<ParamGenerator<T6> >(g6_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder6& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+};  // class CartesianProductHolder6
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7>
+class CartesianProductHolder7 {
+ public:
+  CartesianProductHolder7(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5,
+    const Generator6& g6, const Generator7& g7)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
+      T7> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7> >(
+        new CartesianProductGenerator7<T1, T2, T3, T4, T5, T6, T7>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_),
+        static_cast<ParamGenerator<T6> >(g6_),
+        static_cast<ParamGenerator<T7> >(g7_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder7& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+};  // class CartesianProductHolder7
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7,
+    class Generator8>
+class CartesianProductHolder8 {
+ public:
+  CartesianProductHolder8(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5,
+    const Generator6& g6, const Generator7& g7, const Generator8& g8)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
+          g8_(g8) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7, typename T8>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7,
+      T8> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8> >(
+        new CartesianProductGenerator8<T1, T2, T3, T4, T5, T6, T7, T8>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_),
+        static_cast<ParamGenerator<T6> >(g6_),
+        static_cast<ParamGenerator<T7> >(g7_),
+        static_cast<ParamGenerator<T8> >(g8_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder8& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+};  // class CartesianProductHolder8
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7,
+    class Generator8, class Generator9>
+class CartesianProductHolder9 {
+ public:
+  CartesianProductHolder9(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5,
+    const Generator6& g6, const Generator7& g7, const Generator8& g8,
+    const Generator9& g9)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7, typename T8, typename T9>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+      T9> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+        T9> >(
+        new CartesianProductGenerator9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_),
+        static_cast<ParamGenerator<T6> >(g6_),
+        static_cast<ParamGenerator<T7> >(g7_),
+        static_cast<ParamGenerator<T8> >(g8_),
+        static_cast<ParamGenerator<T9> >(g9_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder9& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+  const Generator9 g9_;
+};  // class CartesianProductHolder9
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7,
+    class Generator8, class Generator9, class Generator10>
+class CartesianProductHolder10 {
+ public:
+  CartesianProductHolder10(const Generator1& g1, const Generator2& g2,
+    const Generator3& g3, const Generator4& g4, const Generator5& g5,
+    const Generator6& g6, const Generator7& g7, const Generator8& g8,
+    const Generator9& g9, const Generator10& g10)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9), g10_(g10) {}
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7, typename T8, typename T9, typename T10>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+      T9, T10> >() const {
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+        T9, T10> >(
+        new CartesianProductGenerator10<T1, T2, T3, T4, T5, T6, T7, T8, T9,
+            T10>(
+        static_cast<ParamGenerator<T1> >(g1_),
+        static_cast<ParamGenerator<T2> >(g2_),
+        static_cast<ParamGenerator<T3> >(g3_),
+        static_cast<ParamGenerator<T4> >(g4_),
+        static_cast<ParamGenerator<T5> >(g5_),
+        static_cast<ParamGenerator<T6> >(g6_),
+        static_cast<ParamGenerator<T7> >(g7_),
+        static_cast<ParamGenerator<T8> >(g8_),
+        static_cast<ParamGenerator<T9> >(g9_),
+        static_cast<ParamGenerator<T10> >(g10_)));
+  }
+
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductHolder10& other);
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+  const Generator9 g9_;
+  const Generator10 g10_;
+};  // class CartesianProductHolder10
+
+# endif  // GTEST_HAS_COMBINE
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  //  GTEST_HAS_PARAM_TEST
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+
+// Functions producing parameter generators.
+//
+// Google Test uses these generators to produce parameters for value-
+// parameterized tests. When a parameterized test case is instantiated
+// with a particular generator, Google Test creates and runs tests
+// for each element in the sequence produced by the generator.
+//
+// In the following sample, each test from test case FooTest is instantiated
+// three times, with parameter values 3, 5, and 8:
+//
+// class FooTest : public TestWithParam<int> { ... };
+//
+// TEST_P(FooTest, TestThis) {
+// }
+// TEST_P(FooTest, TestThat) {
+// }
+// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8));
+//
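+// Inside a TEST_P body the current parameter is available through GetParam().
+// A minimal sketch (the test name and the expectation are only illustrative):
+//
+// TEST_P(FooTest, HandlesParam) {
+//   const int n = GetParam();
+//   EXPECT_GT(n, 0);  // Holds for the values 3, 5, and 8 above.
+// }
+//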
+
+// Range() returns generators providing sequences of values in a range.
+//
+// Synopsis:
+// Range(start, end)
+//   - returns a generator producing a sequence of values {start, start+1,
+//     start+2, ...}.
+// Range(start, end, step)
+//   - returns a generator producing a sequence of values {start, start+step,
+//     start+step+step, ...}.
+// Notes:
+//   * The generated sequences never include end. For example, Range(1, 5)
+//     returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2)
+//     returns a generator producing {1, 3, 5, 7}.
+//   * start and end must have the same type. That type may be any integral or
+//     floating-point type, or a user-defined type satisfying these conditions:
+//     * It must be assignable (have operator=() defined).
+//     * It must have operator+() defined (operator+(int-compatible type) for
+//       the two-operand version).
+//     * It must have operator<() defined.
+//     Elements in the resulting sequences will also have that type.
+//   * The condition start < end must be satisfied for the resulting sequences
+//     to contain any elements.
+//
+template <typename T, typename IncrementT>
+internal::ParamGenerator<T> Range(T start, T end, IncrementT step) {
+  return internal::ParamGenerator<T>(
+      new internal::RangeGenerator<T, IncrementT>(start, end, step));
+}
+
+template <typename T>
+internal::ParamGenerator<T> Range(T start, T end) {
+  return Range(start, end, 1);
+}
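+
+// A minimal usage sketch for Range() (FooTest as in the sample above; the
+// instantiation name is arbitrary):
+//
+// // Instantiates FooTest with parameters 0, 2, 4, 6, 8; the end value 10 is
+// // never included.
+// INSTANTIATE_TEST_CASE_P(EvenNumbers, FooTest, Range(0, 10, 2));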
+
+// The ValuesIn() function allows generating tests with parameters coming from
+// a container.
+//
+// Synopsis:
+// ValuesIn(const T (&array)[N])
+//   - returns a generator producing sequences with elements from
+//     a C-style array.
+// ValuesIn(const Container& container)
+//   - returns a generator producing sequences with elements from
+//     an STL-style container.
+// ValuesIn(Iterator begin, Iterator end)
+//   - returns a generator producing sequences with elements from
+//     a range [begin, end) defined by a pair of STL-style iterators. These
+//     iterators can also be plain C pointers.
+//
+// Please note that ValuesIn copies the values from the containers
+// passed in and keeps them to generate tests in RUN_ALL_TESTS().
+//
+// Examples:
+//
+// This instantiates tests from test case StringTest
+// each with C-string values of "foo", "bar", and "baz":
+//
+// const char* strings[] = {"foo", "bar", "baz"};
+// INSTANTIATE_TEST_CASE_P(StringSequence, StringTest, ValuesIn(strings));
+//
+// This instantiates tests from test case StlStringTest
+// each with STL string values "a" and "b":
+//
+// ::std::vector< ::std::string> GetParameterStrings() {
+//   ::std::vector< ::std::string> v;
+//   v.push_back("a");
+//   v.push_back("b");
+//   return v;
+// }
+//
+// INSTANTIATE_TEST_CASE_P(CharSequence,
+//                         StlStringTest,
+//                         ValuesIn(GetParameterStrings()));
+//
+//
+// This will also instantiate tests from CharTest
+// each with parameter values 'a' and 'b':
+//
+// ::std::list<char> GetParameterChars() {
+//   ::std::list<char> list;
+//   list.push_back('a');
+//   list.push_back('b');
+//   return list;
+// }
+// ::std::list<char> l = GetParameterChars();
+// INSTANTIATE_TEST_CASE_P(CharSequence2,
+//                         CharTest,
+//                         ValuesIn(l.begin(), l.end()));
+//
+template <typename ForwardIterator>
+internal::ParamGenerator<
+  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end) {
+  typedef typename ::testing::internal::IteratorTraits<ForwardIterator>
+      ::value_type ParamType;
+  return internal::ParamGenerator<ParamType>(
+      new internal::ValuesInIteratorRangeGenerator<ParamType>(begin, end));
+}
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]) {
+  return ValuesIn(array, array + N);
+}
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+    const Container& container) {
+  return ValuesIn(container.begin(), container.end());
+}
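+
+// A minimal usage sketch of the iterator-range overload with plain pointers
+// (IntTest is a hypothetical fixture taking an int parameter):
+//
+// static const int kInts[] = {1, 2, 3};
+// INSTANTIATE_TEST_CASE_P(SomeInts, IntTest, ValuesIn(kInts, kInts + 3));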
+
+// Values() allows generating tests from an explicitly specified list of
+// parameters.
+//
+// Synopsis:
+// Values(T v1, T v2, ..., T vN)
+//   - returns a generator producing sequences with elements v1, v2, ..., vN.
+//
+// For example, this instantiates tests from test case BarTest each
+// with values "one", "two", and "three":
+//
+// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three"));
+//
+// This instantiates tests from test case BazTest each with values 1, 2, 3.5.
+// The exact type of values will depend on the type of parameter in BazTest.
+//
+// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5));
+//
+// Currently, Values() supports from 1 to 50 parameters.
+//
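+// As a hedged sketch of how the mixed-type example above plays out (BazTest is
+// assumed to take a double parameter), every argument is converted to the
+// fixture's parameter type when the tests are instantiated:
+//
+// class BazTest : public TestWithParam<double> { ... };
+//
+// TEST_P(BazTest, AcceptsConvertedValue) {
+//   const double d = GetParam();  // Receives 1.0, 2.0, or 3.5.
+//   EXPECT_GE(d, 1.0);
+// }
+//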
+template <typename T1>
+internal::ValueArray1<T1> Values(T1 v1) {
+  return internal::ValueArray1<T1>(v1);
+}
+
+template <typename T1, typename T2>
+internal::ValueArray2<T1, T2> Values(T1 v1, T2 v2) {
+  return internal::ValueArray2<T1, T2>(v1, v2);
+}
+
+template <typename T1, typename T2, typename T3>
+internal::ValueArray3<T1, T2, T3> Values(T1 v1, T2 v2, T3 v3) {
+  return internal::ValueArray3<T1, T2, T3>(v1, v2, v3);
+}
+
+template <typename T1, typename T2, typename T3, typename T4>
+internal::ValueArray4<T1, T2, T3, T4> Values(T1 v1, T2 v2, T3 v3, T4 v4) {
+  return internal::ValueArray4<T1, T2, T3, T4>(v1, v2, v3, v4);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+internal::ValueArray5<T1, T2, T3, T4, T5> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5) {
+  return internal::ValueArray5<T1, T2, T3, T4, T5>(v1, v2, v3, v4, v5);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+internal::ValueArray6<T1, T2, T3, T4, T5, T6> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6) {
+  return internal::ValueArray6<T1, T2, T3, T4, T5, T6>(v1, v2, v3, v4, v5, v6);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6, T7 v7) {
+  return internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7>(v1, v2, v3, v4, v5,
+      v6, v7);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) {
+  return internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8>(v1, v2, v3, v4,
+      v5, v6, v7, v8);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) {
+  return internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(v1, v2, v3,
+      v4, v5, v6, v7, v8, v9);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> Values(T1 v1,
+    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) {
+  return internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>(v1,
+      v2, v3, v4, v5, v6, v7, v8, v9, v10);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11) {
+  return internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+    T12> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12) {
+  return internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+    T13> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13) {
+  return internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) {
+  return internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+      v14);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) {
+  return internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+      v13, v14, v15);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16) {
+  return internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+      v12, v13, v14, v15, v16);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17) {
+  return internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18) {
+  return internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+      v10, v11, v12, v13, v14, v15, v16, v17, v18);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) {
+  return internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19>(v1, v2, v3, v4, v5, v6, v7, v8,
+      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) {
+  return internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20>(v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) {
+  return internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21>(v1, v2, v3, v4, v5, v6,
+      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22) {
+  return internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22>(v1, v2, v3, v4,
+      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23) {
+  return internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23>(v1, v2, v3,
+      v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22, v23);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24) {
+  return internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24>(v1, v2,
+      v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
+      v19, v20, v21, v22, v23, v24);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Values(T1 v1,
+    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11,
+    T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19,
+    T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) {
+  return internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25>(v1,
+      v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
+      v18, v19, v20, v21, v22, v23, v24, v25);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+    T26> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26) {
+  return internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+      v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+    T27> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27) {
+  return internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14,
+      v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+    T28> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28) {
+  return internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+      v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27,
+      v28);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29) {
+  return internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+      v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26,
+      v27, v28, v29);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+    T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+    T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) {
+  return internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+      v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25,
+      v26, v27, v28, v29, v30);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) {
+  return internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24,
+      v25, v26, v27, v28, v29, v30, v31);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32) {
+  return internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+      v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33) {
+  return internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33>(v1, v2, v3, v4, v5, v6, v7, v8,
+      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+internal::ValueArray34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22,
+    T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34) {
+  return internal::ValueArray34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34>(v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22,
+      v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
+    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
+    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) {
+  return internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35>(v1, v2, v3, v4, v5, v6,
+      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
+      v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
+    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
+    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) {
+  return internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36>(v1, v2, v3, v4,
+      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
+      v34, v35, v36);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+internal::ValueArray37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
+    T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
+    T37 v37) {
+  return internal::ValueArray37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37>(v1, v2, v3,
+      v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
+      v34, v35, v36, v37);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+internal::ValueArray38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
+    T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
+    T37 v37, T38 v38) {
+  return internal::ValueArray38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38>(v1, v2,
+      v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
+      v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32,
+      v33, v34, v35, v36, v37, v38);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+internal::ValueArray39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
+    T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
+    T37 v37, T38 v38, T39 v39) {
+  return internal::ValueArray39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39>(v1,
+      v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
+      v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
+      v32, v33, v34, v35, v36, v37, v38, v39);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+internal::ValueArray40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Values(T1 v1,
+    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11,
+    T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19,
+    T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27,
+    T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) {
+  return internal::ValueArray40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+      v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29,
+      v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+internal::ValueArray41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+    T41> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) {
+  return internal::ValueArray41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14,
+      v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28,
+      v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+internal::ValueArray42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+    T42> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+    T42 v42) {
+  return internal::ValueArray42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+      v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27,
+      v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41,
+      v42);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+internal::ValueArray43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+    T43> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+    T42 v42, T43 v43) {
+  return internal::ValueArray43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+      v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26,
+      v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
+      v41, v42, v43);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+internal::ValueArray44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+    T42 v42, T43 v43, T44 v44) {
+  return internal::ValueArray44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+      v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25,
+      v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39,
+      v40, v41, v42, v43, v44);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+    T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+    T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+    T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+    T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) {
+  return internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24,
+      v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38,
+      v39, v40, v41, v42, v43, v44, v45);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
+    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) {
+  return internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+      v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37,
+      v38, v39, v40, v41, v42, v43, v44, v45, v46);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
+    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) {
+  return internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47>(v1, v2, v3, v4, v5, v6, v7, v8,
+      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37,
+      v38, v39, v40, v41, v42, v43, v44, v45, v46, v47);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
+    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47,
+    T48 v48) {
+  return internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47, T48>(v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22,
+      v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36,
+      v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48, T49> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22,
+    T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38,
+    T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46,
+    T47 v47, T48 v48, T49 v49) {
+  return internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47, T48, T49>(v1, v2, v3, v4, v5, v6,
+      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
+      v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35,
+      v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48, T49, T50> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
+    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
+    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37,
+    T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45,
+    T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) {
+  return internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40, T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>(v1, v2, v3, v4,
+      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
+      v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
+      v48, v49, v50);
+}
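+// A brief usage sketch for the Values() overloads above (hypothetical names:
+// PrimeTest and IsPrime() are not defined in this header):
+//
+//   class PrimeTest : public testing::TestWithParam<int> {};
+//   TEST_P(PrimeTest, DetectsPrimes) { EXPECT_TRUE(IsPrime(GetParam())); }
+//   INSTANTIATE_TEST_CASE_P(SmallPrimes, PrimeTest,
+//                           testing::Values(2, 3, 5, 7, 11));
+//
+// Each overload simply packs its arguments into an internal::ValueArrayN
+// holder, which is converted to a ParamGenerator when the instantiation
+// macro evaluates its generator argument.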
+
+// Bool() allows generating tests with parameters in a set of (false, true).
+//
+// Synopsis:
+// Bool()
+//   - returns a generator producing sequences with elements {false, true}.
+//
+// It is useful when testing code that depends on Boolean flags. Combinations
+// of multiple flags can be tested when several Bool()'s are combined using
+// the Combine() function.
+//
+// In the following example all tests in the test case FlagDependentTest
+// will be instantiated twice with parameters false and true.
+//
+// class FlagDependentTest : public testing::TestWithParam<bool> {
+//   virtual void SetUp() {
+//     external_flag = GetParam();
+//   }
+// };
+// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool());
+//
+inline internal::ParamGenerator<bool> Bool() {
+  return Values(false, true);
+}
+
+# if GTEST_HAS_COMBINE
+// Combine() allows the user to combine two or more sequences to produce
+// values of a Cartesian product of those sequences' elements.
+//
+// Synopsis:
+// Combine(gen1, gen2, ..., genN)
+//   - returns a generator producing sequences with elements coming from
+//     the Cartesian product of elements from the sequences generated by
+//     gen1, gen2, ..., genN. The sequence elements will have a type of
+//     tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
+//     of elements from sequences produced by gen1, gen2, ..., genN.
+//
+// Combine can have up to 10 arguments. This number is currently limited
+// by the maximum number of elements in the tuple implementation used by Google
+// Test.
+//
+// Example:
+//
+// This will instantiate tests in test case AnimalTest each one with
+// the parameter values tuple("cat", BLACK), tuple("cat", WHITE),
+// tuple("dog", BLACK), and tuple("dog", WHITE):
+//
+// enum Color { BLACK, GRAY, WHITE };
+// class AnimalTest
+//     : public testing::TestWithParam<tuple<const char*, Color> > {...};
+//
+// TEST_P(AnimalTest, AnimalLooksNice) {...}
+//
+// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest,
+//                         Combine(Values("cat", "dog"),
+//                                 Values(BLACK, WHITE)));
+//
+// This will instantiate tests in FlagDependentTest with all variations of two
+// Boolean flags:
+//
+// class FlagDependentTest
+//     : public testing::TestWithParam<tuple<bool, bool> > {
+//   virtual void SetUp() {
+//     // Assigns external_flag_1 and external_flag_2 values from the tuple.
+//     tie(external_flag_1, external_flag_2) = GetParam();
+//   }
+// };
+//
+// TEST_P(FlagDependentTest, TestFeature1) {
+//   // Test your code using external_flag_1 and external_flag_2 here.
+// }
+// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest,
+//                         Combine(Bool(), Bool()));
+//
+template <typename Generator1, typename Generator2>
+internal::CartesianProductHolder2<Generator1, Generator2> Combine(
+    const Generator1& g1, const Generator2& g2) {
+  return internal::CartesianProductHolder2<Generator1, Generator2>(
+      g1, g2);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3>
+internal::CartesianProductHolder3<Generator1, Generator2, Generator3> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3) {
+  return internal::CartesianProductHolder3<Generator1, Generator2, Generator3>(
+      g1, g2, g3);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4>
+internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
+    Generator4> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4) {
+  return internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
+      Generator4>(
+      g1, g2, g3, g4);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5>
+internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
+    Generator4, Generator5> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5) {
+  return internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
+      Generator4, Generator5>(
+      g1, g2, g3, g4, g5);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6>
+internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6) {
+  return internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6>(
+      g1, g2, g3, g4, g5, g6);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7>
+internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7) {
+  return internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7>(
+      g1, g2, g3, g4, g5, g6, g7);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8>
+internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7, const Generator8& g8) {
+  return internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8>(
+      g1, g2, g3, g4, g5, g6, g7, g8);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8, typename Generator9>
+internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8,
+    Generator9> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7, const Generator8& g8, const Generator9& g9) {
+  return internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8, Generator9>(
+      g1, g2, g3, g4, g5, g6, g7, g8, g9);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8, typename Generator9,
+    typename Generator10>
+internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
+    Generator10> Combine(
+    const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7, const Generator8& g8, const Generator9& g9,
+        const Generator10& g10) {
+  return internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
+      Generator10>(
+      g1, g2, g3, g4, g5, g6, g7, g8, g9, g10);
+}
+# endif  // GTEST_HAS_COMBINE
+
+
+
+# define TEST_P(test_case_name, test_name) \
+  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
+      : public test_case_name { \
+   public: \
+    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \
+    virtual void TestBody(); \
+   private: \
+    static int AddToRegistry() { \
+      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+          GetTestCasePatternHolder<test_case_name>(\
+              #test_case_name, __FILE__, __LINE__)->AddTestPattern(\
+                  #test_case_name, \
+                  #test_name, \
+                  new ::testing::internal::TestMetaFactory< \
+                      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \
+      return 0; \
+    } \
+    static int gtest_registering_dummy_; \
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(\
+        GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
+  }; \
+  int GTEST_TEST_CLASS_NAME_(test_case_name, \
+                             test_name)::gtest_registering_dummy_ = \
+      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
+  void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
+
+# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator) \
+  ::testing::internal::ParamGenerator<test_case_name::ParamType> \
+      gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \
+  int gtest_##prefix##test_case_name##_dummy_ = \
+      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+          GetTestCasePatternHolder<test_case_name>(\
+              #test_case_name, __FILE__, __LINE__)->AddTestCaseInstantiation(\
+                  #prefix, \
+                  &gtest_##prefix##test_case_name##_EvalGenerator_, \
+                  __FILE__, __LINE__)
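+
+// A minimal usage sketch for the two macros above (ParityTest is a
+// hypothetical fixture; see the value-parameterized test overview earlier in
+// this header for the full documentation):
+//
+//   class ParityTest : public testing::TestWithParam<int> {};
+//
+//   TEST_P(ParityTest, LowBitMatchesModulo) {
+//     int n = GetParam();
+//     EXPECT_EQ(n % 2, n & 1);
+//   }
+//
+//   INSTANTIATE_TEST_CASE_P(SmallInts, ParityTest,
+//                           testing::Values(1, 2, 3));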
+
+}  // namespace testing
+
+#endif  // GTEST_HAS_PARAM_TEST
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// Google C++ Testing Framework definitions useful in production code.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+
+// When you need to test the private or protected members of a class,
+// use the FRIEND_TEST macro to declare your tests as friends of the
+// class.  For example:
+//
+// class MyClass {
+//  private:
+//   void MyMethod();
+//   FRIEND_TEST(MyClassTest, MyMethod);
+// };
+//
+// class MyClassTest : public testing::Test {
+//   // ...
+// };
+//
+// TEST_F(MyClassTest, MyMethod) {
+//   // Can call MyClass::MyMethod() here.
+// }
+
+#define FRIEND_TEST(test_case_name, test_name)\
+friend class test_case_name##_##test_name##_Test
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule@google.com (Markus Heule)
+//
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+
+#include <iosfwd>
+#include <vector>
+
+namespace testing {
+
+// A copyable object representing the result of a test part (i.e. an
+// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCEED()).
+//
+// Don't inherit from TestPartResult as its destructor is not virtual.
+class GTEST_API_ TestPartResult {
+ public:
+  // The possible outcomes of a test part (i.e. an assertion or an
+  // explicit SUCCEED(), FAIL(), or ADD_FAILURE()).
+  enum Type {
+    kSuccess,          // Succeeded.
+    kNonFatalFailure,  // Failed but the test can continue.
+    kFatalFailure      // Failed and the test should be terminated.
+  };
+
+  // C'tor.  TestPartResult does NOT have a default constructor.
+  // Always use this constructor (with parameters) to create a
+  // TestPartResult object.
+  TestPartResult(Type a_type,
+                 const char* a_file_name,
+                 int a_line_number,
+                 const char* a_message)
+      : type_(a_type),
+        file_name_(a_file_name == NULL ? "" : a_file_name),
+        line_number_(a_line_number),
+        summary_(ExtractSummary(a_message)),
+        message_(a_message) {
+  }
+
+  // Gets the outcome of the test part.
+  Type type() const { return type_; }
+
+  // Gets the name of the source file where the test part took place, or
+  // NULL if it's unknown.
+  const char* file_name() const {
+    return file_name_.empty() ? NULL : file_name_.c_str();
+  }
+
+  // Gets the line in the source file where the test part took place,
+  // or -1 if it's unknown.
+  int line_number() const { return line_number_; }
+
+  // Gets the summary of the failure message.
+  const char* summary() const { return summary_.c_str(); }
+
+  // Gets the message associated with the test part.
+  const char* message() const { return message_.c_str(); }
+
+  // Returns true iff the test part passed.
+  bool passed() const { return type_ == kSuccess; }
+
+  // Returns true iff the test part failed.
+  bool failed() const { return type_ != kSuccess; }
+
+  // Returns true iff the test part non-fatally failed.
+  bool nonfatally_failed() const { return type_ == kNonFatalFailure; }
+
+  // Returns true iff the test part fatally failed.
+  bool fatally_failed() const { return type_ == kFatalFailure; }
+
+ private:
+  Type type_;
+
+  // Gets the summary of the failure message by omitting the stack
+  // trace in it.
+  static std::string ExtractSummary(const char* message);
+
+  // The name of the source file where the test part took place, or
+  // "" if the source file is unknown.
+  std::string file_name_;
+  // The line in the source file where the test part took place, or -1
+  // if the line number is unknown.
+  int line_number_;
+  std::string summary_;  // The test failure summary.
+  std::string message_;  // The test failure message.
+};
+
+// Prints a TestPartResult object.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result);
+
+// An array of TestPartResult objects.
+//
+// Don't inherit from TestPartResultArray as its destructor is not
+// virtual.
+class GTEST_API_ TestPartResultArray {
+ public:
+  TestPartResultArray() {}
+
+  // Appends the given TestPartResult to the array.
+  void Append(const TestPartResult& result);
+
+  // Returns the TestPartResult at the given index (0-based).
+  const TestPartResult& GetTestPartResult(int index) const;
+
+  // Returns the number of TestPartResult objects in the array.
+  int size() const;
+
+ private:
+  std::vector<TestPartResult> array_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray);
+};
+
+// This interface knows how to report a test part result.
+class TestPartResultReporterInterface {
+ public:
+  virtual ~TestPartResultReporterInterface() {}
+
+  virtual void ReportTestPartResult(const TestPartResult& result) = 0;
+};
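+
+// A minimal sketch of a reporter implementing the interface above (shown
+// only to illustrate its shape; how such a reporter gets installed is not
+// covered here and goes through Google Test's reporter hooks):
+//
+//   class CountingReporter : public testing::TestPartResultReporterInterface {
+//    public:
+//     CountingReporter() : failure_count_(0) {}
+//     virtual void ReportTestPartResult(const testing::TestPartResult& result) {
+//       if (result.failed()) ++failure_count_;
+//     }
+//     int failure_count() const { return failure_count_; }
+//    private:
+//     int failure_count_;
+//   };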
+
+namespace internal {
+
+// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a
+// statement generates new fatal failures. To do so it registers itself as the
+// current test part result reporter. Besides checking if fatal failures were
+// reported, it only delegates the reporting to the former result reporter.
+// The original result reporter is restored in the destructor.
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+class GTEST_API_ HasNewFatalFailureHelper
+    : public TestPartResultReporterInterface {
+ public:
+  HasNewFatalFailureHelper();
+  virtual ~HasNewFatalFailureHelper();
+  virtual void ReportTestPartResult(const TestPartResult& result);
+  bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
+ private:
+  bool has_new_fatal_failure_;
+  TestPartResultReporterInterface* original_reporter_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper);
+};
+
+}  // namespace internal
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+
+// This header implements typed tests and type-parameterized tests.
+
+// Typed (aka type-driven) tests repeat the same test for types in a
+// list.  You must know which types you want to test with when writing
+// typed tests. Here's how you do it:
+
+#if 0
+
+// First, define a fixture class template.  It should be parameterized
+// by a type.  Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+ public:
+  ...
+  typedef std::list<T> List;
+  static T shared_;
+  T value_;
+};
+
+// Next, associate a list of types with the test case, which will be
+// repeated for each type in the list.  The typedef is necessary for
+// the macro to parse correctly.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+TYPED_TEST_CASE(FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+//   TYPED_TEST_CASE(FooTest, int);
+
+// Then, use TYPED_TEST() instead of TEST_F() to define as many typed
+// tests for this test case as you want.
+TYPED_TEST(FooTest, DoesBlah) {
+  // Inside a test, refer to TypeParam to get the type parameter.
+  // Since we are inside a derived class template, C++ requires us to
+  // visit the members of FooTest via 'this'.
+  TypeParam n = this->value_;
+
+  // To visit static members of the fixture, add the TestFixture::
+  // prefix.
+  n += TestFixture::shared_;
+
+  // To refer to typedefs in the fixture, add the "typename
+  // TestFixture::" prefix.
+  typename TestFixture::List values;
+  values.push_back(n);
+  ...
+}
+
+TYPED_TEST(FooTest, HasPropertyA) { ... }
+
+#endif  // 0
+
+// Type-parameterized tests are abstract test patterns parameterized
+// by a type.  Compared with typed tests, type-parameterized tests
+// allow you to define the test pattern without knowing what the type
+// parameters are.  The defined pattern can be instantiated with
+// different types any number of times, in any number of translation
+// units.
+//
+// If you are designing an interface or concept, you can define a
+// suite of type-parameterized tests to verify properties that any
+// valid implementation of the interface/concept should have.  Then,
+// each implementation can easily instantiate the test suite to verify
+// that it conforms to the requirements, without having to write
+// similar tests repeatedly.  Here's an example:
+
+#if 0
+
+// First, define a fixture class template.  It should be parameterized
+// by a type.  Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+  ...
+};
+
+// Next, declare that you will define a type-parameterized test case
+// (the _P suffix is for "parameterized" or "pattern", whichever you
+// prefer):
+TYPED_TEST_CASE_P(FooTest);
+
+// Then, use TYPED_TEST_P() to define as many type-parameterized tests
+// for this type-parameterized test case as you want.
+TYPED_TEST_P(FooTest, DoesBlah) {
+  // Inside a test, refer to TypeParam to get the type parameter.
+  TypeParam n = 0;
+  ...
+}
+
+TYPED_TEST_P(FooTest, HasPropertyA) { ... }
+
+// Now the tricky part: you need to register all test patterns before
+// you can instantiate them.  The first argument of the macro is the
+// test case name; the rest are the names of the tests in this test
+// case.
+REGISTER_TYPED_TEST_CASE_P(FooTest,
+                           DoesBlah, HasPropertyA);
+
+// Finally, you are free to instantiate the pattern with the types you
+// want.  If you put the above code in a header file, you can #include
+// it in multiple C++ source files and instantiate it multiple times.
+//
+// To distinguish different instances of the pattern, the first
+// argument to the INSTANTIATE_* macro is a prefix that will be added
+// to the actual test case name.  Remember to pick unique prefixes for
+// different instances.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+//   INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int);
+
+#endif  // 0
+
+
+// Implements typed tests.
+
+#if GTEST_HAS_TYPED_TEST
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the typedef for the type parameters of the
+// given test case.
+# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_
+
+// The 'Types' template argument below must have spaces around it
+// since some compilers may choke on '>>' when passing a template
+// instance (e.g. Types<int>)
+# define TYPED_TEST_CASE(CaseName, Types) \
+  typedef ::testing::internal::TypeList< Types >::type \
+      GTEST_TYPE_PARAMS_(CaseName)
+
+# define TYPED_TEST(CaseName, TestName) \
+  template <typename gtest_TypeParam_> \
+  class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \
+      : public CaseName<gtest_TypeParam_> { \
+   private: \
+    typedef CaseName<gtest_TypeParam_> TestFixture; \
+    typedef gtest_TypeParam_ TypeParam; \
+    virtual void TestBody(); \
+  }; \
+  bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \
+      ::testing::internal::TypeParameterizedTest< \
+          CaseName, \
+          ::testing::internal::TemplateSel< \
+              GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \
+          GTEST_TYPE_PARAMS_(CaseName)>::Register(\
+              "", #CaseName, #TestName, 0); \
+  template <typename gtest_TypeParam_> \
+  void GTEST_TEST_CLASS_NAME_(CaseName, TestName)<gtest_TypeParam_>::TestBody()
+
+#endif  // GTEST_HAS_TYPED_TEST
+
+// Implements type-parameterized tests.
+
+#if GTEST_HAS_TYPED_TEST_P
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the namespace name that the type-parameterized tests for
+// the given type-parameterized test case are defined in.  The exact
+// name of the namespace is subject to change without notice.
+# define GTEST_CASE_NAMESPACE_(TestCaseName) \
+  gtest_case_##TestCaseName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the variable used to remember the names of
+// the defined tests in the given test case.
+# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \
+  gtest_typed_test_case_p_state_##TestCaseName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY.
+//
+// Expands to the name of the variable used to remember the names of
+// the registered tests in the given test case.
+# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \
+  gtest_registered_test_names_##TestCaseName##_
+
+// The variables defined in the type-parameterized test macros are
+// static as typically these macros are used in a .h file that can be
+// #included in multiple translation units linked together.
+# define TYPED_TEST_CASE_P(CaseName) \
+  static ::testing::internal::TypedTestCasePState \
+      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName)
+
+# define TYPED_TEST_P(CaseName, TestName) \
+  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
+  template <typename gtest_TypeParam_> \
+  class TestName : public CaseName<gtest_TypeParam_> { \
+   private: \
+    typedef CaseName<gtest_TypeParam_> TestFixture; \
+    typedef gtest_TypeParam_ TypeParam; \
+    virtual void TestBody(); \
+  }; \
+  static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
+      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\
+          __FILE__, __LINE__, #CaseName, #TestName); \
+  } \
+  template <typename gtest_TypeParam_> \
+  void GTEST_CASE_NAMESPACE_(CaseName)::TestName<gtest_TypeParam_>::TestBody()
+
+# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \
+  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
+  typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \
+  } \
+  static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \
+      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\
+          __FILE__, __LINE__, #__VA_ARGS__)
+
+// The 'Types' template argument below must have spaces around it
+// since some compilers may choke on '>>' when passing a template
+// instance (e.g. Types<int>)
+# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \
+  bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \
+      ::testing::internal::TypeParameterizedTestCase<CaseName, \
+          GTEST_CASE_NAMESPACE_(CaseName)::gtest_AllTests_, \
+          ::testing::internal::TypeList< Types >::type>::Register(\
+              #Prefix, #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName))
+
+#endif  // GTEST_HAS_TYPED_TEST_P
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+
+// Depending on the platform, different string classes are available.
+// On Linux, in addition to ::std::string, Google also makes use of
+// class ::string, which has the same interface as ::std::string, but
+// has a different implementation.
+//
+// The user can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that
+// ::string is available AND is a distinct type to ::std::string, or
+// define it to 0 to indicate otherwise.
+//
+// If the user's ::std::string and ::string are the same class due to
+// aliasing, he should define GTEST_HAS_GLOBAL_STRING to 0.
+//
+// If the user doesn't define GTEST_HAS_GLOBAL_STRING, it is defined
+// heuristically.
+
+namespace testing {
+
+// Declares the flags.
+
+// This flag temporarily enables the disabled tests.
+GTEST_DECLARE_bool_(also_run_disabled_tests);
+
+// This flag causes Google Test to break into the debugger on an assertion
+// failure.
+GTEST_DECLARE_bool_(break_on_failure);
+
+// This flag controls whether Google Test catches all test-thrown exceptions
+// and logs them as failures.
+GTEST_DECLARE_bool_(catch_exceptions);
+
+// This flag enables using colors in terminal output. Available values are
+// "yes" to enable colors, "no" (disable colors), or "auto" (the default)
+// to let Google Test decide.
+GTEST_DECLARE_string_(color);
+
+// This flag sets up the filter that selects, by name and using a glob
+// pattern, the tests to run. If the filter is not given, all tests are executed.
+GTEST_DECLARE_string_(filter);
+
+// This flag causes Google Test to list tests. None of the tests listed
+// are actually run if the flag is provided.
+GTEST_DECLARE_bool_(list_tests);
+
+// This flag controls whether Google Test emits a detailed XML report to a file
+// in addition to its normal textual output.
+GTEST_DECLARE_string_(output);
+
+// This flag controls whether Google Test prints the elapsed time for each
+// test.
+GTEST_DECLARE_bool_(print_time);
+
+// This flag specifies the random number seed.
+GTEST_DECLARE_int32_(random_seed);
+
+// This flag sets how many times the tests are repeated. The default value
+// is 1. If the value is -1, the tests are repeated forever.
+GTEST_DECLARE_int32_(repeat);
+
+// This flag controls whether Google Test includes Google Test internal
+// stack frames in failure stack traces.
+GTEST_DECLARE_bool_(show_internal_stack_frames);
+
+// When this flag is specified, tests' order is randomized on every iteration.
+GTEST_DECLARE_bool_(shuffle);
+
+// This flag specifies the maximum number of stack frames to be
+// printed in a failure message.
+GTEST_DECLARE_int32_(stack_trace_depth);
+
+// When this flag is specified, a failed assertion will throw an
+// exception if exceptions are enabled, or exit the program with a
+// non-zero code otherwise.
+GTEST_DECLARE_bool_(throw_on_failure);
+
+// When this flag is set with a "host:port" string, on supported
+// platforms test results are streamed to the specified port on
+// the specified host machine.
+GTEST_DECLARE_string_(stream_result_to);
+
+// The upper limit for valid stack trace depths.
+const int kMaxStackTraceDepth = 100;
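+
+// The flags declared above are normally set on the command line, e.g.
+// (hypothetical binary name):
+//
+//   ./my_test --gtest_filter=FooTest.* --gtest_repeat=10 --gtest_shuffle
+//
+// They can also be assigned programmatically before RUN_ALL_TESTS() via the
+// GTEST_FLAG() accessor (a sketch, assuming the usual definition from
+// gtest-port.h):
+//
+//   ::testing::GTEST_FLAG(filter) = "FooTest.*";
+//   ::testing::GTEST_FLAG(repeat) = 2;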
+
+namespace internal {
+
+class AssertHelper;
+class DefaultGlobalTestPartResultReporter;
+class ExecDeathTest;
+class NoExecDeathTest;
+class FinalSuccessChecker;
+class GTestFlagSaver;
+class StreamingListenerTest;
+class TestResultAccessor;
+class TestEventListenersAccessor;
+class TestEventRepeater;
+class UnitTestRecordPropertyTestHelper;
+class WindowsDeathTest;
+class UnitTestImpl* GetUnitTestImpl();
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+                                    const std::string& message);
+
+}  // namespace internal
+
+// The friend relationship of some of these classes is cyclic.
+// If we don't forward-declare them, the compiler might confuse the classes
+// in friendship clauses with same-named classes in the enclosing scope.
+class Test;
+class TestCase;
+class TestInfo;
+class UnitTest;
+
+// A class for indicating whether an assertion was successful.  When
+// the assertion wasn't successful, the AssertionResult object
+// remembers a non-empty message that describes how it failed.
+//
+// To create an instance of this class, use one of the factory functions
+// (AssertionSuccess() and AssertionFailure()).
+//
+// This class is useful for two purposes:
+//   1. Defining predicate functions to be used with Boolean test assertions
+//      EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
+//   2. Defining predicate-format functions to be
+//      used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
+//
+// For example, if you define IsEven predicate:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess();
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
+// will print the message
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false (5 is odd)
+//   Expected: true
+//
+// instead of a more opaque
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false
+//   Expected: true
+//
+// in case IsEven is a simple Boolean predicate.
+//
+// If you expect your predicate to be reused and want to support informative
+// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
+// about half as often as positive ones in our tests), supply messages for
+// both success and failure cases:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess() << n << " is even";
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
+//
+//   Value of: IsEven(Fib(6))
+//     Actual: true (8 is even)
+//   Expected: false
+//
+// NB: Predicates that support negative Boolean assertions have reduced
+// performance in positive ones so be careful not to use them in tests
+// that have lots (tens of thousands) of positive Boolean assertions.
+//
+// To use this class with EXPECT_PRED_FORMAT assertions such as:
+//
+//   // Verifies that Foo() returns an even number.
+//   EXPECT_PRED_FORMAT1(IsEven, Foo());
+//
+// you need to define:
+//
+//   testing::AssertionResult IsEven(const char* expr, int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess();
+//     else
+//       return testing::AssertionFailure()
+//         << "Expected: " << expr << " is even\n  Actual: it's " << n;
+//   }
+//
+// If Foo() returns 5, you will see the following message:
+//
+//   Expected: Foo() is even
+//     Actual: it's 5
+//
+class GTEST_API_ AssertionResult {
+ public:
+  // Copy constructor.
+  // Used in EXPECT_TRUE/FALSE(assertion_result).
+  AssertionResult(const AssertionResult& other);
+  // Used in the EXPECT_TRUE/FALSE(bool_expression).
+  explicit AssertionResult(bool success) : success_(success) {}
+
+  // Returns true iff the assertion succeeded.
+  operator bool() const { return success_; }  // NOLINT
+
+  // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+  AssertionResult operator!() const;
+
+  // Returns the text streamed into this AssertionResult. Test assertions
+  // use it when they fail (i.e., the predicate's outcome doesn't match the
+  // assertion's expectation). When nothing has been streamed into the
+  // object, returns an empty string.
+  const char* message() const {
+    return message_.get() != NULL ?  message_->c_str() : "";
+  }
+  // TODO(vladl@google.com): Remove this after making sure no clients use it.
+  // Deprecated; please use message() instead.
+  const char* failure_message() const { return message(); }
+
+  // Streams a custom failure message into this object.
+  template <typename T> AssertionResult& operator<<(const T& value) {
+    AppendMessage(Message() << value);
+    return *this;
+  }
+
+  // Allows streaming basic output manipulators such as endl or flush into
+  // this object.
+  AssertionResult& operator<<(
+      ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
+    AppendMessage(Message() << basic_manipulator);
+    return *this;
+  }
+
+ private:
+  // Appends the contents of message to message_.
+  void AppendMessage(const Message& a_message) {
+    if (message_.get() == NULL)
+      message_.reset(new ::std::string);
+    message_->append(a_message.GetString().c_str());
+  }
+
+  // Stores result of the assertion predicate.
+  bool success_;
+  // Stores the message describing the condition in case the expectation
+  // construct is not satisfied with the predicate's outcome.
+  // Referenced via a pointer to avoid taking too much stack frame space
+  // with test assertions.
+  internal::scoped_ptr< ::std::string> message_;
+
+  GTEST_DISALLOW_ASSIGN_(AssertionResult);
+};
+
+// Makes a successful assertion result.
+GTEST_API_ AssertionResult AssertionSuccess();
+
+// Makes a failed assertion result.
+GTEST_API_ AssertionResult AssertionFailure();
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << msg.
+GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
+
+// The abstract class that all tests inherit from.
+//
+// In Google Test, a unit test program contains one or many TestCases, and
+// each TestCase contains one or many Tests.
+//
+// When you define a test using the TEST macro, you don't need to
+// explicitly derive from Test - the TEST macro automatically does
+// this for you.
+//
+// The only time you derive from Test is when defining a test fixture
+// to be used in a TEST_F.  For example:
+//
+//   class FooTest : public testing::Test {
+//    protected:
+//     virtual void SetUp() { ... }
+//     virtual void TearDown() { ... }
+//     ...
+//   };
+//
+//   TEST_F(FooTest, Bar) { ... }
+//   TEST_F(FooTest, Baz) { ... }
+//
+// Test is not copyable.
+class GTEST_API_ Test {
+ public:
+  friend class TestInfo;
+
+  // Defines types for pointers to functions that set up and tear down
+  // a test case.
+  typedef internal::SetUpTestCaseFunc SetUpTestCaseFunc;
+  typedef internal::TearDownTestCaseFunc TearDownTestCaseFunc;
+
+  // The d'tor is virtual as we intend to inherit from Test.
+  virtual ~Test();
+
+  // Sets up the stuff shared by all tests in this test case.
+  //
+  // Google Test will call Foo::SetUpTestCase() before running the first
+  // test in test case Foo.  Hence a sub-class can define its own
+  // SetUpTestCase() method to shadow the one defined in the super
+  // class.
+  static void SetUpTestCase() {}
+
+  // Tears down the stuff shared by all tests in this test case.
+  //
+  // Google Test will call Foo::TearDownTestCase() after running the last
+  // test in test case Foo.  Hence a sub-class can define its own
+  // TearDownTestCase() method to shadow the one defined in the super
+  // class.
+  static void TearDownTestCase() {}
+
+  // Returns true iff the current test has a fatal failure.
+  static bool HasFatalFailure();
+
+  // Returns true iff the current test has a non-fatal failure.
+  static bool HasNonfatalFailure();
+
+  // Returns true iff the current test has a (either fatal or
+  // non-fatal) failure.
+  static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); }
+
+  // Logs a property for the current test, test case, or for the entire
+  // invocation of the test program when used outside of the context of a
+  // test case.  Only the last value for a given key is remembered.  These
+  // are public static so they can be called from utility functions that are
+  // not members of the test fixture.  Calls to RecordProperty made during
+  // lifespan of the test (from the moment its constructor starts to the
+  // moment its destructor finishes) will be output in XML as attributes of
+  // the <testcase> element.  Properties recorded from fixture's
+  // SetUpTestCase or TearDownTestCase are logged as attributes of the
+  // corresponding <testsuite> element.  Calls to RecordProperty made in the
+  // global context (before or after invocation of RUN_ALL_TESTS and from
+  // SetUp/TearDown method of Environment objects registered with Google
+  // Test) will be output as attributes of the <testsuites> element.
+  static void RecordProperty(const std::string& key, const std::string& value);
+  static void RecordProperty(const std::string& key, int value);
+
+ protected:
+  // Creates a Test object.
+  Test();
+
+  // Sets up the test fixture.
+  virtual void SetUp();
+
+  // Tears down the test fixture.
+  virtual void TearDown();
+
+ private:
+  // Returns true iff the current test has the same fixture class as
+  // the first test in the current test case.
+  static bool HasSameFixtureClass();
+
+  // Runs the test after the test fixture has been set up.
+  //
+  // A sub-class must implement this to define the test logic.
+  //
+  // DO NOT OVERRIDE THIS FUNCTION DIRECTLY IN A USER PROGRAM.
+  // Instead, use the TEST or TEST_F macro.
+  virtual void TestBody() = 0;
+
+  // Sets up, executes, and tears down the test.
+  void Run();
+
+  // Deletes self.  We deliberately pick an unusual name for this
+  // internal method to avoid clashing with names used in user TESTs.
+  void DeleteSelf_() { delete this; }
+
+  // Uses a GTestFlagSaver to save and restore all Google Test flags.
+  const internal::GTestFlagSaver* const gtest_flag_saver_;
+
+  // Often a user mis-spells SetUp() as Setup() and spends a long time
+  // wondering why it is never called by Google Test.  The declaration of
+  // the following method is solely for catching such an error at
+  // compile time:
+  //
+  //   - The return type is deliberately chosen to be not void, so it
+  //   will be a conflict if a user declares void Setup() in his test
+  //   fixture.
+  //
+  //   - This method is private, so it will be another compiler error
+  //   if a user calls it from his test fixture.
+  //
+  // DO NOT OVERRIDE THIS FUNCTION.
+  //
+  // If you see an error about overriding the following function or
+  // about it being private, you have mis-spelled SetUp() as Setup().
+  struct Setup_should_be_spelled_SetUp {};
+  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+
+  // We disallow copying Tests.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Test);
+};
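+
+// A brief fixture sketch tying the pieces above together (FooTest,
+// Resource, and shared_resource_ are hypothetical; the static member still
+// needs an out-of-class definition in a .cc file):
+//
+//   class FooTest : public testing::Test {
+//    protected:
+//     static void SetUpTestCase() { shared_resource_ = new Resource; }
+//     static void TearDownTestCase() {
+//       delete shared_resource_;
+//       shared_resource_ = NULL;
+//     }
+//     virtual void SetUp() { RecordProperty("fixture", "FooTest"); }
+//     static Resource* shared_resource_;
+//   };
+//
+//   TEST_F(FooTest, UsesSharedResource) {
+//     EXPECT_TRUE(shared_resource_ != NULL);
+//   }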
+
+typedef internal::TimeInMillis TimeInMillis;
+
+// A copyable object representing a user specified test property which can be
+// output as a key/value string pair.
+//
+// Don't inherit from TestProperty as its destructor is not virtual.
+class TestProperty {
+ public:
+  // C'tor.  TestProperty does NOT have a default constructor.
+  // Always use this constructor (with parameters) to create a
+  // TestProperty object.
+  TestProperty(const std::string& a_key, const std::string& a_value) :
+    key_(a_key), value_(a_value) {
+  }
+
+  // Gets the user supplied key.
+  const char* key() const {
+    return key_.c_str();
+  }
+
+  // Gets the user supplied value.
+  const char* value() const {
+    return value_.c_str();
+  }
+
+  // Sets a new value, overriding the one supplied in the constructor.
+  void SetValue(const std::string& new_value) {
+    value_ = new_value;
+  }
+
+ private:
+  // The key supplied by the user.
+  std::string key_;
+  // The value supplied by the user.
+  std::string value_;
+};
+
+// The result of a single Test.  This includes a list of
+// TestPartResults, a list of TestProperties, a count of how many
+// death tests there are in the Test, and how much time it took to run
+// the Test.
+//
+// TestResult is not copyable.
+class GTEST_API_ TestResult {
+ public:
+  // Creates an empty TestResult.
+  TestResult();
+
+  // D'tor.  Do not inherit from TestResult.
+  ~TestResult();
+
+  // Gets the number of all test parts.  This is the sum of the number
+  // of successful test parts and the number of failed test parts.
+  int total_part_count() const;
+
+  // Returns the number of the test properties.
+  int test_property_count() const;
+
+  // Returns true iff the test passed (i.e. no test part failed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the test failed.
+  bool Failed() const;
+
+  // Returns true iff the test fatally failed.
+  bool HasFatalFailure() const;
+
+  // Returns true iff the test has a non-fatal failure.
+  bool HasNonfatalFailure() const;
+
+  // Returns the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns the i-th test part result among all the results. i can range
+  // from 0 to total_part_count() - 1. If i is not in that range, aborts
+  // the program.
+  const TestPartResult& GetTestPartResult(int i) const;
+
+  // Returns the i-th test property. i can range from 0 to
+  // test_property_count() - 1. If i is not in that range, aborts the
+  // program.
+  const TestProperty& GetTestProperty(int i) const;
+
+ private:
+  friend class TestInfo;
+  friend class TestCase;
+  friend class UnitTest;
+  friend class internal::DefaultGlobalTestPartResultReporter;
+  friend class internal::ExecDeathTest;
+  friend class internal::TestResultAccessor;
+  friend class internal::UnitTestImpl;
+  friend class internal::WindowsDeathTest;
+
+  // Gets the vector of TestPartResults.
+  const std::vector<TestPartResult>& test_part_results() const {
+    return test_part_results_;
+  }
+
+  // Gets the vector of TestProperties.
+  const std::vector<TestProperty>& test_properties() const {
+    return test_properties_;
+  }
+
+  // Sets the elapsed time.
+  void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; }
+
+  // Adds a test property to the list. The property is validated and may add
+  // a non-fatal failure if invalid (e.g., if it conflicts with reserved
+  // key names). If a property is already recorded for the same key, the
+  // value will be updated, rather than storing multiple values for the same
+  // key.  xml_element specifies the element for which the property is being
+  // recorded and is used for validation.
+  void RecordProperty(const std::string& xml_element,
+                      const TestProperty& test_property);
+
+  // Adds a failure if the key is a reserved attribute of Google Test
+  // testcase tags.  Returns true if the property is valid.
+  // TODO(russr): Validate attribute names are legal and human readable.
+  static bool ValidateTestProperty(const std::string& xml_element,
+                                   const TestProperty& test_property);
+
+  // Adds a test part result to the list.
+  void AddTestPartResult(const TestPartResult& test_part_result);
+
+  // Returns the death test count.
+  int death_test_count() const { return death_test_count_; }
+
+  // Increments the death test count, returning the new count.
+  int increment_death_test_count() { return ++death_test_count_; }
+
+  // Clears the test part results.
+  void ClearTestPartResults();
+
+  // Clears the object.
+  void Clear();
+
+  // Protects mutable state of the property vector and of owned
+  // properties, whose values may be updated.
+  internal::Mutex test_properites_mutex_;
+
+  // The vector of TestPartResults
+  std::vector<TestPartResult> test_part_results_;
+  // The vector of TestProperties
+  std::vector<TestProperty> test_properties_;
+  // Running count of death tests.
+  int death_test_count_;
+  // The elapsed time, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+  // We disallow copying TestResult.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult);
+};  // class TestResult
+
+// A TestInfo object stores the following information about a test:
+//
+//   Test case name
+//   Test name
+//   Whether the test should be run
+//   A function pointer that creates the test object when invoked
+//   Test result
+//
+// The constructor of TestInfo registers itself with the UnitTest
+// singleton such that the RUN_ALL_TESTS() macro knows which tests to
+// run.
+class GTEST_API_ TestInfo {
+ public:
+  // Destructs a TestInfo object.  This function is not virtual, so
+  // don't inherit from TestInfo.
+  ~TestInfo();
+
+  // Returns the test case name.
+  const char* test_case_name() const { return test_case_name_.c_str(); }
+
+  // Returns the test name.
+  const char* name() const { return name_.c_str(); }
+
+  // Returns the name of the parameter type, or NULL if this is not a typed
+  // or a type-parameterized test.
+  const char* type_param() const {
+    if (type_param_.get() != NULL)
+      return type_param_->c_str();
+    return NULL;
+  }
+
+  // Returns the text representation of the value parameter, or NULL if this
+  // is not a value-parameterized test.
+  const char* value_param() const {
+    if (value_param_.get() != NULL)
+      return value_param_->c_str();
+    return NULL;
+  }
+
+  // Returns true if this test should run, that is if the test is not
+  // disabled (or it is disabled but the also_run_disabled_tests flag has
+  // been specified) and its full name matches the user-specified filter.
+  //
+  // Google Test allows the user to filter the tests by their full names.
+  // The full name of a test Bar in test case Foo is defined as
+  // "Foo.Bar".  Only the tests that match the filter will run.
+  //
+  // A filter is a colon-separated list of glob (not regex) patterns,
+  // optionally followed by a '-' and a colon-separated list of
+  // negative patterns (tests to exclude).  A test is run if it
+  // matches one of the positive patterns and does not match any of
+  // the negative patterns.
+  //
+  // For example, *A*:Foo.* is a filter that matches any string that
+  // contains the character 'A' or starts with "Foo.".
+  bool should_run() const { return should_run_; }
+
+  // Returns true iff this test will appear in the XML report.
+  bool is_reportable() const {
+    // For now, the XML report includes all tests matching the filter.
+    // In the future, we may trim tests that are excluded because of
+    // sharding.
+    return matches_filter_;
+  }
+
+  // Returns the result of the test.
+  const TestResult* result() const { return &result_; }
+
+ private:
+#if GTEST_HAS_DEATH_TEST
+  friend class internal::DefaultDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+  friend class Test;
+  friend class TestCase;
+  friend class internal::UnitTestImpl;
+  friend class internal::StreamingListenerTest;
+  friend TestInfo* internal::MakeAndRegisterTestInfo(
+      const char* test_case_name,
+      const char* name,
+      const char* type_param,
+      const char* value_param,
+      internal::TypeId fixture_class_id,
+      Test::SetUpTestCaseFunc set_up_tc,
+      Test::TearDownTestCaseFunc tear_down_tc,
+      internal::TestFactoryBase* factory);
+
+  // Constructs a TestInfo object. The newly constructed instance assumes
+  // ownership of the factory object.
+  TestInfo(const std::string& test_case_name,
+           const std::string& name,
+           const char* a_type_param,   // NULL if not a type-parameterized test
+           const char* a_value_param,  // NULL if not a value-parameterized test
+           internal::TypeId fixture_class_id,
+           internal::TestFactoryBase* factory);
+
+  // Increments the number of death tests encountered in this test so
+  // far.
+  int increment_death_test_count() {
+    return result_.increment_death_test_count();
+  }
+
+  // Creates the test object, runs it, records its result, and then
+  // deletes it.
+  void Run();
+
+  static void ClearTestResult(TestInfo* test_info) {
+    test_info->result_.Clear();
+  }
+
+  // These fields are immutable properties of the test.
+  const std::string test_case_name_;     // Test case name
+  const std::string name_;               // Test name
+  // Name of the parameter type, or NULL if this is not a typed or a
+  // type-parameterized test.
+  const internal::scoped_ptr<const ::std::string> type_param_;
+  // Text representation of the value parameter, or NULL if this is not a
+  // value-parameterized test.
+  const internal::scoped_ptr<const ::std::string> value_param_;
+  const internal::TypeId fixture_class_id_;   // ID of the test fixture class
+  bool should_run_;                 // True iff this test should run
+  bool is_disabled_;                // True iff this test is disabled
+  bool matches_filter_;             // True if this test matches the
+                                    // user-specified filter.
+  internal::TestFactoryBase* const factory_;  // The factory that creates
+                                              // the test object
+
+  // This field is mutable and needs to be reset before running the
+  // test for the second time.
+  TestResult result_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
+};
+
+// A test case, which consists of a vector of TestInfos.
+//
+// TestCase is not copyable.
+class GTEST_API_ TestCase {
+ public:
+  // Creates a TestCase with the given name.
+  //
+  // TestCase does NOT have a default constructor.  Always use this
+  // constructor to create a TestCase object.
+  //
+  // Arguments:
+  //
+  //   name:         name of the test case
+  //   a_type_param: the name of the test's type parameter, or NULL if
+  //                 this is not a type-parameterized test.
+  //   set_up_tc:    pointer to the function that sets up the test case
+  //   tear_down_tc: pointer to the function that tears down the test case
+  TestCase(const char* name, const char* a_type_param,
+           Test::SetUpTestCaseFunc set_up_tc,
+           Test::TearDownTestCaseFunc tear_down_tc);
+
+  // Destructor of TestCase.
+  virtual ~TestCase();
+
+  // Gets the name of the TestCase.
+  const char* name() const { return name_.c_str(); }
+
+  // Returns the name of the parameter type, or NULL if this is not a
+  // type-parameterized test case.
+  const char* type_param() const {
+    if (type_param_.get() != NULL)
+      return type_param_->c_str();
+    return NULL;
+  }
+
+  // Returns true if any test in this test case should run.
+  bool should_run() const { return should_run_; }
+
+  // Gets the number of successful tests in this test case.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests in this test case.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests in this test case.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of tests in this test case that should run.
+  int test_to_run_count() const;
+
+  // Gets the number of all tests in this test case.
+  int total_test_count() const;
+
+  // Returns true iff the test case passed.
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the test case failed.
+  bool Failed() const { return failed_test_count() > 0; }
+
+  // Returns the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns the i-th test among all the tests. i can range from 0 to
+  // total_test_count() - 1. If i is not in that range, returns NULL.
+  const TestInfo* GetTestInfo(int i) const;
+
+  // Returns the TestResult that holds test properties recorded during
+  // execution of SetUpTestCase and TearDownTestCase.
+  const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; }
+
+ private:
+  friend class Test;
+  friend class internal::UnitTestImpl;
+
+  // Gets the (mutable) vector of TestInfos in this TestCase.
+  std::vector<TestInfo*>& test_info_list() { return test_info_list_; }
+
+  // Gets the (immutable) vector of TestInfos in this TestCase.
+  const std::vector<TestInfo*>& test_info_list() const {
+    return test_info_list_;
+  }
+
+  // Returns the i-th test among all the tests. i can range from 0 to
+  // total_test_count() - 1. If i is not in that range, returns NULL.
+  TestInfo* GetMutableTestInfo(int i);
+
+  // Sets the should_run member.
+  void set_should_run(bool should) { should_run_ = should; }
+
+  // Adds a TestInfo to this test case.  Will delete the TestInfo upon
+  // destruction of the TestCase object.
+  void AddTestInfo(TestInfo * test_info);
+
+  // Clears the results of all tests in this test case.
+  void ClearResult();
+
+  // Clears the results of all tests in the given test case.
+  static void ClearTestCaseResult(TestCase* test_case) {
+    test_case->ClearResult();
+  }
+
+  // Runs every test in this TestCase.
+  void Run();
+
+  // Runs SetUpTestCase() for this TestCase.  This wrapper is needed
+  // for catching exceptions thrown from SetUpTestCase().
+  void RunSetUpTestCase() { (*set_up_tc_)(); }
+
+  // Runs TearDownTestCase() for this TestCase.  This wrapper is
+  // needed for catching exceptions thrown from TearDownTestCase().
+  void RunTearDownTestCase() { (*tear_down_tc_)(); }
+
+  // Returns true iff test passed.
+  static bool TestPassed(const TestInfo* test_info) {
+    return test_info->should_run() && test_info->result()->Passed();
+  }
+
+  // Returns true iff test failed.
+  static bool TestFailed(const TestInfo* test_info) {
+    return test_info->should_run() && test_info->result()->Failed();
+  }
+
+  // Returns true iff the test is disabled and will be reported in the XML
+  // report.
+  static bool TestReportableDisabled(const TestInfo* test_info) {
+    return test_info->is_reportable() && test_info->is_disabled_;
+  }
+
+  // Returns true iff test is disabled.
+  static bool TestDisabled(const TestInfo* test_info) {
+    return test_info->is_disabled_;
+  }
+
+  // Returns true iff this test will appear in the XML report.
+  static bool TestReportable(const TestInfo* test_info) {
+    return test_info->is_reportable();
+  }
+
+  // Returns true if the given test should run.
+  static bool ShouldRunTest(const TestInfo* test_info) {
+    return test_info->should_run();
+  }
+
+  // Shuffles the tests in this test case.
+  void ShuffleTests(internal::Random* random);
+
+  // Restores the test order to before the first shuffle.
+  void UnshuffleTests();
+
+  // Name of the test case.
+  std::string name_;
+  // Name of the parameter type, or NULL if this is not a typed or a
+  // type-parameterized test.
+  const internal::scoped_ptr<const ::std::string> type_param_;
+  // The vector of TestInfos in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestInfo*> test_info_list_;
+  // Provides a level of indirection for the test list to allow easy
+  // shuffling and restoring the test order.  The i-th element in this
+  // vector is the index of the i-th test in the shuffled test list.
+  std::vector<int> test_indices_;
+  // Pointer to the function that sets up the test case.
+  Test::SetUpTestCaseFunc set_up_tc_;
+  // Pointer to the function that tears down the test case.
+  Test::TearDownTestCaseFunc tear_down_tc_;
+  // True iff any test in this test case should run.
+  bool should_run_;
+  // Elapsed time, in milliseconds.
+  TimeInMillis elapsed_time_;
+  // Holds test properties recorded during execution of SetUpTestCase and
+  // TearDownTestCase.
+  TestResult ad_hoc_test_result_;
+
+  // We disallow copying TestCases.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestCase);
+};
+
+// An Environment object is capable of setting up and tearing down an
+// environment.  The user should subclass this to define his own
+// environment(s).
+//
+// An Environment object does the set-up and tear-down in virtual
+// methods SetUp() and TearDown() instead of the constructor and the
+// destructor, as:
+//
+//   1. You cannot safely throw from a destructor.  This is a problem
+//      as in some cases Google Test is used where exceptions are enabled, and
+//      we may want to implement ASSERT_* using exceptions where they are
+//      available.
+//   2. You cannot use ASSERT_* directly in a constructor or
+//      destructor.
+class Environment {
+ public:
+  // The d'tor is virtual as we need to subclass Environment.
+  virtual ~Environment() {}
+
+  // Override this to define how to set up the environment.
+  virtual void SetUp() {}
+
+  // Override this to define how to tear down the environment.
+  virtual void TearDown() {}
+ private:
+  // If you see an error about overriding the following function or
+  // about it being private, you have mis-spelled SetUp() as Setup().
+  struct Setup_should_be_spelled_SetUp {};
+  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+};
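+
+// A minimal sketch of a user-defined environment (FooEnvironment and its
+// method bodies are illustrative):
+//
+//   class FooEnvironment : public ::testing::Environment {
+//    public:
+//     virtual void SetUp() { /* acquire resources shared by all tests */ }
+//     virtual void TearDown() { /* release those resources */ }
+//   };
+//
+//   // Register it, typically in main() before RUN_ALL_TESTS():
+//   ::testing::AddGlobalTestEnvironment(new FooEnvironment);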
+
+// The interface for tracing execution of tests. The methods are organized in
+// the order the corresponding events are fired.
+class TestEventListener {
+ public:
+  virtual ~TestEventListener() {}
+
+  // Fired before any test activity starts.
+  virtual void OnTestProgramStart(const UnitTest& unit_test) = 0;
+
+  // Fired before each iteration of tests starts.  There may be more than
+  // one iteration if GTEST_FLAG(repeat) is set. iteration is the iteration
+  // index, starting from 0.
+  virtual void OnTestIterationStart(const UnitTest& unit_test,
+                                    int iteration) = 0;
+
+  // Fired before environment set-up for each iteration of tests starts.
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test) = 0;
+
+  // Fired after environment set-up for each iteration of tests ends.
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0;
+
+  // Fired before the test case starts.
+  virtual void OnTestCaseStart(const TestCase& test_case) = 0;
+
+  // Fired before the test starts.
+  virtual void OnTestStart(const TestInfo& test_info) = 0;
+
+  // Fired after a failed assertion or a SUCCEED() invocation.
+  virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0;
+
+  // Fired after the test ends.
+  virtual void OnTestEnd(const TestInfo& test_info) = 0;
+
+  // Fired after the test case ends.
+  virtual void OnTestCaseEnd(const TestCase& test_case) = 0;
+
+  // Fired before environment tear-down for each iteration of tests starts.
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0;
+
+  // Fired after environment tear-down for each iteration of tests ends.
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0;
+
+  // Fired after each iteration of tests finishes.
+  virtual void OnTestIterationEnd(const UnitTest& unit_test,
+                                  int iteration) = 0;
+
+  // Fired after all test activities have ended.
+  virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0;
+};
+
+// The convenience class for users who need to override just one or two
+// methods, accepting that a change to the signature of an overridden
+// method will not be caught during the build.  For comments about each
+// method please see the definition of TestEventListener above.
+class EmptyTestEventListener : public TestEventListener {
+ public:
+  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationStart(const UnitTest& /*unit_test*/,
+                                    int /*iteration*/) {}
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestCaseStart(const TestCase& /*test_case*/) {}
+  virtual void OnTestStart(const TestInfo& /*test_info*/) {}
+  virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) {}
+  virtual void OnTestEnd(const TestInfo& /*test_info*/) {}
+  virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {}
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationEnd(const UnitTest& /*unit_test*/,
+                                  int /*iteration*/) {}
+  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+};
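+
+// A minimal sketch of a listener built on EmptyTestEventListener
+// (MinimalResultPrinter is an illustrative name):
+//
+//   class MinimalResultPrinter : public ::testing::EmptyTestEventListener {
+//     virtual void OnTestEnd(const ::testing::TestInfo& test_info) {
+//       printf("%s.%s %s\n", test_info.test_case_name(), test_info.name(),
+//              test_info.result()->Passed() ? "passed" : "FAILED");
+//     }
+//   };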
+
+// TestEventListeners lets users add listeners to track events in Google Test.
+class GTEST_API_ TestEventListeners {
+ public:
+  TestEventListeners();
+  ~TestEventListeners();
+
+  // Appends an event listener to the end of the list. Google Test assumes
+  // the ownership of the listener (i.e. it will delete the listener when
+  // the test program finishes).
+  void Append(TestEventListener* listener);
+
+  // Removes the given event listener from the list and returns it.  It then
+  // becomes the caller's responsibility to delete the listener. Returns
+  // NULL if the listener is not found in the list.
+  TestEventListener* Release(TestEventListener* listener);
+
+  // Returns the standard listener responsible for the default console
+  // output.  Can be removed from the listeners list to shut down default
+  // console output.  Note that removing this object from the listener list
+  // with Release transfers its ownership to the caller and makes this
+  // function return NULL the next time.
+  TestEventListener* default_result_printer() const {
+    return default_result_printer_;
+  }
+
+  // Returns the standard listener responsible for the default XML output
+  // controlled by the --gtest_output=xml flag.  Can be removed from the
+  // listeners list by users who want to shut down the default XML output
+  // controlled by this flag and substitute it with custom one.  Note that
+  // removing this object from the listener list with Release transfers its
+  // ownership to the caller and makes this function return NULL the next
+  // time.
+  TestEventListener* default_xml_generator() const {
+    return default_xml_generator_;
+  }
+
+ private:
+  friend class TestCase;
+  friend class TestInfo;
+  friend class internal::DefaultGlobalTestPartResultReporter;
+  friend class internal::NoExecDeathTest;
+  friend class internal::TestEventListenersAccessor;
+  friend class internal::UnitTestImpl;
+
+  // Returns repeater that broadcasts the TestEventListener events to all
+  // subscribers.
+  TestEventListener* repeater();
+
+  // Sets the default_result_printer attribute to the provided listener.
+  // The listener is also added to the listener list and previous
+  // default_result_printer is removed from it and deleted. The listener can
+  // also be NULL in which case it will not be added to the list. Does
+  // nothing if the previous and the current listener objects are the same.
+  void SetDefaultResultPrinter(TestEventListener* listener);
+
+  // Sets the default_xml_generator attribute to the provided listener.  The
+  // listener is also added to the listener list and previous
+  // default_xml_generator is removed from it and deleted. The listener can
+  // also be NULL in which case it will not be added to the list. Does
+  // nothing if the previous and the current listener objects are the same.
+  void SetDefaultXmlGenerator(TestEventListener* listener);
+
+  // Controls whether events will be forwarded by the repeater to the
+  // listeners in the list.
+  bool EventForwardingEnabled() const;
+  void SuppressEventForwarding();
+
+  // The actual list of listeners.
+  internal::TestEventRepeater* repeater_;
+  // Listener responsible for the standard result output.
+  TestEventListener* default_result_printer_;
+  // Listener responsible for the creation of the XML output file.
+  TestEventListener* default_xml_generator_;
+
+  // We disallow copying TestEventListeners.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
+};
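+
+// A minimal registration sketch (assumes a listener type such as the
+// MinimalResultPrinter sketched above):
+//
+//   ::testing::TestEventListeners& listeners =
+//       ::testing::UnitTest::GetInstance()->listeners();
+//   // Optionally silence the default console output first:
+//   delete listeners.Release(listeners.default_result_printer());
+//   listeners.Append(new MinimalResultPrinter);  // Google Test takes ownership.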
+
+// A UnitTest consists of a vector of TestCases.
+//
+// This is a singleton class.  The only instance of UnitTest is
+// created when UnitTest::GetInstance() is first called.  This
+// instance is never deleted.
+//
+// UnitTest is not copyable.
+//
+// This class is thread-safe as long as the methods are called
+// according to their specification.
+class GTEST_API_ UnitTest {
+ public:
+  // Gets the singleton UnitTest object.  The first time this method
+  // is called, a UnitTest object is constructed and returned.
+  // Consecutive calls will return the same object.
+  static UnitTest* GetInstance();
+
+  // Runs all tests in this UnitTest object and prints the result.
+  // Returns 0 if successful, or 1 otherwise.
+  //
+  // This method can only be called from the main thread.
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  int Run() GTEST_MUST_USE_RESULT_;
+
+  // Returns the working directory when the first TEST() or TEST_F()
+  // was executed.  The UnitTest object owns the string.
+  const char* original_working_dir() const;
+
+  // Returns the TestCase object for the test that's currently running,
+  // or NULL if no test is running.
+  const TestCase* current_test_case() const
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Returns the TestInfo object for the test that's currently running,
+  // or NULL if no test is running.
+  const TestInfo* current_test_info() const
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Returns the random seed used at the start of the current test run.
+  int random_seed() const;
+
+#if GTEST_HAS_PARAM_TEST
+  // Returns the ParameterizedTestCaseRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  internal::ParameterizedTestCaseRegistry& parameterized_test_registry()
+      GTEST_LOCK_EXCLUDED_(mutex_);
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Gets the number of successful test cases.
+  int successful_test_case_count() const;
+
+  // Gets the number of failed test cases.
+  int failed_test_case_count() const;
+
+  // Gets the number of all test cases.
+  int total_test_case_count() const;
+
+  // Gets the number of all test cases that contain at least one test
+  // that should run.
+  int test_case_to_run_count() const;
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const;
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const;
+
+  // Returns true iff the unit test passed (i.e. all test cases passed).
+  bool Passed() const;
+
+  // Returns true iff the unit test failed (i.e. some test case failed
+  // or something outside of all tests failed).
+  bool Failed() const;
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  const TestCase* GetTestCase(int i) const;
+
+  // Returns the TestResult containing information on test failures and
+  // properties logged outside of individual test cases.
+  const TestResult& ad_hoc_test_result() const;
+
+  // Returns the list of event listeners that can be used to track events
+  // inside Google Test.
+  TestEventListeners& listeners();
+
+ private:
+  // Registers and returns a global test environment.  When a test
+  // program is run, all global test environments will be set-up in
+  // the order they were registered.  After all tests in the program
+  // have finished, all global test environments will be torn-down in
+  // the *reverse* order they were registered.
+  //
+  // The UnitTest object takes ownership of the given environment.
+  //
+  // This method can only be called from the main thread.
+  Environment* AddEnvironment(Environment* env);
+
+  // Adds a TestPartResult to the current TestResult object.  All
+  // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc)
+  // eventually call this to report their results.  The user code
+  // should use the assertion macros instead of calling this directly.
+  void AddTestPartResult(TestPartResult::Type result_type,
+                         const char* file_name,
+                         int line_number,
+                         const std::string& message,
+                         const std::string& os_stack_trace)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Adds a TestProperty to the current TestResult object when invoked from
+  // inside a test, to current TestCase's ad_hoc_test_result_ when invoked
+  // from SetUpTestCase or TearDownTestCase, or to the global property set
+  // when invoked elsewhere.  If the result already contains a property with
+  // the same key, the value will be updated.
+  void RecordProperty(const std::string& key, const std::string& value);
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  TestCase* GetMutableTestCase(int i);
+
+  // Accessors for the implementation object.
+  internal::UnitTestImpl* impl() { return impl_; }
+  const internal::UnitTestImpl* impl() const { return impl_; }
+
+  // These classes and functions are friends as they need to access private
+  // members of UnitTest.
+  friend class Test;
+  friend class internal::AssertHelper;
+  friend class internal::ScopedTrace;
+  friend class internal::StreamingListenerTest;
+  friend class internal::UnitTestRecordPropertyTestHelper;
+  friend Environment* AddGlobalTestEnvironment(Environment* env);
+  friend internal::UnitTestImpl* internal::GetUnitTestImpl();
+  friend void internal::ReportFailureInUnknownLocation(
+      TestPartResult::Type result_type,
+      const std::string& message);
+
+  // Creates an empty UnitTest.
+  UnitTest();
+
+  // D'tor
+  virtual ~UnitTest();
+
+  // Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+  // Google Test trace stack.
+  void PushGTestTrace(const internal::TraceInfo& trace)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Pops a trace from the per-thread Google Test trace stack.
+  void PopGTestTrace()
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Protects mutable state in *impl_.  This is mutable as some const
+  // methods need to lock it too.
+  mutable internal::Mutex mutex_;
+
+  // Opaque implementation object.  This field is never changed once
+  // the object is constructed.  We don't mark it as const here, as
+  // doing so will cause a warning in the constructor of UnitTest.
+  // Mutable state in *impl_ is protected by mutex_.
+  internal::UnitTestImpl* impl_;
+
+  // We disallow copying UnitTest.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
+};
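+
+// A minimal introspection sketch (e.g. from a test event listener or after
+// RUN_ALL_TESTS() has returned):
+//
+//   const ::testing::UnitTest& unit_test = *::testing::UnitTest::GetInstance();
+//   printf("%d of %d tests failed.\n",
+//          unit_test.failed_test_count(), unit_test.total_test_count());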
+
+// A convenient wrapper for adding an environment for the test
+// program.
+//
+// You should call this before RUN_ALL_TESTS() is called, probably in
+// main().  If you use gtest_main, you need to call this before main()
+// starts for it to take effect.  For example, you can define a global
+// variable like this:
+//
+//   testing::Environment* const foo_env =
+//       testing::AddGlobalTestEnvironment(new FooEnvironment);
+//
+// However, we strongly recommend you to write your own main() and
+// call AddGlobalTestEnvironment() there, as relying on initialization
+// of global variables makes the code harder to read and may cause
+// problems when you register multiple environments from different
+// translation units and the environments have dependencies among them
+// (remember that the compiler doesn't guarantee the order in which
+// global variables from different translation units are initialized).
+inline Environment* AddGlobalTestEnvironment(Environment* env) {
+  return UnitTest::GetInstance()->AddEnvironment(env);
+}
+
+// Initializes Google Test.  This must be called before calling
+// RUN_ALL_TESTS().  In particular, it parses a command line for the
+// flags that Google Test recognizes.  Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+GTEST_API_ void InitGoogleTest(int* argc, char** argv);
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv);
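+
+// A typical main() is a thin wrapper around these two calls (gtest_main
+// provides an equivalent one):
+//
+//   int main(int argc, char** argv) {
+//     ::testing::InitGoogleTest(&argc, argv);
+//     return RUN_ALL_TESTS();
+//   }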
+
+namespace internal {
+
+// FormatForComparison<ToPrint, OtherOperand>::Format(value) formats a
+// value of type ToPrint that is an operand of a comparison assertion
+// (e.g. ASSERT_EQ).  OtherOperand is the type of the other operand in
+// the comparison, and is used to help determine the best way to
+// format the value.  In particular, when the value is a C string
+// (char pointer) and the other operand is an STL string object, we
+// want to format the C string as a string, since we know it is
+// compared by value with the string object.  If the value is a char
+// pointer but the other operand is not an STL string object, we don't
+// know whether the pointer is supposed to point to a NUL-terminated
+// string, and thus want to print it as a pointer to be safe.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// The default case.
+template <typename ToPrint, typename OtherOperand>
+class FormatForComparison {
+ public:
+  static ::std::string Format(const ToPrint& value) {
+    return ::testing::PrintToString(value);
+  }
+};
+
+// Array.
+template <typename ToPrint, size_t N, typename OtherOperand>
+class FormatForComparison<ToPrint[N], OtherOperand> {
+ public:
+  static ::std::string Format(const ToPrint* value) {
+    return FormatForComparison<const ToPrint*, OtherOperand>::Format(value);
+  }
+};
+
+// By default, print C strings as pointers to be safe, as we don't know
+// whether they actually point to a NUL-terminated string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType)                \
+  template <typename OtherOperand>                                      \
+  class FormatForComparison<CharType*, OtherOperand> {                  \
+   public:                                                              \
+    static ::std::string Format(CharType* value) {                      \
+      return ::testing::PrintToString(static_cast<const void*>(value)); \
+    }                                                                   \
+  }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_
+
+// If a C string is compared with an STL string object, we know it's meant
+// to point to a NUL-terminated string, and thus can print it as a string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
+  template <>                                                           \
+  class FormatForComparison<CharType*, OtherStringType> {               \
+   public:                                                              \
+    static ::std::string Format(CharType* value) {                      \
+      return ::testing::PrintToString(value);                           \
+    }                                                                   \
+  }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
+
+#if GTEST_HAS_GLOBAL_STRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string);
+#endif
+
+#if GTEST_HAS_GLOBAL_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring);
+#endif
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
+#endif
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_
+
+// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, etc.)
+// operand to be used in a failure message.  The type (but not value)
+// of the other operand may affect the format.  This allows us to
+// print a char* as a raw pointer when it is compared against another
+// char* or void*, and print it as a C string when it is compared
+// against an std::string object, for example.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename T1, typename T2>
+std::string FormatForComparisonFailureMessage(
+    const T1& value, const T2& /* other_operand */) {
+  return FormatForComparison<T1, T2>::Format(value);
+}
+
+// The helper function for {ASSERT|EXPECT}_EQ.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQ(const char* expected_expression,
+                            const char* actual_expression,
+                            const T1& expected,
+                            const T2& actual) {
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4389)  // Temporarily disables warning on
+                                // signed/unsigned mismatch.
+#endif
+
+  if (expected == actual) {
+    return AssertionSuccess();
+  }
+
+#ifdef _MSC_VER
+# pragma warning(pop)          // Restores the warning state.
+#endif
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   FormatForComparisonFailureMessage(expected, actual),
+                   FormatForComparisonFailureMessage(actual, expected),
+                   false);
+}
+
+// With this overloaded version, we allow anonymous enums to be used
+// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums
+// can be implicitly cast to BiggestInt.
+GTEST_API_ AssertionResult CmpHelperEQ(const char* expected_expression,
+                                       const char* actual_expression,
+                                       BiggestInt expected,
+                                       BiggestInt actual);
+
+// The helper class for {ASSERT|EXPECT}_EQ.  The template argument
+// lhs_is_null_literal is true iff the first argument to ASSERT_EQ()
+// is a null pointer literal.  The following default implementation is
+// for lhs_is_null_literal being false.
+template <bool lhs_is_null_literal>
+class EqHelper {
+ public:
+  // This templatized version is for the general case.
+  template <typename T1, typename T2>
+  static AssertionResult Compare(const char* expected_expression,
+                                 const char* actual_expression,
+                                 const T1& expected,
+                                 const T2& actual) {
+    return CmpHelperEQ(expected_expression, actual_expression, expected,
+                       actual);
+  }
+
+  // With this overloaded version, we allow anonymous enums to be used
+  // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous
+  // enums can be implicitly cast to BiggestInt.
+  //
+  // Even though its body looks the same as the above version, we
+  // cannot merge the two, as it will make anonymous enums unhappy.
+  static AssertionResult Compare(const char* expected_expression,
+                                 const char* actual_expression,
+                                 BiggestInt expected,
+                                 BiggestInt actual) {
+    return CmpHelperEQ(expected_expression, actual_expression, expected,
+                       actual);
+  }
+};
+
+// This specialization is used when the first argument to ASSERT_EQ()
+// is a null pointer literal, like NULL, false, or 0.
+template <>
+class EqHelper<true> {
+ public:
+  // We define two overloaded versions of Compare().  The first
+  // version will be picked when the second argument to ASSERT_EQ() is
+  // NOT a pointer, e.g. ASSERT_EQ(0, AnIntFunction()) or
+  // EXPECT_EQ(false, a_bool).
+  template <typename T1, typename T2>
+  static AssertionResult Compare(
+      const char* expected_expression,
+      const char* actual_expression,
+      const T1& expected,
+      const T2& actual,
+      // The following line prevents this overload from being considered if T2
+      // is not a pointer type.  We need this because ASSERT_EQ(NULL, my_ptr)
+      // expands to Compare("", "", NULL, my_ptr), which requires a conversion
+      // to match the Secret* in the other overload, which would otherwise make
+      // this template match better.
+      typename EnableIf<!is_pointer<T2>::value>::type* = 0) {
+    return CmpHelperEQ(expected_expression, actual_expression, expected,
+                       actual);
+  }
+
+  // This version will be picked when the second argument to ASSERT_EQ() is a
+  // pointer, e.g. ASSERT_EQ(NULL, a_pointer).
+  template <typename T>
+  static AssertionResult Compare(
+      const char* expected_expression,
+      const char* actual_expression,
+      // We used to have a second template parameter instead of Secret*.  That
+      // template parameter would deduce to 'long', making this a better match
+      // than the first overload even without the first overload's EnableIf.
+      // Unfortunately, gcc with -Wconversion-null warns when "passing NULL to
+      // non-pointer argument" (even a deduced integral argument), so the old
+      // implementation caused warnings in user code.
+      Secret* /* expected (NULL) */,
+      T* actual) {
+    // We already know that 'expected' is a null pointer.
+    return CmpHelperEQ(expected_expression, actual_expression,
+                       static_cast<T*>(NULL), actual);
+  }
+};
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_??.  It is here just to avoid copy-and-paste
+// of similar code.
+//
+// For each templatized helper function, we also define an overloaded
+// version for BiggestInt in order to reduce code bloat and allow
+// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled
+// with gcc 4.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
+template <typename T1, typename T2>\
+AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+                                   const T1& val1, const T2& val2) {\
+  if (val1 op val2) {\
+    return AssertionSuccess();\
+  } else {\
+    return AssertionFailure() \
+        << "Expected: (" << expr1 << ") " #op " (" << expr2\
+        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
+        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
+  }\
+}\
+GTEST_API_ AssertionResult CmpHelper##op_name(\
+    const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2)
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// Implements the helper function for {ASSERT|EXPECT}_NE
+GTEST_IMPL_CMP_HELPER_(NE, !=);
+// Implements the helper function for {ASSERT|EXPECT}_LE
+GTEST_IMPL_CMP_HELPER_(LE, <=);
+// Implements the helper function for {ASSERT|EXPECT}_LT
+GTEST_IMPL_CMP_HELPER_(LT, <);
+// Implements the helper function for {ASSERT|EXPECT}_GE
+GTEST_IMPL_CMP_HELPER_(GE, >=);
+// Implements the helper function for {ASSERT|EXPECT}_GT
+GTEST_IMPL_CMP_HELPER_(GT, >);
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                                          const char* actual_expression,
+                                          const char* expected,
+                                          const char* actual);
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression,
+                                              const char* actual_expression,
+                                              const char* expected,
+                                              const char* actual);
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const char* s1,
+                                          const char* s2);
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+                                              const char* s2_expression,
+                                              const char* s1,
+                                              const char* s2);
+
+
+// Helper function for *_STREQ on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                                          const char* actual_expression,
+                                          const wchar_t* expected,
+                                          const wchar_t* actual);
+
+// Helper function for *_STRNE on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const wchar_t* s1,
+                                          const wchar_t* s2);
+
+}  // namespace internal
+
+// IsSubstring() and IsNotSubstring() are intended to be used as the
+// first argument to {EXPECT,ASSERT}_PRED_FORMAT2(), not by
+// themselves.  They check whether needle is a substring of haystack
+// (NULL is considered a substring of itself only), and return an
+// appropriate error message when they fail.
+//
+// The {needle,haystack}_expr arguments are the stringified
+// expressions that generated the two real arguments.
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack);
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack);
+#endif  // GTEST_HAS_STD_WSTRING
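+
+// A minimal usage sketch (the needle and haystack values are illustrative):
+//
+//   EXPECT_PRED_FORMAT2(::testing::IsSubstring, "needle", haystack_string);
+//   ASSERT_PRED_FORMAT2(::testing::IsNotSubstring, "forbidden", log_output);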
+
+namespace internal {
+
+// Helper template function for comparing floating-points.
+//
+// Template parameter:
+//
+//   RawType: the raw floating-point type (either float or double)
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename RawType>
+AssertionResult CmpHelperFloatingPointEQ(const char* expected_expression,
+                                         const char* actual_expression,
+                                         RawType expected,
+                                         RawType actual) {
+  const FloatingPoint<RawType> lhs(expected), rhs(actual);
+
+  if (lhs.AlmostEquals(rhs)) {
+    return AssertionSuccess();
+  }
+
+  ::std::stringstream expected_ss;
+  expected_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+              << expected;
+
+  ::std::stringstream actual_ss;
+  actual_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+            << actual;
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   StringStreamToString(&expected_ss),
+                   StringStreamToString(&actual_ss),
+                   false);
+}
+
+// Helper function for implementing ASSERT_NEAR.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
+                                                const char* expr2,
+                                                const char* abs_error_expr,
+                                                double val1,
+                                                double val2,
+                                                double abs_error);
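+
+// Typically reached through the public macros, e.g. (values illustrative):
+//
+//   EXPECT_NEAR(3.14159, 3.1416, 0.001);  // passes iff |val1 - val2| <= abs_error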
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+// A class that enables one to stream messages to assertion macros
+class GTEST_API_ AssertHelper {
+ public:
+  // Constructor.
+  AssertHelper(TestPartResult::Type type,
+               const char* file,
+               int line,
+               const char* message);
+  ~AssertHelper();
+
+  // Message assignment is a semantic trick to enable assertion
+  // streaming; see the GTEST_MESSAGE_ macro below.
+  void operator=(const Message& message) const;
+
+ private:
+  // We put our data in a struct so that the size of the AssertHelper class can
+  // be as small as possible.  This is important because gcc is incapable of
+  // re-using stack space even for temporary variables, so every EXPECT_EQ
+  // reserves stack space for another AssertHelper.
+  struct AssertHelperData {
+    AssertHelperData(TestPartResult::Type t,
+                     const char* srcfile,
+                     int line_num,
+                     const char* msg)
+        : type(t), file(srcfile), line(line_num), message(msg) { }
+
+    TestPartResult::Type const type;
+    const char* const file;
+    int const line;
+    std::string const message;
+
+   private:
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
+  };
+
+  AssertHelperData* const data_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
+};
+
+}  // namespace internal
+
+#if GTEST_HAS_PARAM_TEST
+// The pure interface class that all value-parameterized tests inherit from.
+// A value-parameterized class must inherit from both ::testing::Test and
+// ::testing::WithParamInterface. In most cases that just means inheriting
+// from ::testing::TestWithParam, but more complicated test hierarchies
+// may need to inherit from Test and WithParamInterface at different levels.
+//
+// This interface has support for accessing the test parameter value via
+// the GetParam() method.
+//
+// Use it with one of the parameter generator defining functions, like Range(),
+// Values(), ValuesIn(), Bool(), and Combine().
+//
+// class FooTest : public ::testing::TestWithParam<int> {
+//  protected:
+//   FooTest() {
+//     // Can use GetParam() here.
+//   }
+//   virtual ~FooTest() {
+//     // Can use GetParam() here.
+//   }
+//   virtual void SetUp() {
+//     // Can use GetParam() here.
+//   }
+//   virtual void TearDown() {
+//     // Can use GetParam() here.
+//   }
+// };
+// TEST_P(FooTest, DoesBar) {
+//   // Can use GetParam() method here.
+//   Foo foo;
+//   ASSERT_TRUE(foo.DoesBar(GetParam()));
+// }
+// INSTANTIATE_TEST_CASE_P(OneToTenRange, FooTest, ::testing::Range(1, 10));
+
+template <typename T>
+class WithParamInterface {
+ public:
+  typedef T ParamType;
+  virtual ~WithParamInterface() {}
+
+  // The current parameter value. Is also available in the test fixture's
+  // constructor. This member function is non-static, even though it only
+  // references static data, to reduce the opportunity for incorrect uses
+  // like writing 'WithParamInterface<bool>::GetParam()' for a test that
+  // uses a fixture whose parameter type is int.
+  const ParamType& GetParam() const {
+    GTEST_CHECK_(parameter_ != NULL)
+        << "GetParam() can only be called inside a value-parameterized test "
+        << "-- did you intend to write TEST_P instead of TEST_F?";
+    return *parameter_;
+  }
+
+ private:
+  // Sets parameter value. The caller is responsible for making sure the value
+  // remains alive and unchanged throughout the current test.
+  static void SetParam(const ParamType* parameter) {
+    parameter_ = parameter;
+  }
+
+  // Static value used for accessing parameter during a test lifetime.
+  static const ParamType* parameter_;
+
+  // TestClass must be a subclass of WithParamInterface<T> and Test.
+  template <class TestClass> friend class internal::ParameterizedTestFactory;
+};
+
+template <typename T>
+const T* WithParamInterface<T>::parameter_ = NULL;
+
+// Most value-parameterized classes can ignore the existence of
+// WithParamInterface, and can just inherit from ::testing::TestWithParam.
+
+template <typename T>
+class TestWithParam : public Test, public WithParamInterface<T> {
+};
+
+#endif  // GTEST_HAS_PARAM_TEST
+
+// Macros for indicating success/failure in test code.
+
+// ADD_FAILURE unconditionally adds a failure to the current test.
+// SUCCEED generates a success - it doesn't automatically make the
+// current test successful, as a test is only successful when it has
+// no failure.
+//
+// EXPECT_* verifies that a certain condition is satisfied.  If not,
+// it behaves like ADD_FAILURE.  In particular:
+//
+//   EXPECT_TRUE  verifies that a Boolean condition is true.
+//   EXPECT_FALSE verifies that a Boolean condition is false.
+//
+// FAIL and ASSERT_* are similar to ADD_FAILURE and EXPECT_*, except
+// that they will also abort the current function on failure.  People
+// usually want the fail-fast behavior of FAIL and ASSERT_*, but those
+// writing data-driven tests often find themselves using ADD_FAILURE
+// and EXPECT_* more.
+
+// Generates a nonfatal failure with a generic message.
+#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed")
+
+// Generates a nonfatal failure at the given source file location with
+// a generic message.
+#define ADD_FAILURE_AT(file, line) \
+  GTEST_MESSAGE_AT_(file, line, "Failed", \
+                    ::testing::TestPartResult::kNonFatalFailure)
+
+// Generates a fatal failure with a generic message.
+#define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed")
+
+// Define this macro to 1 to omit the definition of FAIL(), which is a
+// generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_FAIL
+# define FAIL() GTEST_FAIL()
+#endif
+
+// Generates a success with a generic message.
+#define GTEST_SUCCEED() GTEST_SUCCESS_("Succeeded")
+
+// Define this macro to 1 to omit the definition of SUCCEED(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_SUCCEED
+# define SUCCEED() GTEST_SUCCEED()
+#endif
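+
+// A minimal usage sketch (TryOptionalFeature() is an illustrative helper):
+//
+//   if (!TryOptionalFeature())
+//     ADD_FAILURE() << "optional feature unexpectedly unavailable";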
+
+// Macros for testing exceptions.
+//
+//    * {ASSERT|EXPECT}_THROW(statement, expected_exception):
+//         Tests that the statement throws the expected exception.
+//    * {ASSERT|EXPECT}_NO_THROW(statement):
+//         Tests that the statement doesn't throw any exception.
+//    * {ASSERT|EXPECT}_ANY_THROW(statement):
+//         Tests that the statement throws an exception.
+
+#define EXPECT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_FATAL_FAILURE_)
+#define ASSERT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_FATAL_FAILURE_)
+#define ASSERT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_FATAL_FAILURE_)
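+
+// Illustrative usage; Foo() and the exception type are hypothetical:
+//
+//   EXPECT_THROW(Foo(-1), std::out_of_range);  // must throw exactly this type
+//   EXPECT_NO_THROW(Foo(0));                   // must not throw at all
+//   EXPECT_ANY_THROW(Foo(99));                 // any exception will do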
+
+// Boolean assertions. Condition can be either a Boolean expression or an
+// AssertionResult. For more information on how to use AssertionResult with
+// these macros see comments on that class.
+#define EXPECT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+                      GTEST_NONFATAL_FAILURE_)
+#define EXPECT_FALSE(condition) \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_NONFATAL_FAILURE_)
+#define ASSERT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+                      GTEST_FATAL_FAILURE_)
+#define ASSERT_FALSE(condition) \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_FATAL_FAILURE_)
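+
+// Illustrative usage; IsEven() is a hypothetical helper that returns a
+// ::testing::AssertionResult so the failure message can explain itself:
+//
+//   ::testing::AssertionResult IsEven(int n) {
+//     if (n % 2 == 0) return ::testing::AssertionSuccess();
+//     return ::testing::AssertionFailure() << n << " is odd";
+//   }
+//
+//   EXPECT_TRUE(IsEven(4));   // passes
+//   EXPECT_TRUE(IsEven(3));   // fails and prints "3 is odd"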
+
+// Includes the auto-generated header that implements a family of
+// generic predicate assertion macros.
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file is AUTOMATICALLY GENERATED on 10/31/2011 by command
+// 'gen_gtest_pred_impl.py 5'.  DO NOT EDIT BY HAND!
+//
+// Implements a family of generic predicate assertion macros.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+
+// Makes sure this header is not included before gtest.h.
+#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
+# error Do not include gtest_pred_impl.h directly.  Include gtest.h instead.
+#endif  // GTEST_INCLUDE_GTEST_GTEST_H_
+
+// This header implements a family of generic predicate assertion
+// macros:
+//
+//   ASSERT_PRED_FORMAT1(pred_format, v1)
+//   ASSERT_PRED_FORMAT2(pred_format, v1, v2)
+//   ...
+//
+// where pred_format is a function or functor that takes n (in the
+// case of ASSERT_PRED_FORMATn) values and their source expression
+// text, and returns a testing::AssertionResult.  See the definition
+// of ASSERT_EQ in gtest.h for an example.
+//
+// If you don't care about formatting, you can use the more
+// restrictive version:
+//
+//   ASSERT_PRED1(pred, v1)
+//   ASSERT_PRED2(pred, v1, v2)
+//   ...
+//
+// where pred is an n-ary function or functor that returns bool,
+// and the values v1, v2, ... must support the << operator for
+// streaming to std::ostream.
+//
+// We also define the EXPECT_* variations.
+//
+// For now we only support predicates whose arity is at most 5.
+// Please email googletestframework@googlegroups.com if you need
+// support for higher arities.
+
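+// Illustrative usage; IsPositive(), IsPositiveFormat(), and the variable x
+// are hypothetical:
+//
+//   bool IsPositive(int n) { return n > 0; }
+//   EXPECT_PRED1(IsPositive, x);            // on failure, prints the call and
+//                                           // the value of x
+//
+//   ::testing::AssertionResult IsPositiveFormat(const char* expr, int n) {
+//     if (n > 0) return ::testing::AssertionSuccess();
+//     return ::testing::AssertionFailure()
+//            << expr << " = " << n << ", which is not positive";
+//   }
+//   EXPECT_PRED_FORMAT1(IsPositiveFormat, x);  // fully custom failure text
+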
+// GTEST_ASSERT_ is the basic statement to which all of the assertions
+// in this file reduce.  Don't use this in your code.
+
+#define GTEST_ASSERT_(expression, on_failure) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (const ::testing::AssertionResult gtest_ar = (expression)) \
+    ; \
+  else \
+    on_failure(gtest_ar.failure_message())
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED1.  Don't use
+// this in your code.
+template <typename Pred,
+          typename T1>
+AssertionResult AssertPred1Helper(const char* pred_text,
+                                  const char* e1,
+                                  Pred pred,
+                                  const T1& v1) {
+  if (pred(v1)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, v1), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED1.  Don't use
+// this in your code.
+#define GTEST_PRED1_(pred, v1, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
+                                             #v1, \
+                                             pred, \
+                                             v1), on_failure)
+
+// Unary predicate assertion macros.
+#define EXPECT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) \
+  GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED1(pred, v1) \
+  GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED2.  Don't use
+// this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2>
+AssertionResult AssertPred2Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2) {
+  if (pred(v1, v2)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED2.  Don't use
+// this in your code.
+#define GTEST_PRED2_(pred, v1, v2, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             pred, \
+                                             v1, \
+                                             v2), on_failure)
+
+// Binary predicate assertion macros.
+#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED3.  Don't use
+// this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2,
+          typename T3>
+AssertionResult AssertPred3Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3) {
+  if (pred(v1, v2, v3)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ", "
+                            << e3 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2
+                            << "\n" << e3 << " evaluates to " << v3;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED3.  Don't use
+// this in your code.
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3), on_failure)
+
+// Ternary predicate assertion macros.
+#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED4.  Don't use
+// this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2,
+          typename T3,
+          typename T4>
+AssertionResult AssertPred4Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  const char* e4,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3,
+                                  const T4& v4) {
+  if (pred(v1, v2, v3, v4)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ", "
+                            << e3 << ", "
+                            << e4 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2
+                            << "\n" << e3 << " evaluates to " << v3
+                            << "\n" << e4 << " evaluates to " << v4;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED4.  Don't use
+// this in your code.
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             #v4, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3, \
+                                             v4), on_failure)
+
+// 4-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED5.  Don't use
+// this in your code.
+template <typename Pred,
+          typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5>
+AssertionResult AssertPred5Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  const char* e4,
+                                  const char* e5,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3,
+                                  const T4& v4,
+                                  const T5& v5) {
+  if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
+
+  return AssertionFailure() << pred_text << "("
+                            << e1 << ", "
+                            << e2 << ", "
+                            << e3 << ", "
+                            << e4 << ", "
+                            << e5 << ") evaluates to false, where"
+                            << "\n" << e1 << " evaluates to " << v1
+                            << "\n" << e2 << " evaluates to " << v2
+                            << "\n" << e3 << " evaluates to " << v3
+                            << "\n" << e4 << " evaluates to " << v4
+                            << "\n" << e5 << " evaluates to " << v5;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED5.  Don't use
+// this in your code.
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             #v4, \
+                                             #v5, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3, \
+                                             v4, \
+                                             v5), on_failure)
+
+// 5-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+
+
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+
+// Macros for testing equalities and inequalities.
+//
+//    * {ASSERT|EXPECT}_EQ(expected, actual): Tests that expected == actual
+//    * {ASSERT|EXPECT}_NE(v1, v2):           Tests that v1 != v2
+//    * {ASSERT|EXPECT}_LT(v1, v2):           Tests that v1 < v2
+//    * {ASSERT|EXPECT}_LE(v1, v2):           Tests that v1 <= v2
+//    * {ASSERT|EXPECT}_GT(v1, v2):           Tests that v1 > v2
+//    * {ASSERT|EXPECT}_GE(v1, v2):           Tests that v1 >= v2
+//
+// When such an assertion fails, Google Test prints both the tested expressions
+// and their actual values.  The values must be compatible built-in types,
+// or you will get a compiler error.  By "compatible" we mean that the
+// values can be compared by the respective operator.
+//
+// Note:
+//
+//   1. It is possible to make a user-defined type work with
+//   {ASSERT|EXPECT}_??(), but that requires overloading the
+//   comparison operators and is thus discouraged by the Google C++
+//   Usage Guide.  Therefore, you are advised to use the
+//   {ASSERT|EXPECT}_TRUE() macro to assert that two objects are
+//   equal.
+//
+//   2. The {ASSERT|EXPECT}_??() macros do pointer comparisons on
+//   pointers (in particular, C strings).  Therefore, if you use them
+//   with two C strings, you are testing how their locations in memory
+//   are related, not how their content is related.  To compare two C
+//   strings by content, use {ASSERT|EXPECT}_STR*().
+//
+//   3. {ASSERT|EXPECT}_EQ(expected, actual) is preferred to
+//   {ASSERT|EXPECT}_TRUE(expected == actual), as the former tells you
+//   what the actual value is when it fails, and similarly for the
+//   other comparisons.
+//
+//   4. Do not depend on the order in which {ASSERT|EXPECT}_??()
+//   evaluate their arguments, which is undefined.
+//
+//   5. These macros evaluate their arguments exactly once.
+//
+// Examples:
+//
+//   EXPECT_NE(5, Foo());
+//   EXPECT_EQ(NULL, a_pointer);
+//   ASSERT_LT(i, array_size);
+//   ASSERT_GT(records.size(), 0) << "There is no record left.";
+
+#define EXPECT_EQ(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal:: \
+                      EqHelper<GTEST_IS_NULL_LITERAL_(expected)>::Compare, \
+                      expected, actual)
+#define EXPECT_NE(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, expected, actual)
+#define EXPECT_LE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define EXPECT_LT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define EXPECT_GE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define EXPECT_GT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+#define GTEST_ASSERT_EQ(expected, actual) \
+  ASSERT_PRED_FORMAT2(::testing::internal:: \
+                      EqHelper<GTEST_IS_NULL_LITERAL_(expected)>::Compare, \
+                      expected, actual)
+#define GTEST_ASSERT_NE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define GTEST_ASSERT_LE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define GTEST_ASSERT_LT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define GTEST_ASSERT_GE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define GTEST_ASSERT_GT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+// Define macro GTEST_DONT_DEFINE_ASSERT_XY to 1 to omit the definition of
+// ASSERT_XY(), which clashes with some users' own code.
+
+#if !GTEST_DONT_DEFINE_ASSERT_EQ
+# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_NE
+# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LE
+# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LT
+# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GE
+# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GT
+# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#endif
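+
+// Illustrative sketch: a project whose own headers already define ASSERT_EQ
+// can define GTEST_DONT_DEFINE_ASSERT_EQ to 1 before including this header
+// (e.g. via a hypothetical -DGTEST_DONT_DEFINE_ASSERT_EQ=1 build flag) and
+// spell the Google Test assertion explicitly:
+//
+//   GTEST_ASSERT_EQ(expected_value, actual_value);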
+
+// C-string Comparisons.  All tests treat NULL and any non-NULL string
+// as different.  Two NULLs are equal.
+//
+//    * {ASSERT|EXPECT}_STREQ(s1, s2):     Tests that s1 == s2
+//    * {ASSERT|EXPECT}_STRNE(s1, s2):     Tests that s1 != s2
+//    * {ASSERT|EXPECT}_STRCASEEQ(s1, s2): Tests that s1 == s2, ignoring case
+//    * {ASSERT|EXPECT}_STRCASENE(s1, s2): Tests that s1 != s2, ignoring case
+//
+// For wide or narrow string objects, you can use the
+// {ASSERT|EXPECT}_??() macros.
+//
+// Don't depend on the order in which the arguments are evaluated,
+// which is undefined.
+//
+// These macros evaluate their arguments exactly once.
+
+#define EXPECT_STREQ(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual)
+#define EXPECT_STRNE(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define EXPECT_STRCASEEQ(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual)
+#define EXPECT_STRCASENE(s1, s2)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+#define ASSERT_STREQ(expected, actual) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual)
+#define ASSERT_STRNE(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define ASSERT_STRCASEEQ(expected, actual) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual)
+#define ASSERT_STRCASENE(s1, s2)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
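+
+// Illustrative contrast between pointer and content comparison; the buffers
+// below are hypothetical:
+//
+//   char buffer[] = "hello";
+//   const char* p = "hello";
+//   EXPECT_STREQ("hello", buffer);  // passes: compares content
+//   EXPECT_EQ(p, buffer);           // compares addresses, so it fails here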
+
+// Macros for comparing floating-point numbers.
+//
+//    * {ASSERT|EXPECT}_FLOAT_EQ(expected, actual):
+//         Tests that two float values are almost equal.
+//    * {ASSERT|EXPECT}_DOUBLE_EQ(expected, actual):
+//         Tests that two double values are almost equal.
+//    * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error):
+//         Tests that v1 and v2 are within the given distance to each other.
+//
+// Google Test uses ULP-based comparison to automatically pick a default
+// error bound that is appropriate for the operands.  See the
+// FloatingPoint template class in gtest-internal.h if you are
+// interested in the implementation details.
+
+#define EXPECT_FLOAT_EQ(expected, actual)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      expected, actual)
+
+#define EXPECT_DOUBLE_EQ(expected, actual)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      expected, actual)
+
+#define ASSERT_FLOAT_EQ(expected, actual)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      expected, actual)
+
+#define ASSERT_DOUBLE_EQ(expected, actual)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      expected, actual)
+
+#define EXPECT_NEAR(val1, val2, abs_error)\
+  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+                      val1, val2, abs_error)
+
+#define ASSERT_NEAR(val1, val2, abs_error)\
+  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+                      val1, val2, abs_error)
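+
+// Illustrative usage (std::sqrt needs <cmath>; pi_estimate is hypothetical):
+//
+//   EXPECT_FLOAT_EQ(2.0f, std::sqrt(4.0f));    // default ULP-based bound
+//   EXPECT_NEAR(3.14159, pi_estimate, 1e-3);   // explicit absolute tolerance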
+
+// These predicate format functions work on floating-point values, and
+// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
+//
+//   EXPECT_PRED_FORMAT2(testing::DoubleLE, Foo(), 5.0);
+
+// Asserts that val1 is less than, or almost equal to, val2.  Fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
+                                   float val1, float val2);
+GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
+                                    double val1, double val2);
+
+
+#if GTEST_OS_WINDOWS
+
+// Macros that test for HRESULT failure and success.  These are only useful
+// on Windows and rely on Windows SDK macros and APIs to compile.
+//
+//    * {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}(expr)
+//
+// When expr unexpectedly fails or succeeds, Google Test prints the
+// expected result and the actual result with both a human-readable
+// string representation of the error, if available, as well as the
+// hex result code.
+# define EXPECT_HRESULT_SUCCEEDED(expr) \
+    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+# define ASSERT_HRESULT_SUCCEEDED(expr) \
+    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+# define EXPECT_HRESULT_FAILED(expr) \
+    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+# define ASSERT_HRESULT_FAILED(expr) \
+    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
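+
+// Illustrative usage; CoCreateGuid is a Windows COM API used here only as a
+// plausible HRESULT-returning call, not something Google Test provides:
+//
+//   GUID guid;
+//   EXPECT_HRESULT_SUCCEEDED(::CoCreateGuid(&guid));
+//   EXPECT_HRESULT_FAILED(E_FAIL);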
+
+#endif  // GTEST_OS_WINDOWS
+
+// Macros that execute statement and check that it doesn't generate new fatal
+// failures in the current thread.
+//
+//   * {ASSERT|EXPECT}_NO_FATAL_FAILURE(statement);
+//
+// Examples:
+//
+//   EXPECT_NO_FATAL_FAILURE(Process());
+//   ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
+//
+#define ASSERT_NO_FATAL_FAILURE(statement) \
+    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+#define EXPECT_NO_FATAL_FAILURE(statement) \
+    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+
+// Causes a trace (including the source file path, the current line
+// number, and the given message) to be included in every test failure
+// message generated by code in the current scope.  The effect is
+// undone when the control leaves the current scope.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// In the implementation, we include the current line number as part
+// of the dummy variable name, thus allowing multiple SCOPED_TRACE()s
+// to appear in the same block - as long as they are on different
+// lines.
+#define SCOPED_TRACE(message) \
+  ::testing::internal::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\
+    __FILE__, __LINE__, ::testing::Message() << (message))
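+
+// Illustrative usage; Sub1() is a hypothetical helper called several times:
+//
+//   void Sub1(int n) { EXPECT_GT(n, 0); }
+//
+//   TEST(TraceTest, CallsSub1Twice) {
+//     {
+//       SCOPED_TRACE("first call");  // tags failures from this block only
+//       Sub1(1);
+//     }
+//     SCOPED_TRACE("second call");
+//     Sub1(-1);                      // failure output includes "second call"
+//   }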
+
+// Compile-time assertion for type equality.
+// StaticAssertTypeEq<type1, type2>() compiles iff type1 and type2 are
+// the same type.  The value it returns is not interesting.
+//
+// Instead of making StaticAssertTypeEq a class template, we make it a
+// function template that invokes a helper class template.  This
+// prevents a user from misusing StaticAssertTypeEq<T1, T2> by
+// defining objects of that type.
+//
+// CAVEAT:
+//
+// When used inside a method of a class template,
+// StaticAssertTypeEq<T1, T2>() is effective ONLY IF the method is
+// instantiated.  For example, given:
+//
+//   template <typename T> class Foo {
+//    public:
+//     void Bar() { testing::StaticAssertTypeEq<int, T>(); }
+//   };
+//
+// the code:
+//
+//   void Test1() { Foo<bool> foo; }
+//
+// will NOT generate a compiler error, as Foo<bool>::Bar() is never
+// actually instantiated.  Instead, you need:
+//
+//   void Test2() { Foo<bool> foo; foo.Bar(); }
+//
+// to cause a compiler error.
+template <typename T1, typename T2>
+bool StaticAssertTypeEq() {
+  (void)internal::StaticAssertTypeEqHelper<T1, T2>();
+  return true;
+}
+
+// Defines a test.
+//
+// The first parameter is the name of the test case, and the second
+// parameter is the name of the test within the test case.
+//
+// The convention is to end the test case name with "Test".  For
+// example, a test case for the Foo class can be named FooTest.
+//
+// The user should put his test code between braces after using this
+// macro.  Example:
+//
+//   TEST(FooTest, InitializesCorrectly) {
+//     Foo foo;
+//     EXPECT_TRUE(foo.StatusIsOK());
+//   }
+
+// Note that we call GetTestTypeId() instead of GetTypeId<
+// ::testing::Test>() here to get the type ID of testing::Test.  This
+// is to work around a suspected linker bug when using Google Test as
+// a framework on Mac OS X.  The bug causes GetTypeId<
+// ::testing::Test>() to return different values depending on whether
+// the call is from the Google Test framework itself or from user test
+// code.  GetTestTypeId() is guaranteed to always return the same
+// value, as it always calls GetTypeId<>() from the Google Test
+// framework.
+#define GTEST_TEST(test_case_name, test_name)\
+  GTEST_TEST_(test_case_name, test_name, \
+              ::testing::Test, ::testing::internal::GetTestTypeId())
+
+// Define this macro to 1 to omit the definition of TEST(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_TEST
+# define TEST(test_case_name, test_name) GTEST_TEST(test_case_name, test_name)
+#endif
+
+// Defines a test that uses a test fixture.
+//
+// The first parameter is the name of the test fixture class, which
+// also doubles as the test case name.  The second parameter is the
+// name of the test within the test case.
+//
+// A test fixture class must be declared earlier.  The user should put
+// his test code between braces after using this macro.  Example:
+//
+//   class FooTest : public testing::Test {
+//    protected:
+//     virtual void SetUp() { b_.AddElement(3); }
+//
+//     Foo a_;
+//     Foo b_;
+//   };
+//
+//   TEST_F(FooTest, InitializesCorrectly) {
+//     EXPECT_TRUE(a_.StatusIsOK());
+//   }
+//
+//   TEST_F(FooTest, ReturnsElementCountCorrectly) {
+//     EXPECT_EQ(0, a_.size());
+//     EXPECT_EQ(1, b_.size());
+//   }
+
+#define TEST_F(test_fixture, test_name)\
+  GTEST_TEST_(test_fixture, test_name, test_fixture, \
+              ::testing::internal::GetTypeId<test_fixture>())
+
+}  // namespace testing
+
+// Use this function in main() to run all tests.  It returns 0 if all
+// tests are successful, or 1 otherwise.
+//
+// RUN_ALL_TESTS() should be invoked after the command line has been
+// parsed by InitGoogleTest().
+//
+// This function was formerly a macro; thus, it is in the global
+// namespace and has an all-caps name.
+int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
+
+inline int RUN_ALL_TESTS() {
+  return ::testing::UnitTest::GetInstance()->Run();
+}
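+
+// A typical main() is sketched below; linking against gtest_main provides an
+// equivalent one, so write your own only if you need extra setup:
+//
+//   int main(int argc, char** argv) {
+//     ::testing::InitGoogleTest(&argc, argv);
+//     return RUN_ALL_TESTS();
+//   }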
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_H_